today i did something not webdevvy but that still required dom navigation: i scraped an entire site and analyzed its internal links with a pseudo-pageranky algorithm. to wit:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import NoAlertPresentException
import sys
import unittest, time, re
# import top-level daily email articles
driver = webdriver.Firefox()
driver.implicitly_wait(30)
driver.base_url = "https://stratechery.com/category/daily-email/"
driver.verificationErrors = []
driver.accept_next_alert = True
delay = 3
driver.get(driver.base_url)
# scroll to the bottom repeatedly so the infinite-scroll archive loads older posts
for i in range(1, 100):
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(4)
    print i
html_source = driver.page_source
data = html_source.encode('utf-8')

# parse
from BeautifulSoup import BeautifulSoup as bs

root = data  # the rendered HTML as a string
soup = bs(root)  # make BeautifulSoup
# each archive headline is an h1.entry-title wrapping a link; take that link's first attribute value (the href on this theme)
temp = [x.a.attrs[0][1] for x in soup.body.findAll('h1', attrs={'class': 'entry-title'})]
# now that we have all the article URLs in temp, extract all the links from each article body
all_links = []
for url in [x.split(">")[0][:-1] for x in temp]:  # strip the stray trailing markup from each extracted value
    driver.get(url)
    pagehtml_source = driver.page_source
    pagedata = pagehtml_source.encode('utf-8')
    root = pagedata
    soup = bs(root)  # make BeautifulSoup
    # prettyHTML = soup.prettify()  # prettify the html
    try:
        # collect the first attribute value (normally the href) of every <a> inside the article body
        all_links.append([a.attrs[0][1] for a in soup.body.find('div', attrs={'class': 'entry-content'}).findAll('a')])
    except:
        print 'error on ' + url  # e.g. a page with no entry-content div
# now postprocess: flatten the per-article link lists and count how often each target URL appears
import itertools
all_links_flat = list(itertools.chain.from_iterable(all_links))

import pandas as pd
all_links_flat = pd.DataFrame(all_links_flat)
all_links_flat['count'] = 1
all_links_flat[0] = [str(x.encode('ascii', 'ignore')).replace("https://", "").replace("http://", "") for x in all_links_flat[0]]
summary = all_links_flat.groupby(0).sum().sort_values(by="count", ascending=False).reset_index()
# a few exploratory slices of the summary:
# summary[summary[0].str.contains('stratechery.com/20')]   # internal links to other posts
# summary[~summary[0].str.contains('stratechery.com/20')]  # everything that isn't a post link
# summary[~summary[0].str.contains('stratechery')]         # external links only
# summary[summary[0].str.contains('hackernoon')]           # links to one particular outside site
print summary  # the raw ranking: most-linked-to URLs across every scraped article
trimmed_summary = summary.copy()
# optional URL normalization (strip protocol and trailing slash so near-duplicate URLs collapse):
# trimmed_summary[0] = [x.replace("https://", "").replace("http://", "") for x in trimmed_summary[0]]
# trimmed_summary[0] = [x if x[-1:][0] != '/' else x[:-1] for x in trimmed_summary[0]]
trimmed_summary = trimmed_summary  # [trimmed_summary['count'] > 1] would keep only repeat links
it produced decent results, of course.
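if you wanted an actual pagerank instead of raw link counts, a minimal sketch of the extra step might look like the following. it assumes networkx is available, and the rough zip pairing only lines up if no article hit the except branch above, so a real version would record the source url inside the scrape loop instead; none of this was part of the run above.
# hedged sketch, not part of the run above: real PageRank over the internal link graph
import networkx as nx

def strip_proto(u):
    return u.replace("https://", "").replace("http://", "").rstrip("/")

# rough pairing of each article URL with its outbound links; only valid if no article errored out
article_urls = [x.split(">")[0][:-1] for x in temp]
links_by_article = dict(zip(article_urls, all_links))

G = nx.DiGraph()
for source, links in links_by_article.items():
    for target in links:
        if 'stratechery' in target:  # only internal links matter for this comparison
            G.add_edge(strip_proto(source), strip_proto(target))

ranks = nx.pagerank(G)  # damping factor defaults to 0.85
for url, score in sorted(ranks.items(), key=lambda kv: kv[1], reverse=True)[:20]:
    print url, score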