today i did something not webdevvy but still requiring dom navigation. i scraped an entire site and analyzed its internal links with a pseudo-pagerank-y algorithm. to wit:
import re
import sys
import time
import unittest

from selenium import webdriver
from selenium.common.exceptions import NoAlertPresentException
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
# Load the top-level listing of daily-email articles in a real browser and
# collect the URL of every article it links to (result: the list `temp`).
driver = webdriver.Firefox()
driver.base_url = ""  # listing-page URL; left empty here, set it before running
driver.verificationErrors = []
driver.accept_next_alert = True
delay = 3

# Open the listing page when a URL has been configured. Without a navigation
# step the scroll loop below only sees whatever page the browser already shows.
if driver.base_url:
    driver.get(driver.base_url)

# Scroll to the bottom of the page repeatedly, printing the pass number as a
# progress indicator.
for i in range(1, 100):
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    print(i)

html_source = driver.page_source
data = html_source.encode('utf-8')

from BeautifulSoup import BeautifulSoup as bs
root = data  # UTF-8 encoded HTML of the listing page
soup = bs(root)

# Each article title is an <h1 class="entry-title"> wrapping a link. Read the
# href by name instead of assuming it is the link's first attribute, and skip
# headings that carry no usable link.
temp = [
    heading.a['href']
    for heading in soup.body.findAll('h1', attrs={'class': 'entry-title'})
    if heading.a is not None and heading.a.get('href')
]
# Now that temp holds every article URL, visit each article and collect the
# links found inside its body. all_links ends up with one list of hrefs per
# article that could be processed.
all_links = []
# Each URL is cut at the first ">" (if any) and loses its final character
# before being visited.
for article_url in [u.split(">")[0][:-1] for u in temp]:
    try:
        driver.get(article_url)
        pagehtml_source = driver.page_source
        patedata = pagehtml_source.encode('utf-8')
        soup = bs(patedata)  # parse the article itself, not the listing page
        content = soup.body.find('div', attrs={'class': 'entry-content'})
        # Read href by name and ignore anchors that have none.
        all_links.append(
            [a['href'] for a in content.findAll('a') if a.get('href')]
        )
    except Exception:
        # Best effort: one broken or unusual article must not abort the
        # whole crawl, so report it and carry on with the next URL.
        print('error on ' + article_url)
# Post-processing: count how often each link target occurs across all the
# scraped articles and rank the targets by that count.
import itertools
import pandas as pd


def _without_scheme(link):
    """Return link as an ASCII str with every "https://" and "http://" removed."""
    ascii_link = str(link.encode('ascii', 'ignore'))
    for scheme in ("https://", "http://"):
        ascii_link = ascii_link.replace(scheme, "")
    return ascii_link


# One row per link occurrence; column 0 holds the link, 'count' a constant 1.
all_links_flat = pd.DataFrame(list(itertools.chain(*all_links)))
all_links_flat['count'] = 1
all_links_flat[0] = [_without_scheme(link) for link in all_links_flat[0]]

# Sum the ones per distinct link, then order from most to least linked.
link_totals = all_links_flat.groupby(0).sum()
ranked_totals = link_totals.sort_values(by="count", ascending=False)
summary = ranked_totals.reset_index()

# Ad-hoc filters used while exploring the result, e.g.
#   summary[summary[0].str.contains('stratechery')]
#   summary[~summary[0].str.contains('stratechery')]
#   summary[summary[0].str.contains('hackernoon')][0]

# Working copy for further trimming (e.g. dropping trailing slashes, or
# keeping only rows with count > 1); currently left unfiltered.
trimmed_summary = summary.copy()
it produced decent results, of course.