today i did something not webdevvy but that still required dom navigation: i scraped an entire site and analyzed its internal links with a pseudo-pageranky algorithm. to wit:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import NoAlertPresentException
import sys
import unittest, time, re
# import top-level daily email articles
driver = webdriver.Firefox()
driver.implicitly_wait(30)
driver.base_url = "https://stratechery.com/category/daily-email/"
driver.verificationErrors = []
driver.accept_next_alert = True
delay = 3
driver.get(driver.base_url)
# scroll to the bottom repeatedly so the infinite-scroll archive loads older posts
for i in range(1, 100):
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(4)
    print i
html_source = driver.page_source
data = html_source.encode('utf-8')

# parse
from BeautifulSoup import BeautifulSoup as bs

root = data  # the rendered HTML as a string
soup = bs(root)  # make BeautifulSoup
# each archive headline is an h1.entry-title wrapping a link; take that link's first attribute value (the href on this theme)
temp = [x.a.attrs[0][1] for x in soup.body.findAll('h1', attrs={'class': 'entry-title'})]
# now that we have all the article URLs in temp, extract all the links from each article body
all_links = []
for url in [x.split(">")[0][:-1] for x in temp]:  # strip the stray trailing markup from each extracted value
    driver.get(url)
    pagehtml_source = driver.page_source
    pagedata = pagehtml_source.encode('utf-8')
    root = pagedata
    soup = bs(root)  # make BeautifulSoup
    # prettyHTML = soup.prettify()  # prettify the html
    try:
        # collect the first attribute value (normally the href) of every <a> inside the article body
        all_links.append([a.attrs[0][1] for a in soup.body.find('div', attrs={'class': 'entry-content'}).findAll('a')])
    except:
        print 'error on ' + url  # e.g. a page with no entry-content div
# now postprocess: flatten the per-article link lists and count how often each target URL appears
import itertools
all_links_flat = list(itertools.chain.from_iterable(all_links))

import pandas as pd
all_links_flat = pd.DataFrame(all_links_flat)
all_links_flat['count'] = 1
all_links_flat[0] = [str(x.encode('ascii', 'ignore')).replace("https://", "").replace("http://", "") for x in all_links_flat[0]]
summary = all_links_flat.groupby(0).sum().sort_values(by="count", ascending=False).reset_index()
# a few exploratory slices of the summary:
# summary[summary[0].str.contains('stratechery.com/20')]   # internal links to other posts
# summary[~summary[0].str.contains('stratechery.com/20')]  # everything that isn't a post link
# summary[~summary[0].str.contains('stratechery')]         # external links only
# summary[summary[0].str.contains('hackernoon')]           # links to one particular outside site
print summary  # the raw ranking: most-linked-to URLs across every scraped article
trimmed_summary = summary.copy()
# optional URL normalization (strip protocol and trailing slash so near-duplicate URLs collapse):
# trimmed_summary[0] = [x.replace("https://", "").replace("http://", "") for x in trimmed_summary[0]]
# trimmed_summary[0] = [x if x[-1:][0] != '/' else x[:-1] for x in trimmed_summary[0]]
trimmed_summary = trimmed_summary  # [trimmed_summary['count'] > 1] would keep only repeat links
it produced decent results, of course.
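if you wanted an actual pagerank instead of raw link counts, a minimal sketch of the extra step might look like the following. it assumes networkx is available, and the rough zip pairing only lines up if no article hit the except branch above, so a real version would record the source url inside the scrape loop instead; none of this was part of the run above.
# hedged sketch, not part of the run above: real PageRank over the internal link graph
import networkx as nx

def strip_proto(u):
    return u.replace("https://", "").replace("http://", "").rstrip("/")

# rough pairing of each article URL with its outbound links; only valid if no article errored out
article_urls = [x.split(">")[0][:-1] for x in temp]
links_by_article = dict(zip(article_urls, all_links))

G = nx.DiGraph()
for source, links in links_by_article.items():
    for target in links:
        if 'stratechery' in target:  # only internal links matter for this comparison
            G.add_edge(strip_proto(source), strip_proto(target))

ranks = nx.pagerank(G)  # damping factor defaults to 0.85
for url, score in sorted(ranks.items(), key=lambda kv: kv[1], reverse=True)[:20]:
    print url, score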