Web-Scraping using Scrapy (Python)

Purpose

This project demonstrates web scraping in Python with Scrapy, using Reuters News (https://www.reuters.com) as an example. The URL, publication date, title, and text of each news article are stored in a pandas DataFrame.

Required Packages

import pandas as pd
import scrapy
from scrapy.crawler import CrawlerProcess

# silence all log output so that the notebook stays readable
import logging
import sys

logging.disable(sys.maxsize)

Scraper Classes

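Any number of scraper classes can be defined here. Each scraper's constructor must accept the shared results dictionary and the number of pages to scrape.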

Reuters News

class reuters_news(scrapy.Spider):
    """
    Spider that collects articles from the Reuters euro-zone news archive.

    For every article page it visits, the spider appends the spider name,
    the article URL and the raw extracted date, title and text fragments
    to the lists of a shared results dictionary.
    """

    name = "reuters_news"

    # host that the relative "/article/..." links are resolved against
    base_url = "https://uk.reuters.com"
    # archive listing; the placeholder is filled with the page number
    archive_url = base_url + "/news/archive/euro-zone-news?view=page&page={}&pageSize=10"

    def __init__(self, dictionary, nr_pages, **kwargs):
        """
        Input:
            dictionary - dict with the keys "scraper", "url", "date", "title"
                         and "text", each mapping to a list that results are
                         appended to
            nr_pages   - number of archive pages to scrape
            kwargs     - optional extra arguments, passed on to scrapy.Spider
        """
        # let scrapy.Spider run its own initialisation as well
        super().__init__(**kwargs)
        self.dictionary = dictionary
        self.nr_pages = nr_pages

    def start_requests(self):
        """Yield one request per archive page, for the pages 1 to nr_pages."""
        for page in range(1, self.nr_pages + 1):
            yield scrapy.Request(url=self.archive_url.format(page),
                                 callback=self.get_press_releases)

    def get_press_releases(self, response):
        """Yield a request for every article link inside the story blocks of an archive page."""
        # './/' keeps the XPath relative to each matched div; a leading '//'
        # would select every link of the whole page, once per matched div
        urls_to_follow = response.css('div.story-content').xpath('.//a/@href').extract()
        for url in urls_to_follow:
            if url.startswith("/article/"):
                yield scrapy.Request(url=self.base_url + url, callback=self.get_information)

    def get_information(self, response):
        """Append scraper name, URL, date, title and text of one article to the results dictionary."""
        self.dictionary["scraper"].append(self.name)
        self.dictionary["url"].append(response.url)
        # date and title are stored as the lists of text fragments returned by extract()
        self.dictionary["date"].append(response.css("div.ArticleHeader_date ::text").extract())
        self.dictionary["title"].append(response.css("h1.ArticleHeader_headline ::text").extract())
        # NOTE(review): '//p' is document-wide and is not limited to the body div.
        # The slice below appears to be tuned to that output, so the expression
        # is left as is - confirm against a live page before scoping it with './/p'.
        text_tmp = response.css('div.StandardArticleBody_body').xpath('//p/text()').extract()
        # the first (duration of reading) and the last (authors) list element are not useful
        self.dictionary["text"].append(text_tmp[1:-1])

Auxiliary Functions

def convert_if_list(obj):
    """
    Join a list of strings into a single, space-separated string.
    Any object that is not a list is handed back unchanged.
    Input:
        obj - list of strings (or any other object)
    """
    if not isinstance(obj, list):
        return obj
    separator = " "
    return separator.join(obj)

def unlist_columns(dct):
    """
    This function applies convert_if_list to every entry of every column
    in the dictionary, whatever the column names are
    Input:
        dct - dictionary of scraped data, mapping column name -> list of entries
    Returns a new dictionary; dct itself is not modified
    """
    return {column: [convert_if_list(entry) for entry in entries]
            for column, entries in dct.items()}

Main Function - Web Scraping

def crawl(scraping_class, nr_pages):
    """
    This function performs the scraping
    Input:
        scraping_class: list (or tuple) of scraper classes, or a single scraper class
        nr_pages: list (or tuple) with the number of pages to scrape, one entry
                  per scraper, or a single number that is used for every scraper
    Returns a dataframe with raw data from the websites (duplicate rows dropped)
    Raises:
        IndexError: if nr_pages has fewer entries than scraping_class
    """

    # accept a single scraper class as well as a list / tuple of classes
    if isinstance(scraping_class, (list, tuple)):
        scrapers = list(scraping_class)
    else:
        scrapers = [scraping_class]

    # accept a single page count as well as a list / tuple of page counts
    if isinstance(nr_pages, (list, tuple)):
        page_counts = list(nr_pages)
    else:
        page_counts = [nr_pages] * len(scrapers)

    # fail before anything is scheduled instead of half-way through the loop
    if len(page_counts) < len(scrapers):
        raise IndexError("nr_pages has fewer entries than scraping_class")

    # initialize empty dictionary, where we store the scraping results
    dict_raw = {"scraper": [],
                "url": [],
                "date": [],
                "title": [],
                "text": []
    }

    # start a scrapy crawler process
    # NOTE(review): a Twisted reactor can presumably not be restarted, so a second
    # call of crawl() in the same Python process is expected to fail - confirm
    process = CrawlerProcess()

    # add every scraper together with its page count to the scraping process;
    # all scrapers append to the same dictionary
    for scraper, pages in zip(scrapers, page_counts):
        process.crawl(scraper, dict_raw, pages)

    # start scraping
    process.start()

    # unlist list of lists to store results in a pd.DataFrame
    dict_unlisted = unlist_columns(dict_raw)

    # convert dict to dataframe
    df_raw = pd.DataFrame(dict_unlisted).drop_duplicates()

    return df_raw

Example

# scrape 50 archive pages with the reuters_news spider (runs around 20 seconds)
df_scraped = crawl([reuters_news], [50])

# show the first three rows of the result
df_scraped.head(3)

	scraper	url	date	title	text
0	reuters_news	https://uk.reuters.com/article/uk-ireland-econ...	February 5, 2020 / 12:18 AM / 6 days ago	Irish consumer sentiment climbs to six-month high	DUBLIN (Reuters) - Irish consumer sentiment hi...
1	reuters_news	https://uk.reuters.com/article/uk-ireland-elec...	February 8, 2020 / 10:16 PM / 2 days ago	Near tie between three main parties in Irish e...	DUBLIN (Reuters) - An Irish national election ...
2	reuters_news	https://uk.reuters.com/article/uk-ecb-banks-bb...	February 6, 2020 / 9:37 PM / 4 days ago	ECB's de Guindos says BBVA spying case has no ...	MADRID (Reuters) - European Central Bank Vicep...

# dimensions of the scraped dataframe as (rows, columns)
df_scraped.shape

(626, 5) –> data from more than 600 news articles in around 20 seconds