1.3. Download HTML Files
Author: Johannes Maucher
Last update: 2018-10-12
#!pip install python-slugify   # package that provides the "slugify" module imported below
import bs4
import requests
from slugify import slugify
import datetime
import os
from urllib.request import urlopen
1.3.1. Define lists of news domains for different languages and categories
today=datetime.datetime.now().strftime("%Y-%m-%d")
print(today)
2021-11-15
cat="GENERAL"
#cat="TECH"
#cat="SPORT"
lang="GERMAN"
#lang="ENGLISH"
general_sources_de = ['http://www.zeit.de',
                      'http://www.spiegel.de/',
                      'http://www.welt.de/',
                      'http://www.sueddeutsche.de',
                      'http://www.faz.net'
                      ]
general_sources_en = ['https://www.washingtonpost.com',
                      'http://www.nytimes.com/',
                      'http://www.chicagotribune.com/',
                      'http://www.bostonherald.com/',
                      'http://www.sfchronicle.com/']
tech_sources_de = ['http://chip.de/',
                   'http://t-online.de',
                   'http://www.computerbild.de',
                   'http://www.heise.de',
                   'http://www.golem.de']
tech_sources_en = ['http://radar.oreilly.com/',
                   'https://www.cnet.com/news/',
                   'http://www.techradar.com/news/computing'
                   ]
if lang == "ENGLISH":
    if cat == "GENERAL":
        sources = general_sources_en
    else:
        sources = tech_sources_en
else:
    if cat == "GENERAL":
        sources = general_sources_de
    else:
        sources = tech_sources_de
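The same selection can also be written more compactly as a dictionary lookup. The following is an equivalent sketch, assuming lang and cat take one of the values shown above (for other values the nested if/else falls back to a default, while the lookup would raise a KeyError):
# Map (language, category) pairs to the corresponding source lists
source_map = {("ENGLISH", "GENERAL"): general_sources_en,
              ("ENGLISH", "TECH"):    tech_sources_en,
              ("GERMAN",  "GENERAL"): general_sources_de,
              ("GERMAN",  "TECH"):    tech_sources_de}
sources = source_map[(lang, cat)]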
1.3.2. Download subdomain HTML pages
The function crawl(), defined below, collects all links on the given start page that belong to the same domain and saves the HTML files of these subpages.
def crawl(url, maxSubSites=5, category="GENERAL", lang="GERMAN"):
    # Extract the domain, e.g. "zeit.de" from "http://www.zeit.de"
    domain = url.split("//")[-1].split("/")[0]
    if domain.startswith("www."):
        domain = domain[4:]
    print(domain)
    dirname = lang + "/" + category + "/" + "HTML/" + domain.split('.')[0] + "-" + today
    try:
        os.makedirs(dirname)
    except OSError:
        print("Directory %s already exists." % dirname)
    # Save the start page itself
    filename = dirname + "/" + domain.split('.')[0] + '.html'
    html = requests.get(url).content
    with open(filename, 'wb') as f:
        f.write(html)
    soup = bs4.BeautifulSoup(html, "html.parser")
    links = set(soup.find_all('a', href=True))  # find all links on this page
    count = 0
    for link in links:
        if count > maxSubSites:
            break
        sub_url = link['href']
        page_name = link.string
        # Only follow links that point to the same domain
        if domain in sub_url:
            count += 1
            try:
                page = requests.get(sub_url).content
                filename = dirname + "/" + slugify(page_name).lower() + '.html'
                with open(filename, 'wb') as f:
                    f.write(page)
            except Exception:
                # Skip links that cannot be downloaded or have no usable link text
                pass
    return dirname
htmlDirs = []
for url in sources:
    htmldir = crawl(url, maxSubSites=50, category=cat, lang=lang)
    htmlDirs.append(htmldir)
zeit.de
spiegel.de
welt.de
print(htmlDirs)
['GERMAN/GENERAL/HTML/zeit-2020-09-09', 'GERMAN/GENERAL/HTML/spiegel-2020-09-09', 'GERMAN/GENERAL/HTML/welt-2020-09-09', 'GERMAN/GENERAL/HTML/sueddeutsche-2020-09-09', 'GERMAN/GENERAL/HTML/faz-2020-09-09']
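As a quick sanity check one can count how many HTML files were actually saved per source. This is a minimal sketch; it assumes the crawl above has finished and the listed directories exist:
# Number of downloaded HTML files per source directory
for d in htmlDirs:
    print(d, ":", len(os.listdir(d)), "files")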
1.3.3. Crawl raw text from local HTML files
TAGS = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p']   # tags whose text content is extracted
def read_html(path, minlen=15):
    # Parse a saved HTML file and yield the text of all headline and paragraph
    # tags that contain more than minlen characters.
    with open(path, 'r', errors='ignore') as f:
        html = f.read()
    soup = bs4.BeautifulSoup(html, "html.parser")
    for tag in soup.find_all(TAGS):
        text = tag.get_text()
        if len(text) > minlen:
            yield text
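Before processing all directories, read_html() can be tried on a single file. This is just a sketch; it picks the first file in the first HTML directory, whatever that happens to be:
# Inspect the text blocks extracted from one example HTML file
samplefile = htmlDirs[0] + "/" + os.listdir(htmlDirs[0])[0]
for text in read_html(samplefile):
    print(text[:80])   # show only the first 80 characters of each block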
rawtextlist = []
for dir in htmlDirs:
    # Create a parallel TEXT directory for each HTML directory
    textdir = dir.replace("HTML", "TEXT")
    try:
        os.makedirs(textdir)
    except OSError:
        pass
    for htmlfile in os.listdir(dir):
        htmlpath = dir + "/" + htmlfile
        rawText = list(read_html(htmlpath))   # materialize the generator so the texts can be reused
        rawtextlist.append(rawText)
        textpath = htmlpath.replace("HTML", "TEXT")
        textfilename = textpath.replace(".html", ".txt")
        with open(textfilename, "w") as f:
            for cont in rawText:
                f.write(cont + "\n")   # one extracted text block per line
#print(len(rawtextlist))
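To get a first impression of the extraction quality, the beginning of one of the generated raw-text files can be printed. This is a sketch; it simply picks the first .txt file in the first TEXT directory:
# Print the first lines of one example raw-text file
sampledir = htmlDirs[0].replace("HTML", "TEXT")
samplefile = sampledir + "/" + os.listdir(sampledir)[0]
with open(samplefile, "r") as f:
    for line in f.readlines()[:10]:
        print(line.strip())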
1.3.4. Questions
Inspect some of the created raw-text files and suggest improvements.