From 8aed1df8c748e18762eb9284b5eda9dfc4700d63 Mon Sep 17 00:00:00 2001 From: Denis Lehmann Date: Sat, 18 Apr 2020 20:28:51 +0200 Subject: [PATCH] add user-agent for web requests --- spiderss.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/spiderss.py b/spiderss.py index afd8a71..bef6b3e 100755 --- a/spiderss.py +++ b/spiderss.py @@ -111,11 +111,11 @@ def get_article_body(article, scrape): body = '' - # TODO: Include appropriate header? # If scrape, get article with readability if scrape: - response = requests.get(article.link) + headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36'} + response = requests.get(article.link, headers = headers) doc = Document(response.text) body = doc.summary() @@ -134,7 +134,6 @@ def get_article_body(article, scrape): if new_src.startswith('http'): body = body.replace('"{}"'.format(src), '"{}"'.format(new_src), 1) - # TODO: catch mailto: for a in soup.find_all('a', href = True): href = a.get('href') splitted_href = urlsplit(href)