add user-agent for web requests

This commit is contained in:
Denis Lehmann 2020-04-18 20:28:51 +02:00
parent 877aff0475
commit 8aed1df8c7

View file

@@ -111,11 +111,11 @@ def get_article_body(article, scrape):
     body = ''
-    # TODO: Include appropriate header?
     # If scrape, get article with readability
     if scrape:
-        response = requests.get(article.link)
+        headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36'}
+        response = requests.get(article.link, headers = headers)
         doc = Document(response.text)
         body = doc.summary()
@@ -134,7 +134,6 @@ def get_article_body(article, scrape):
             if new_src.startswith('http'):
                 body = body.replace('"{}"'.format(src), '"{}"'.format(new_src), 1)
-    # TODO: catch mailto:
     for a in soup.find_all('a', href = True):
         href = a.get('href')
         splitted_href = urlsplit(href)