add handling of missing article links

Denis Lehmann 2020-04-24 18:51:20 +02:00
parent 7d4d1311bb
commit d630cc96c4


@@ -105,21 +105,38 @@ def get_summary_snippet(text):
 # Get article body either from web or its content
-def get_article_body(article, scrape):
+def get_article_body(article, feed):
     body = ''
     # If scrape, get article with readability
-    if scrape:
+    if feed['scrape']:
         headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36'}
         response = requests.get(article.link, headers = headers)
         doc = Document(response.text)
         body = doc.summary()
-    # Replace relative site links with absolute ones, using beautifulsoup
-    splitted_url = urlsplit(article.link)
+    # Else construct from article object
+    else:
+        # Add all content to body
+        if hasattr(article, 'content'):
+            for c in article.content:
+                if c.type == 'text/html' or c.type == 'text/plain':
+                    body += c.value
+        # Use summary as fallback
+        elif hasattr(article, 'summary'):
+            body += article.summary
+    # Replace relative links with absolute ones, using beautifulsoup
+    try:
+        splitted_url = urlsplit(article.link)
+    except:
+        splitted_url = urlsplit(feed['url'])
     soup = BeautifulSoup(body, features = 'lxml')
     for img in soup.find_all('img', src = True):
         src = img.get('src')
         splitted_src = urlsplit(src)
@@ -144,15 +161,6 @@ def get_article_body(article, scrape):
         if new_href.startswith('http'):
             body = body.replace('"{}"'.format(href), '"{}"'.format(new_href), 1)
-    # Else construct from article content
-    else:
-        if hasattr(article, 'content'):
-            for c in article.content:
-                if c.type == 'text/html':
-                    body += c.value
     return body
@@ -172,10 +180,10 @@ def postprocess(text):
 # Get constructed article
-def get_article(article, scrape):
+def get_article(article, feed):
     # Get body of article
-    body = get_article_body(article, scrape)
+    body = get_article_body(article, feed)
     # Construct head of article
     image = get_image_snippet(str(article))
@@ -188,7 +196,13 @@ def get_article(article, scrape):
         date = datetime.fromtimestamp(mktime(article.published_parsed)).strftime(datetime_format)
     except:
         date = datetime.now().strftime(datetime_format)
-    head = '<h1>{}</h1>\n\n{}{}<p>{} - <a href={}>Link</a></p>'.format(article.title, image, summary, date, article.link)
+    try:
+        link = article.link
+    except:
+        splitted_url = urlsplit(feed['url'])
+        splitted_link = [splitted_url.scheme, splitted_url.netloc, '', '', '']
+        link = urlunsplit(splitted_link)
+    head = '<h1>{}</h1>\n\n{}{}<p>{} - <a href={}>Link</a></p>'.format(article.title, image, summary, date, link)
     # Postprocess article
     article_text = postprocess('{}\n\n<hr>\n\n{}'.format(head, body)).strip()
@@ -250,12 +264,12 @@ def update_feed(feed):
                     article_exists = True
             if not article_exists:
-                text = get_article(a, feed['scrape'])
+                text = get_article(a, feed)
                 write_to_file(os.path.join(feedpath_new, filename), text)
                 log(' added article "{}"'.format(a.title))
     except Exception as e:
-        error('while parsing feed article "{}" from feed "{}": {}'.format(a.title, feed['name'], e))
+        error('while parsing article "{}" from feed "{}": {}'.format(a.title, feed['name'], e))
     # Delete articles older than max_age
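
A hedged sketch of the missing-link fallback this commit adds, assuming a feedparser entry and a feed dict with a 'url' key as in the diff; article_link is an illustrative helper, not a function in this repository:

from urllib.parse import urlsplit, urlunsplit

# Sketch only: use the entry's own link if present, otherwise fall back to
# the scheme and host of the configured feed URL.
def article_link(entry, feed):
    try:
        return entry.link
    except AttributeError:
        parts = urlsplit(feed['url'])
        return urlunsplit((parts.scheme, parts.netloc, '', '', ''))

print(article_link(object(), {'url': 'https://example.org/blog/feed.xml'}))
# prints: https://example.org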