add fallback for image and summary

This commit is contained in:
Denis Lehmann 2020-04-18 21:24:37 +02:00
parent 8aed1df8c7
commit e4837d77c6

View file

@@ -81,19 +81,18 @@ def get_filename(date, title):
return '{}_{}.{}'.format(date, title, fileending) return '{}_{}.{}'.format(date, title, fileending)
# If scraped, use first content image as fallback # Get HTML image snippet from the first image url in a text
# Get image snippet for an article def get_image_snippet(text):
def get_article_image(article):
try: try:
image_url = re.search('(?P<image>https?://\S+(\.png|\.jpg|\.jpeg))', str(article), re.IGNORECASE).group('image') image_url = re.search('(?P<image>https?://\S+(\.png|\.jpg|\.jpeg))', text, re.IGNORECASE).group('image')
return '<img src="{}" alt="Image">\n\n'.format(image_url) return '<img src="{}" alt="Image">\n\n'.format(image_url)
except: except:
return '' return ''
# Get summary snippet for an article # Get HTML summary snippet from a HTML text
def get_article_summary(article): def get_summary_snippet(text):
try: try:
h = html2text.HTML2Text() h = html2text.HTML2Text()
@@ -101,6 +100,7 @@ def get_article_summary(article):
h.ignore_links = True h.ignore_links = True
h.ignore_images = True h.ignore_images = True
h.body_width = 0 h.body_width = 0
summary = h.handle(text).split('\n\n')[0].strip()
return '<p><b>{}</b></p>\n\n'.format(summary) return '<p><b>{}</b></p>\n\n'.format(summary)
except: except:
return '' return ''
@@ -176,16 +176,19 @@ def postprocess(text):
# Get constructed article # Get constructed article
def get_article(article, scrape): def get_article(article, scrape):
# Construct head of article
image = get_article_image(article)
summary = get_article_summary(article)
#TODO: Current time as fallback?
date = datetime.fromtimestamp(mktime(article.published_parsed)).strftime(datetime_format)
head = '<h1>{}</h1>\n\n{}{}<p>{} - <a href={}>Link</a></p>'.format(article.title, image, summary, date, article.link)
# Get body of article # Get body of article
body = get_article_body(article, scrape) body = get_article_body(article, scrape)
# Construct head of article
image = get_image_snippet(str(article))
if image == '':
image = get_image_snippet(body)
summary = get_summary_snippet(article.summary)
if summary == '':
summary = get_summary_snippet(body)
date = datetime.fromtimestamp(mktime(article.published_parsed)).strftime(datetime_format)
head = '<h1>{}</h1>\n\n{}{}<p>{} - <a href={}>Link</a></p>'.format(article.title, image, summary, date, article.link)
# Postprocess article # Postprocess article
article_text = postprocess('{}\n\n<hr>\n\n{}'.format(head, body)).strip() article_text = postprocess('{}\n\n<hr>\n\n{}'.format(head, body)).strip()
@@ -214,6 +217,7 @@ def update_feed(feed):
for a in articles: for a in articles:
try: try:
#TODO: Current time as fallback?
date = datetime.fromtimestamp(mktime(a.published_parsed)) date = datetime.fromtimestamp(mktime(a.published_parsed))
if date > threshold_date: if date > threshold_date:
filename = get_filename(date, a.title) filename = get_filename(date, a.title)