From e4837d77c641944296cb0c14302f376005bdfb01 Mon Sep 17 00:00:00 2001 From: Denis Lehmann Date: Sat, 18 Apr 2020 21:24:37 +0200 Subject: [PATCH] add fallback for image and summary --- spiderss.py | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/spiderss.py b/spiderss.py index bef6b3e..9ef859b 100755 --- a/spiderss.py +++ b/spiderss.py @@ -81,19 +81,18 @@ def get_filename(date, title): return '{}_{}.{}'.format(date, title, fileending) -# If scraped, use first content image as fallback -# Get image snippet for an article -def get_article_image(article): +# Get HTML image snippet from the first image url in a text +def get_image_snippet(text): try: - image_url = re.search('(?P<image>https?://\S+(\.png|\.jpg|\.jpeg))', str(article), re.IGNORECASE).group('image') + image_url = re.search('(?P<image>https?://\S+(\.png|\.jpg|\.jpeg))', text, re.IGNORECASE).group('image') return '<img src="{}" alt="Image">\n\n'.format(image_url) except: return '' -# Get summary snippet for an article -def get_article_summary(article): +# Get HTML summary snippet from a HTML text +def get_summary_snippet(text): try: h = html2text.HTML2Text() @@ -101,6 +100,7 @@ def get_article_summary(article): h.ignore_links = True h.ignore_images = True h.body_width = 0 + summary = h.handle(text).split('\n\n')[0].strip() return '

<p><b>{}</b></p>\n\n'.format(summary) except: return '' @@ -176,16 +176,19 @@ def postprocess(text): # Get constructed article def get_article(article, scrape): - # Construct head of article - image = get_article_image(article) - summary = get_article_summary(article) - #TODO: Current time as fallback? - date = datetime.fromtimestamp(mktime(article.published_parsed)).strftime(datetime_format) - head = '<h1>{}</h1>\n\n{}{}<p>{} - <a href="{}">Link</a></p>'.format(article.title, image, summary, date, article.link) - # Get body of article body = get_article_body(article, scrape) + # Construct head of article + image = get_image_snippet(str(article)) + if image == '': + image = get_image_snippet(body) + summary = get_summary_snippet(article.summary) + if summary == '': + summary = get_summary_snippet(body) + date = datetime.fromtimestamp(mktime(article.published_parsed)).strftime(datetime_format) + head = '<h1>{}</h1>\n\n{}{}<p>{} - <a href="{}">Link</a></p>'.format(article.title, image, summary, date, article.link) + # Postprocess article article_text = postprocess('{}\n\n<hr>\n\n{}'.format(head, body)).strip() @@ -214,6 +217,7 @@ def update_feed(feed): for a in articles: try: + #TODO: Current time as fallback? date = datetime.fromtimestamp(mktime(a.published_parsed)) if date > threshold_date: filename = get_filename(date, a.title)