add fallback for image and summary

parent 8aed1df8c7
commit e4837d77c6

1 changed file with 17 additions and 13 deletions: spiderss.py (30 changes)
--- a/spiderss.py
+++ b/spiderss.py
@@ -81,19 +81,18 @@ def get_filename(date, title):
     return '{}_{}.{}'.format(date, title, fileending)
 
 
-# If scraped, use first content image as fallback
-# Get image snippet for an article
-def get_article_image(article):
+# Get HTML image snippet from the first image url in a text
+def get_image_snippet(text):
 
     try:
-        image_url = re.search('(?P<image>https?://\S+(\.png|\.jpg|\.jpeg))', str(article), re.IGNORECASE).group('image')
+        image_url = re.search('(?P<image>https?://\S+(\.png|\.jpg|\.jpeg))', text, re.IGNORECASE).group('image')
         return '<img src="{}" alt="Image">\n\n'.format(image_url)
     except:
         return ''
 
 
-# Get summary snippet for an article
-def get_article_summary(article):
+# Get HTML summary snippet from a HTML text
+def get_summary_snippet(text):
 
     try:
         h = html2text.HTML2Text()
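As a quick check on the renamed helper, the regex can be exercised on its own. The inputs below are made up; the pattern is copied from the hunk above (written as a raw string here, while the file uses a plain string literal):

import re

# Pattern from get_image_snippet: first http(s) URL ending in .png/.jpg/.jpeg.
pattern = r'(?P<image>https?://\S+(\.png|\.jpg|\.jpeg))'

samples = [
    '<img src="https://example.org/cover.JPG">',            # extension match is case-insensitive
    'plain text with http://example.org/pic.jpeg inline',   # works on any text, not only HTML
    '<p>no image here</p>',                                  # no match: re.search returns None
]

for text in samples:
    match = re.search(pattern, text, re.IGNORECASE)
    # In the real helper a missing match makes .group() raise on None, and the
    # bare except turns that into the empty string the caller tests against.
    print(match.group('image') if match else '(no match -> helper returns an empty string)')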
@@ -101,6 +100,7 @@ def get_article_summary(article):
         h.ignore_links = True
         h.ignore_images = True
         h.body_width = 0
+        summary = h.handle(text).split('\n\n')[0].strip()
         return '<p><b>{}</b></p>\n\n'.format(summary)
     except:
         return ''
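The added line is what produces the summary: html2text converts the entry HTML to plain text and only the first blank-line-separated paragraph is kept. A minimal standalone sketch of that path, assuming the html2text package is installed; the sample HTML is made up:

import html2text

def get_summary_snippet(text):
    # Mirrors the function after this commit; the original uses a bare except.
    try:
        h = html2text.HTML2Text()
        h.ignore_links = True    # drop [text](url) markup from the output
        h.ignore_images = True   # drop ![alt](url) markup from the output
        h.body_width = 0         # disable hard line wrapping
        summary = h.handle(text).split('\n\n')[0].strip()  # keep the first paragraph only
        return '<p><b>{}</b></p>\n\n'.format(summary)
    except Exception:
        return ''

sample = '<p>First paragraph of the entry.</p><p>Second paragraph.</p>'
print(get_summary_snippet(sample))  # prints: <p><b>First paragraph of the entry.</b></p>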
@@ -176,16 +176,19 @@ def postprocess(text):
 # Get constructed article
 def get_article(article, scrape):
 
-    # Construct head of article
-    image = get_article_image(article)
-    summary = get_article_summary(article)
-    #TODO: Current time as fallback?
-    date = datetime.fromtimestamp(mktime(article.published_parsed)).strftime(datetime_format)
-    head = '<h1>{}</h1>\n\n{}{}<p>{} - <a href={}>Link</a></p>'.format(article.title, image, summary, date, article.link)
 
     # Get body of article
     body = get_article_body(article, scrape)
 
+    # Construct head of article
+    image = get_image_snippet(str(article))
+    if image == '':
+        image = get_image_snippet(body)
+    summary = get_summary_snippet(article.summary)
+    if summary == '':
+        summary = get_summary_snippet(body)
+    date = datetime.fromtimestamp(mktime(article.published_parsed)).strftime(datetime_format)
+    head = '<h1>{}</h1>\n\n{}{}<p>{} - <a href={}>Link</a></p>'.format(article.title, image, summary, date, article.link)
 
     # Postprocess article
     article_text = postprocess('{}\n\n<hr>\n\n{}'.format(head, body)).strip()
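This hunk is the core of the change: the body is now built before the head so it can serve as a second source when the feed entry itself yields nothing. A self-contained sketch of the two-step fallback, with a stub standing in for get_image_snippet/get_summary_snippet and made-up strings standing in for the entry and the scraped body:

def get_snippet(text):
    # Stand-in for the real helpers: an empty string means "nothing usable",
    # which is exactly the signal the fallback below relies on.
    return '<p><b>{}</b></p>\n\n'.format(text) if text else ''

entry_summary = ''                                    # feed entry without a summary (made up)
body = 'Full article text scraped from the website'   # result of get_article_body (made up)

snippet = get_snippet(entry_summary)  # 1) prefer what the feed entry provides
if snippet == '':
    snippet = get_snippet(body)       # 2) otherwise fall back to the article body

print(snippet)  # built from the body, because the entry had nothing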
@@ -214,6 +217,7 @@ def update_feed(feed):
     for a in articles:
 
         try:
+            #TODO: Current time as fallback?
             date = datetime.fromtimestamp(mktime(a.published_parsed))
             if date > threshold_date:
                 filename = get_filename(date, a.title)
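The TODO carried over into update_feed is not implemented by this commit. One possible shape, shown purely as a sketch and assuming a feedparser-style entry: fall back to the current time when the entry has no published_parsed value (entry_date and DummyEntry are illustrative names, not part of spiderss):

from datetime import datetime
from time import mktime

def entry_date(entry):
    parsed = getattr(entry, 'published_parsed', None)  # missing on undated entries
    if parsed is None:
        return datetime.now()                          # hypothetical current-time fallback
    return datetime.fromtimestamp(mktime(parsed))

class DummyEntry:
    pass  # stands in for a feed entry that carries no publication date

print(entry_date(DummyEntry()))  # -> current local time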