add fallback for articles with unparseable date

This commit is contained in:
Denis Lehmann 2020-04-19 08:56:18 +02:00
parent e4837d77c6
commit d37148ea32

View file

@ -68,17 +68,14 @@ def write_to_file(filepath, text):
file.close() file.close()
# Get filename from a date and a title # Get filename postfix from a title
def get_filename(date, title): def get_filename_postfix(title):
# Get date as single block
date = date.strftime('%Y%m%d%H%M')
# Get title as lowercase words concatenated with underscores # Get title as lowercase words concatenated with underscores
title = re.sub('[^A-Za-z0-9 ]+', '', title.lower()) title = re.sub('[^A-Za-z0-9 ]+', '', title.lower())
title = re.sub(' ', '_', title) title = re.sub(' ', '_', title)
return '{}_{}.{}'.format(date, title, fileending) return '{}.{}'.format(title, fileending)
# Get HTML image snippet from the first image url in a text # Get HTML image snippet from the first image url in a text
@ -186,7 +183,10 @@ def get_article(article, scrape):
summary = get_summary_snippet(article.summary) summary = get_summary_snippet(article.summary)
if summary == '': if summary == '':
summary = get_summary_snippet(body) summary = get_summary_snippet(body)
try:
date = datetime.fromtimestamp(mktime(article.published_parsed)).strftime(datetime_format) date = datetime.fromtimestamp(mktime(article.published_parsed)).strftime(datetime_format)
except:
date = datetime.now().strftime(datetime_format)
head = '<h1>{}</h1>\n\n{}{}<p>{} - <a href={}>Link</a></p>'.format(article.title, image, summary, date, article.link) head = '<h1>{}</h1>\n\n{}{}<p>{} - <a href={}>Link</a></p>'.format(article.title, image, summary, date, article.link)
# Postprocess article # Postprocess article
@ -210,6 +210,9 @@ def update_feed(feed):
if not os.path.exists(feedpath_read): if not os.path.exists(feedpath_read):
os.makedirs(feedpath_read) os.makedirs(feedpath_read)
# Get existing articles
existing_articles = os.listdir(feedpath_new) + os.listdir(feedpath_read) + os.listdir(lovedpath)
# Update articles # Update articles
articles = get_articles(feed) articles = get_articles(feed)
threshold_date = datetime.now() - timedelta(days = max_age) threshold_date = datetime.now() - timedelta(days = max_age)
@ -217,11 +220,32 @@ def update_feed(feed):
for a in articles: for a in articles:
try: try:
#TODO: Current time as fallback?
# Set fallback if no parseable date found
fallback = False
try:
date = datetime.fromtimestamp(mktime(a.published_parsed)) date = datetime.fromtimestamp(mktime(a.published_parsed))
except:
date = datetime.now()
fallback = True
if date > threshold_date: if date > threshold_date:
filename = get_filename(date, a.title)
if not os.path.exists(os.path.join(feedpath_new, filename)) and not os.path.exists(os.path.join(feedpath_read, filename)): # Construct filename
filename_prefix = date.strftime('%Y%m%d%H%M')
filename_postfix = get_filename_postfix(a.title)
filename = '{}_{}'.format(filename_prefix, filename_postfix)
# Check if article exists
article_exists = False
if fallback:
existing_articles_fallback = [a[13:] for a in existing_articles]
if filename_postfix in existing_articles_fallback:
article_exists = True
elif filename in existing_articles:
article_exists = True
if not article_exists:
text = get_article(a, feed['scrape']) text = get_article(a, feed['scrape'])
write_to_file(os.path.join(feedpath_new, filename), text) write_to_file(os.path.join(feedpath_new, filename), text)
log(' added article "{}"'.format(a.title)) log(' added article "{}"'.format(a.title))
@ -270,6 +294,8 @@ def load_config(filepath):
# Initialize spiderss # Initialize spiderss
def initialize(): def initialize():
global lovedpath
# Create 'loved' directory if not existent # Create 'loved' directory if not existent
lovedpath = os.path.join(base_directory, 'loved') lovedpath = os.path.join(base_directory, 'loved')
if not os.path.exists(lovedpath): if not os.path.exists(lovedpath):