add fallback for articles with unparseable date

parent e4837d77c6
commit d37148ea32

1 changed file with 37 additions and 11 deletions

spiderss.py (+37 −11)
@@ -68,17 +68,14 @@ def write_to_file(filepath, text):
     file.close()
 
 
-# Get filename from a date and a title
-def get_filename(date, title):
+# Get filename postfix from a title
+def get_filename_postfix(title):
 
-    # Get date as single block
-    date = date.strftime('%Y%m%d%H%M')
-
     # Get title as lowercase words concatenated with underscores
     title = re.sub('[^A-Za-z0-9 ]+', '', title.lower())
     title = re.sub(' ', '_', title)
 
-    return '{}_{}.{}'.format(date, title, fileending)
+    return '{}.{}'.format(title, fileending)
 
 
 # Get HTML image snippet from the first image url in a text
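Note: the timestamp that get_filename() used to bake into the name is now prepended at the call site in update_feed() (third hunk below), so that articles without a parseable date can be matched by their title part alone. A minimal sketch of what the renamed helper produces; the fileending value here is an assumption, in spiderss it comes from the config:

import re

fileending = 'md'  # assumption: read from the spiderss config in the real script

def get_filename_postfix(title):
    # Lowercase, keep only letters/digits/spaces, join words with underscores
    title = re.sub('[^A-Za-z0-9 ]+', '', title.lower())
    title = re.sub(' ', '_', title)
    return '{}.{}'.format(title, fileending)

print(get_filename_postfix('Hello, World! 42'))  # hello_world_42.md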
@@ -186,7 +183,10 @@ def get_article(article, scrape):
     summary = get_summary_snippet(article.summary)
     if summary == '':
         summary = get_summary_snippet(body)
-    date = datetime.fromtimestamp(mktime(article.published_parsed)).strftime(datetime_format)
+    try:
+        date = datetime.fromtimestamp(mktime(article.published_parsed)).strftime(datetime_format)
+    except:
+        date = datetime.now().strftime(datetime_format)
     head = '<h1>{}</h1>\n\n{}{}<p>{} - <a href={}>Link</a></p>'.format(article.title, image, summary, date, article.link)
 
     # Postprocess article
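Note: with feedparser, article.published_parsed is a time.struct_time only when the feed supplied a date feedparser could parse; otherwise the attribute is missing (attribute access raises AttributeError) or None (mktime() then raises TypeError), and the bare except: above catches either case. A sketch of the same fallback with the exceptions named explicitly; parse_entry_date is a hypothetical helper and the format string an assumption:

from datetime import datetime
from time import mktime

datetime_format = '%d.%m.%Y %H:%M'  # assumption: the real format comes from the config

def parse_entry_date(entry):
    try:
        # struct_time -> epoch seconds -> formatted timestamp
        return datetime.fromtimestamp(mktime(entry.published_parsed)).strftime(datetime_format)
    except (AttributeError, TypeError):
        # No parseable date in the entry: fall back to the crawl time
        return datetime.now().strftime(datetime_format)

Naming the exceptions like this would keep the fallback from masking unrelated errors, which the bare except: in the commit also swallows.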
@@ -210,6 +210,9 @@ def update_feed(feed):
     if not os.path.exists(feedpath_read):
         os.makedirs(feedpath_read)
 
+    # Get exisiting articles
+    existing_articles = os.listdir(feedpath_new) + os.listdir(feedpath_read) + os.listdir(lovedpath)
+
     # Update articles
     articles = get_articles(feed)
     threshold_date = datetime.now() - timedelta(days = max_age)
@@ -217,11 +220,32 @@ def update_feed(feed):
     for a in articles:
 
         try:
-            #TODO: Current time as fallback?
-            date = datetime.fromtimestamp(mktime(a.published_parsed))
+
+            # Set fallback if no parseable date found
+            fallback = False
+            try:
+                date = datetime.fromtimestamp(mktime(a.published_parsed))
+            except:
+                date = datetime.now()
+                fallback = True
+
             if date > threshold_date:
-                filename = get_filename(date, a.title)
-                if not os.path.exists(os.path.join(feedpath_new, filename)) and not os.path.exists(os.path.join(feedpath_read, filename)):
+
+                # Construct filename
+                filename_prefix = date.strftime('%Y%m%d%H%M')
+                filename_postfix = get_filename_postfix(a.title)
+                filename = '{}_{}'.format(filename_prefix, filename_postfix)
+
+                # Check if article exists
+                article_exists = False
+                if fallback:
+                    existing_articles_fallback = [a[13:] for a in existing_articles]
+                    if filename_postfix in existing_articles_fallback:
+                        article_exists = True
+                elif filename in existing_articles:
+                    article_exists = True
+
+                if not article_exists:
                     text = get_article(a, feed['scrape'])
                     write_to_file(os.path.join(feedpath_new, filename), text)
                     log(' added article "{}"'.format(a.title))
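Note: a fallback date is the crawl time and therefore changes on every run, so comparing full filenames would re-add a dateless article on each update. The check instead strips the fixed-width prefix, 12 digits of '%Y%m%d%H%M' plus the '_' separator, i.e. 13 characters, and compares title postfixes only. A small sketch with hypothetical filenames:

existing_articles = ['202101021530_some_article.md']  # hypothetical directory listing

filename_postfix = 'some_article.md'  # what get_filename_postfix(a.title) would return

# 'YYYYmmddHHMM' is 12 characters, plus '_' -> strip the first 13
existing_articles_fallback = [name[13:] for name in existing_articles]
print(filename_postfix in existing_articles_fallback)  # True -> article_exists = True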
@@ -270,6 +294,8 @@ def load_config(filepath):
 # Initialize spiderss
 def initialize():
 
+    global lovedpath
+
     # Create 'loved' directory if not existent
     lovedpath = os.path.join(base_directory, 'loved')
     if not os.path.exists(lovedpath):
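Note: the global statement is needed because update_feed() now reads lovedpath for the existing-articles listing, while the name is first bound inside initialize(); without the declaration, the assignment would create a function-local variable instead. A minimal sketch of that Python scoping rule, with a hypothetical path:

lovedpath = None  # module-level name

def initialize():
    global lovedpath  # bind the assignment below to the module-level name
    lovedpath = '/tmp/spiderss/loved'  # hypothetical path for illustration

def update_feed():
    return lovedpath  # reads the value set by initialize()

initialize()
print(update_feed())  # /tmp/spiderss/loved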