add fallback for articles with unparseable date
This commit is contained in:
parent
e4837d77c6
commit
d37148ea32
1 changed files with 37 additions and 11 deletions
44
spiderss.py
44
spiderss.py
|
|
@ -68,17 +68,14 @@ def write_to_file(filepath, text):
|
|||
file.close()
|
||||
|
||||
|
||||
# Get filename from a date and a title
|
||||
def get_filename(date, title):
|
||||
|
||||
# Get date as single block
|
||||
date = date.strftime('%Y%m%d%H%M')
|
||||
# Get filename postfix from a title
|
||||
def get_filename_postfix(title):
|
||||
|
||||
# Get title as lowercase words concatenated with underscores
|
||||
title = re.sub('[^A-Za-z0-9 ]+', '', title.lower())
|
||||
title = re.sub(' ', '_', title)
|
||||
|
||||
return '{}_{}.{}'.format(date, title, fileending)
|
||||
return '{}.{}'.format(title, fileending)
|
||||
|
||||
|
||||
# Get HTML image snippet from the first image url in a text
|
||||
|
|
@ -186,7 +183,10 @@ def get_article(article, scrape):
|
|||
summary = get_summary_snippet(article.summary)
|
||||
if summary == '':
|
||||
summary = get_summary_snippet(body)
|
||||
try:
|
||||
date = datetime.fromtimestamp(mktime(article.published_parsed)).strftime(datetime_format)
|
||||
except:
|
||||
date = datetime.now().strftime(datetime_format)
|
||||
head = '<h1>{}</h1>\n\n{}{}<p>{} - <a href={}>Link</a></p>'.format(article.title, image, summary, date, article.link)
|
||||
|
||||
# Postprocess article
|
||||
|
|
@ -210,6 +210,9 @@ def update_feed(feed):
|
|||
if not os.path.exists(feedpath_read):
|
||||
os.makedirs(feedpath_read)
|
||||
|
||||
# Get exisiting articles
|
||||
existing_articles = os.listdir(feedpath_new) + os.listdir(feedpath_read) + os.listdir(lovedpath)
|
||||
|
||||
# Update articles
|
||||
articles = get_articles(feed)
|
||||
threshold_date = datetime.now() - timedelta(days = max_age)
|
||||
|
|
@ -217,11 +220,32 @@ def update_feed(feed):
|
|||
for a in articles:
|
||||
|
||||
try:
|
||||
#TODO: Current time as fallback?
|
||||
|
||||
# Set fallback if no parseable date found
|
||||
fallback = False
|
||||
try:
|
||||
date = datetime.fromtimestamp(mktime(a.published_parsed))
|
||||
except:
|
||||
date = datetime.now()
|
||||
fallback = True
|
||||
|
||||
if date > threshold_date:
|
||||
filename = get_filename(date, a.title)
|
||||
if not os.path.exists(os.path.join(feedpath_new, filename)) and not os.path.exists(os.path.join(feedpath_read, filename)):
|
||||
|
||||
# Construct filename
|
||||
filename_prefix = date.strftime('%Y%m%d%H%M')
|
||||
filename_postfix = get_filename_postfix(a.title)
|
||||
filename = '{}_{}'.format(filename_prefix, filename_postfix)
|
||||
|
||||
# Check if article exists
|
||||
article_exists = False
|
||||
if fallback:
|
||||
existing_articles_fallback = [a[13:] for a in existing_articles]
|
||||
if filename_postfix in existing_articles_fallback:
|
||||
article_exists = True
|
||||
elif filename in existing_articles:
|
||||
article_exists = True
|
||||
|
||||
if not article_exists:
|
||||
text = get_article(a, feed['scrape'])
|
||||
write_to_file(os.path.join(feedpath_new, filename), text)
|
||||
log(' added article "{}"'.format(a.title))
|
||||
|
|
@ -270,6 +294,8 @@ def load_config(filepath):
|
|||
# Initialize spiderss
|
||||
def initialize():
|
||||
|
||||
global lovedpath
|
||||
|
||||
# Create 'loved' directory if not existent
|
||||
lovedpath = os.path.join(base_directory, 'loved')
|
||||
if not os.path.exists(lovedpath):
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue