add fallback for articles with unparseable date

2020-04-19 08:56:18 +02:00 · 2020-04-19 08:56:18 +02:00 · d37148ea32
commit d37148ea32
parent e4837d77c6
1 changed files with 37 additions and 11 deletions
--- a/spiderss.py
+++ b/spiderss.py
@ -68,17 +68,14 @@ def write_to_file(filepath, text):
    file.close()


-# Get filename from a date and a title
-def get_filename(date, title):
-
-    # Get date as single block
-    date = date.strftime('%Y%m%d%H%M')
+# Get filename postfix from a title
+def get_filename_postfix(title):

    # Get title as lowercase words concatenated with underscores
    title = re.sub('[^A-Za-z0-9 ]+', '', title.lower())
    title = re.sub(' ', '_', title)
    
-    return '{}_{}.{}'.format(date, title, fileending)
+    return '{}.{}'.format(title, fileending)


 # Get HTML image snippet from the first image url in a text
@ -186,7 +183,10 @@ def get_article(article, scrape):
    summary = get_summary_snippet(article.summary)
    if summary == '':
        summary = get_summary_snippet(body)
+    try:
        date = datetime.fromtimestamp(mktime(article.published_parsed)).strftime(datetime_format)
+    except:
+        date = datetime.now().strftime(datetime_format)
    head = '<h1>{}</h1>\n\n{}{}<p>{} - <a href={}>Link</a></p>'.format(article.title, image, summary, date, article.link)

    # Postprocess article
@ -210,6 +210,9 @@ def update_feed(feed):
    if not os.path.exists(feedpath_read):
        os.makedirs(feedpath_read)

+    # Get exisiting articles
+    existing_articles = os.listdir(feedpath_new) + os.listdir(feedpath_read) + os.listdir(lovedpath)
+
    # Update articles
    articles = get_articles(feed)
    threshold_date = datetime.now() - timedelta(days = max_age)
@ -217,11 +220,32 @@ def update_feed(feed):
    for a in articles:
        
        try:
-            #TODO: Current time as fallback?
+
+            # Set fallback if no parseable date found
+            fallback = False
+            try:
                date = datetime.fromtimestamp(mktime(a.published_parsed))
+            except:
+                date = datetime.now()
+                fallback = True
+            
            if date > threshold_date:
-                filename = get_filename(date, a.title)
-                if not os.path.exists(os.path.join(feedpath_new, filename)) and not os.path.exists(os.path.join(feedpath_read, filename)):
+
+                # Construct filename
+                filename_prefix = date.strftime('%Y%m%d%H%M')
+                filename_postfix = get_filename_postfix(a.title)
+                filename = '{}_{}'.format(filename_prefix, filename_postfix)
+
+                # Check if article exists
+                article_exists = False
+                if fallback:
+                    existing_articles_fallback = [a[13:] for a in existing_articles]
+                    if filename_postfix in existing_articles_fallback:
+                        article_exists = True
+                elif filename in existing_articles:
+                    article_exists = True
+
+                if not article_exists:
                    text = get_article(a, feed['scrape'])
                    write_to_file(os.path.join(feedpath_new, filename), text)
                    log('    added article "{}"'.format(a.title))
@ -270,6 +294,8 @@ def load_config(filepath):
 # Initialize spiderss
 def initialize():

+    global lovedpath
+
    # Create 'loved' directory if not existent
    lovedpath = os.path.join(base_directory, 'loved')
    if not os.path.exists(lovedpath):