From 50f54f20c56e9d2bec2378f08b859b56cd02d62c Mon Sep 17 00:00:00 2001
From: Denis Lehmann
Date: Sat, 18 Apr 2020 11:36:31 +0200
Subject: [PATCH] add postprocessing feature

---
 README.md   |  11 ++++-
 config.toml |   6 +++
 spiderss.py | 124 +++++++++++++++++++++++++++++-----------------------
 3 files changed, 85 insertions(+), 56 deletions(-)

diff --git a/README.md b/README.md
index 57e52c6..42d2bc8 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,8 @@ Read the news you want, the way you want it.
 Without advertisements, clickbait and trackers.
 Drop unresponsive web interfaces and stop accepting cookies, because plaintext is God.
 
-Articles are scraped as Markdown files from the original article web page and stored in a special folder structure.
+By default, articles are scraped as Markdown files from the original article web page and stored in a special folder structure.
+You can convert articles to your favourite file format by defining your own postprocessor.
 
 __Note:__ This script is under development and far from being complete.
 Until now it works for the most feeds I read.
@@ -65,6 +66,12 @@ base_directory = '/home/<user>/rss'
 # Articles older than max_age (days) will be deleted and not be added.
 max_age = 30
 
+# Postprocessing command for the articles. The article is piped to the command's stdin as HTML and the result is read back from its stdout.
+postprocessor = 'pandoc -f html -t markdown_strict-raw_html --reference-links --reference-location=document'
+
+# File ending for the article files.
+fileending = 'md'
+
 # Date and time format as strftime to be included in the articles.
 datetime_format = '%d.%m.%Y %H:%M'
 
@@ -146,6 +153,6 @@ Just synchronize the base_directory with [Syncthing](https://syncthing.net/), [r
 
 ## Acknowledgements
 
-Thanks to all the people which created the nice libraries, this project in based on.
+Thanks to all the people who created the nice libraries this project is based on.
 And also thanks to Dieter Steffmann who created the Canterbury font, which is used for the logo.
 You can find it in the `fonts/` directory.
diff --git a/config.toml b/config.toml
index 55febb0..118e391 100644
--- a/config.toml
+++ b/config.toml
@@ -7,6 +7,12 @@ max_age = 30
 # Date and time format as strftime to be included in the articles.
 datetime_format = '%d.%m.%Y %H:%M'
 
+# Postprocessing command for the articles. The article is piped to the command's stdin as HTML and the result is read back from its stdout.
+postprocessor = 'pandoc -f html -t markdown_strict-raw_html --reference-links --reference-location=document'
+
+# File ending for the article files.
+fileending = 'md'
+
 # Feeds
 # The category can be empty (''). The feed fill then be stored in the base_directory.
 # The category can also be a path, which will result in subdirectories (e.g. 'technology/hardware').
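The new `postprocessor` setting is a plain command line: spiderss pipes each article to the command's stdin as HTML and stores whatever the command writes to stdout, with `fileending` used as the file extension. A minimal sketch of that stdin/stdout contract (assuming pandoc is installed and reusing the command from the example config; the snippet itself is not part of the patch):

    import subprocess

    # The pandoc command from the example config above.
    postprocessor = 'pandoc -f html -t markdown_strict-raw_html --reference-links --reference-location=document'
    html = '<h1>Example article</h1>\n<p>Hello <a href="https://example.org">world</a>.</p>'

    # Pipe the HTML through the command, treating it as a stdin/stdout filter.
    proc = subprocess.Popen(postprocessor.split(' '), stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    markdown = proc.communicate(input=html.encode())[0].decode().strip()
    print(markdown)

Any command that behaves like this filter can be swapped in, which is what makes the output format configurable.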
diff --git a/spiderss.py b/spiderss.py
index 1c0dc06..3826fd6 100755
--- a/spiderss.py
+++ b/spiderss.py
@@ -52,39 +52,19 @@ def print_logo():
 Utility functions
 '''
 
 
-# Get readable HTML of a webpage
-def get_readable_html(url):
-    response = requests.get(url)
-    doc = Document(response.text)
-    return doc.summary()
-
-
-# Convert HTML to Markdown
-def html_to_markdown(html):
-    h = html2text.HTML2Text()
-    h.unicode_snob = True
-    h.ignore_links = True
-    h.ignore_images = False
-    #h.ignore_anchors = True
-    #h.skip_internal_links = True
-    #h.protect_links = True
-    #h.use_automatic_links = True
-    h.body_width = 0
-    return h.handle(html).strip()
-
-
 # Get articles of a feed
-def get_articles(feed_url):
-    feed = feedparser.parse(feed_url)
+def get_articles(feed):
+    feed = feedparser.parse(feed['url'])
     return feed.entries
 
 
 # Write text to file
 def write_to_file(filepath, text):
-    # Postprocess article with pandoc and write to file
-    pandoc = subprocess.Popen(['pandoc', '-f', 'markdown', '-t', 'markdown', '-o', filepath], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
-    pandoc.communicate(input = text.encode())
+    file = open(filepath, 'w')
+    file.write(text)
+    file.close()
 
 
 # Get filename from a date and a title
@@ -97,72 +77,104 @@ def get_filename(date, title):
     title = re.sub('[^A-Za-z0-9 ]+', '', title.lower())
     title = re.sub(' ', '_', title)
-    return '{}_{}.md'.format(date, title)
+    return '{}_{}.{}'.format(date, title, fileending)
+
+
+# Get image snippet for an article
+def get_article_image(article):
+
+    try:
+        image_url = re.search('(?P<image>https?://\S+(\.png|\.jpg|\.jpeg))', str(article), re.IGNORECASE).group('image')
+        return '<img src="{}" alt="Image">\n\n'.format(image_url)
+    except:
+        return ''
 
 
 # Get summary snippet for an article
 def get_article_summary(article):
+
     try:
         h = html2text.HTML2Text()
         h.unicode_snob = True
         h.ignore_links = True
        h.ignore_images = True
-        #h.ignore_anchors = True
-        #h.skip_internal_links = True
         h.body_width = 0
-        summary = h.handle(article.summary).split('\n\n')[0].strip()
-        return '**{}**\n\n'.format(summary)
+
+        summary = h.handle(article.summary).split('\n\n')[0].strip()
+        return '<p><strong>{}</strong></p>\n\n'.format(summary)
     except:
         return ''
 
 
-# Get image snippet for an article
-def get_article_image(article):
-    try:
-        image_url = re.search('(?P<image>https?://\S+(\.png|\.jpg|\.jpeg))', str(article), re.IGNORECASE).group('image')
-        return '![Image]({})\n\n'.format(image_url)
-    except:
-        return ''
+# Get article body either from web or its content
+def get_article_body(article, scrape):
 
+    body = ''
 
-# Get text from an article
-def get_article(article, scrape):
-
-    # Construct head of article
-    image_url = get_article_image(article)
-    date = datetime.fromtimestamp(mktime(article.published_parsed)).strftime(datetime_format)
-    head = '# {}\n\n{}{}{}\n\n[Link]({})'.format(article.title, image_url, get_article_summary(article), date, article.link)
-
-    # Get body of article
+    # If scrape, get article with readability
     if scrape:
-        body_html = get_readable_html(article.link)
+
+        response = requests.get(article.link)
+        doc = Document(response.text)
+        body = doc.summary()
+
+    # Else construct from article content
     else:
-        body_html = ''
+
         if hasattr(article, 'content'):
             for c in article.content:
                 if c.type == 'text/html':
-                    body_html += c.value
+                    body += c.value
 
-    body = html_to_markdown(body_html)
+    return body
+
+
+# Postprocess HTML
+def postprocess(text):
+
+    processor = subprocess.Popen(postprocessor.split(' '), stdin = subprocess.PIPE, stdout = subprocess.PIPE, stderr = subprocess.STDOUT)
+    output = processor.communicate(input = text.encode())[0].decode().strip()
+
+    return output
+
+
+# Get constructed article
+def get_article(article, scrape):
+
+    # Construct head of article
+    image = get_article_image(article)
+    summary = get_article_summary(article)
+    date = datetime.fromtimestamp(mktime(article.published_parsed)).strftime(datetime_format)
+    head = '<h1>{}</h1>\n\n{}{}<p>{} - <a href="{}">Link</a></p>'.format(article.title, image, summary, date, article.link)
+
+    # Get body of article
+    body = get_article_body(article, scrape)
+
+    # Postprocess article
+    article_text = postprocess('{}\n\n<hr>\n\n{}'.format(head, body)).strip()
+
+    return article_text
 
-    return '{}\n\n---\n\n{}'.format(head, body)
-
 
 # Update feed
 def update_feed(feed):
 
     log(' updating feed "{}"'.format(feed['name']))
 
+    # Set feedpaths
     feedpath_new = os.path.join(base_directory, feed['category'], feed['name'], 'new')
     feedpath_read = os.path.join(base_directory, feed['category'], feed['name'], 'read')
+
     if not os.path.exists(feedpath_new):
         os.makedirs(feedpath_new)
+
     if not os.path.exists(feedpath_read):
         os.makedirs(feedpath_read)
 
-    articles = get_articles(feed['url'])
+    # Update articles
+    articles = get_articles(feed)
     threshold_date = datetime.now() - timedelta(days = max_age)
+
     for a in articles:
+
         try:
             date = datetime.fromtimestamp(mktime(a.published_parsed))
             if date > threshold_date:
@@ -171,6 +183,7 @@ def update_feed(feed):
                 text = get_article(a, feed['scrape'])
                 write_to_file(os.path.join(feedpath_new, filename), text)
                 log(' added article "{}"'.format(a.title))
+
         except Exception as e:
             error('while parsing feed article "{}" from feed "{}": {}'.format(a.title, feed['name'], e))
 
@@ -193,16 +206,19 @@ def remove_old_articles():
     log(' removed {} articles'.format(count))
 
 
+
 # Parse config file
 def load_config(filepath):
 
-    global base_directory, max_age, datetime_format, feeds
+    global base_directory, max_age, datetime_format, postprocessor, fileending, feeds
 
     try:
         config = toml.load(filepath)
         base_directory = config['base_directory']
         max_age = config['max_age']
         datetime_format = config['datetime_format']
+        postprocessor = config['postprocessor']
+        fileending = config['fileending']
         feeds = config['feed']
     except Exception as e:
         error('while parsing config: {}'.format(e))
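Since the command comes from the config, the postprocessor does not have to be pandoc: anything that reads HTML on stdin and writes the target format to stdout should work. A hypothetical example of a user-defined postprocessor (the script name and behaviour are made up for illustration and are not part of this patch):

    #!/usr/bin/env python3
    # strip_tags.py - hypothetical custom postprocessor: reads the HTML article
    # from stdin and writes a crude plain-text version to stdout.
    import html
    import re
    import sys

    raw = sys.stdin.read()
    text = re.sub(r'<[^>]+>', '', raw)   # drop all tags, keep the text content
    print(html.unescape(text).strip())

Configured as postprocessor = 'python3 strip_tags.py' together with fileending = 'txt', spiderss would then store plain-text articles instead of Markdown.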