add postprocessing feature

Denis Lehmann 2020-04-18 11:36:31 +02:00
parent 400631fba9
commit 50f54f20c5
3 changed files with 85 additions and 56 deletions


@@ -9,7 +9,8 @@ Read the news you want, the way you want it.
Without advertisements, clickbait and trackers.
Drop unresponsive web interfaces and stop accepting cookies, because plaintext is God.
Articles are scraped as Markdown files from the original article web page and stored in a special folder structure.
Articles are scraped by default as Markdown files from the original article web page and stored in a special folder structure.
You can convert articles into your favourite file format by defining your own postprocessor.
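For example, a hypothetical configuration (assuming pandoc is installed; these are not the shipped defaults) that stores articles as plain text instead of Markdown could look like this:

```toml
# Hypothetical example: convert the scraped HTML to plain text
postprocessor = 'pandoc -f html -t plain'
fileending = 'txt'
```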
__Note:__ This script is under development and far from being complete.
So far it works for most of the feeds I read.
@@ -65,6 +66,12 @@ base_directory = '/home/<user>/rss'
# Articles older than max_age (days) will be deleted and not be added.
max_age = 30
# Postprocessing command for the articles. The article is piped to the command's stdin as HTML and the processed result is read from its stdout.
postprocessor = 'pandoc -f html -t markdown_strict-raw_html --reference-links --reference-location=document'
# File ending (extension) for the article files.
fileending = 'md'
# Date and time format as strftime to be included in the articles.
datetime_format = '%d.%m.%Y %H:%M'
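# As an illustration only (a hypothetical alternative, not part of the defaults above),
# an ISO-style timestamp such as 2020-04-18 11:36 could be configured with:
# datetime_format = '%Y-%m-%d %H:%M'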
@@ -146,6 +153,6 @@ Just synchronize the base_directory with [Syncthing](https://syncthing.net/), [r
## Acknowledgements
Thanks to all the people which created the nice libraries, this project in based on.
Thanks to all the people who created the nice libraries this project is based on.
And also thanks to Dieter Steffmann who created the Canterbury font, which is used for the logo.
You can find it in the `fonts/` directory.


@@ -7,6 +7,12 @@ max_age = 30
# Date and time format as strftime to be included in the articles.
datetime_format = '%d.%m.%Y %H:%M'
# Postprocessing command for the articles. The article is piped to the command's stdin as HTML and the processed result is read from its stdout.
postprocessor = 'pandoc -f html -t markdown_strict-raw_html --reference-links --reference-location=document'
# File ending (extension) for the article files.
fileending = 'md'
# Feeds
# The category can be empty (''). The feed will then be stored in the base_directory.
# The category can also be a path, which will result in subdirectories (e.g. 'technology/hardware').
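# A hypothetical example entry (assuming the [[feed]] table syntax; the keys below are the ones the script reads):
# [[feed]]
# name = 'Example Feed'
# url = 'https://example.com/rss.xml'
# category = 'technology/hardware'
# scrape = true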


@@ -52,39 +52,19 @@ def print_logo():
Utility functions
'''
# Get readable HTML of a webpage
def get_readable_html(url):
    response = requests.get(url)
    doc = Document(response.text)
    return doc.summary()
# Convert HTML to Markdown
def html_to_markdown(html):
    h = html2text.HTML2Text()
    h.unicode_snob = True
    h.ignore_links = True
    h.ignore_images = False
    #h.ignore_anchors = True
    #h.skip_internal_links = True
    #h.protect_links = True
    #h.use_automatic_links = True
    h.body_width = 0
    return h.handle(html).strip()
# Get articles of a feed
def get_articles(feed_url):
    feed = feedparser.parse(feed_url)
def get_articles(feed):
    feed = feedparser.parse(feed['url'])
    return feed.entries
# Write text to file
def write_to_file(filepath, text):
    # Postprocess article with pandoc and write to file
    pandoc = subprocess.Popen(['pandoc', '-f', 'markdown', '-t', 'markdown', '-o', filepath], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    pandoc.communicate(input = text.encode())
    file = open(filepath, 'w')
    file.write(text)
    file.close()
# Get filename from a date and a title
@@ -97,72 +77,104 @@ def get_filename(date, title):
    title = re.sub('[^A-Za-z0-9 ]+', '', title.lower())
    title = re.sub(' ', '_', title)
    return '{}_{}.md'.format(date, title)
    return '{}_{}.{}'.format(date, title, fileending)
# Get image snippet for an article
def get_article_image(article):
    try:
        image_url = re.search('(?P<image>https?://\S+(\.png|\.jpg|\.jpeg))', str(article), re.IGNORECASE).group('image')
        return '<img src="{}" alt="Image">\n\n'.format(image_url)
    except:
        return ''
# Get summary snippet for an article
def get_article_summary(article):
    try:
        h = html2text.HTML2Text()
        h.unicode_snob = True
        h.ignore_links = True
        h.ignore_images = True
        #h.ignore_anchors = True
        #h.skip_internal_links = True
        h.body_width = 0
        summary = h.handle(article.summary).split('\n\n')[0].strip()
        return '**{}**\n\n'.format(summary)
        return '<p><b>{}</b></p>\n\n'.format(summary)
    except:
        return ''
# Get image snippet for an article
def get_article_image(article):
    try:
        image_url = re.search('(?P<image>https?://\S+(\.png|\.jpg|\.jpeg))', str(article), re.IGNORECASE).group('image')
        return '![Image]({})\n\n'.format(image_url)
    except:
        return ''
# Get article body either from web or its content
def get_article_body(article, scrape):
    body = ''
# Get text from an article
def get_article(article, scrape):
    # Construct head of article
    image_url = get_article_image(article)
    date = datetime.fromtimestamp(mktime(article.published_parsed)).strftime(datetime_format)
    head = '# {}\n\n{}{}{}\n\n[Link]({})'.format(article.title, image_url, get_article_summary(article), date, article.link)
    # Get body of article
    # If scrape, get article with readability
    if scrape:
        body_html = get_readable_html(article.link)
        response = requests.get(article.link)
        doc = Document(response.text)
        body = doc.summary()
    # Else construct from article content
    else:
        body_html = ''
        if hasattr(article, 'content'):
            for c in article.content:
                if c.type == 'text/html':
                    body_html += c.value
                    body += c.value
    body = html_to_markdown(body_html)
    return body
# Postprocess HTML
def postprocess(text):
    processor = subprocess.Popen(postprocessor.split(' '), stdin = subprocess.PIPE, stdout = subprocess.PIPE, stderr = subprocess.STDOUT)
    output = processor.communicate(input = text.encode())[0].decode().strip()
    return output
# Get constructed article
def get_article(article, scrape):
    # Construct head of article
    image = get_article_image(article)
    summary = get_article_summary(article)
    date = datetime.fromtimestamp(mktime(article.published_parsed)).strftime(datetime_format)
    head = '<h1>{}</h1>\n\n{}{}<p>{} - <a href={}>Link</a></p>'.format(article.title, image, summary, date, article.link)
    # Get body of article
    body = get_article_body(article, scrape)
    # Postprocess article
    article_text = postprocess('{}\n\n<hr>\n\n{}'.format(head, body)).strip()
    return article_text
    return '{}\n\n---\n\n{}'.format(head, body)
# Update feed
def update_feed(feed):
    log(' updating feed "{}"'.format(feed['name']))
    # Set feedpaths
    feedpath_new = os.path.join(base_directory, feed['category'], feed['name'], 'new')
    feedpath_read = os.path.join(base_directory, feed['category'], feed['name'], 'read')
    if not os.path.exists(feedpath_new):
        os.makedirs(feedpath_new)
    if not os.path.exists(feedpath_read):
        os.makedirs(feedpath_read)
    articles = get_articles(feed['url'])
    # Update articles
    articles = get_articles(feed)
    threshold_date = datetime.now() - timedelta(days = max_age)
    for a in articles:
        try:
            date = datetime.fromtimestamp(mktime(a.published_parsed))
            if date > threshold_date:
@@ -171,6 +183,7 @@ def update_feed(feed):
                text = get_article(a, feed['scrape'])
                write_to_file(os.path.join(feedpath_new, filename), text)
                log(' added article "{}"'.format(a.title))
        except Exception as e:
            error('while parsing feed article "{}" from feed "{}": {}'.format(a.title, feed['name'], e))
@@ -193,16 +206,19 @@ def remove_old_articles():
    log(' removed {} articles'.format(count))
# Parse config file
def load_config(filepath):
    global base_directory, max_age, datetime_format, feeds
    global base_directory, max_age, datetime_format, postprocessor, fileending, feeds
    try:
        config = toml.load(filepath)
        base_directory = config['base_directory']
        max_age = config['max_age']
        datetime_format = config['datetime_format']
        postprocessor = config['postprocessor']
        fileending = config['fileending']
        feeds = config['feed']
    except Exception as e:
        error('while parsing config: {}'.format(e))