add pandoc as post processor

2020-04-16 21:40:53 +02:00 · 2020-04-16 21:40:53 +02:00 · 6ccec68923
commit 6ccec68923
parent 6c622bce1f
3 changed files with 10 additions and 7 deletions
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@

 ![spiderss logo](images/logo.png)

-__spiderss__ is a plaintext RSS crawler, based on [feedparser](https://github.com/kurtmckee/feedparser), [python-readability](https://github.com/buriy/python-readability) and [html2text](https://github.com/Alir3z4/html2text).
+__spiderss__ is a plaintext RSS crawler, based on [feedparser](https://github.com/kurtmckee/feedparser), [python-readability](https://github.com/buriy/python-readability), [html2text](https://github.com/Alir3z4/html2text) and [Pandoc](https://pandoc.org/).
 Actually, it's just a python script.

 Read the news you want, the way you want it.
@@ -33,7 +33,7 @@ Call `nix-shell` in the project directory. This will drop you into a python envi

 ### Legacy OS

-Install the requirements with `pip install -r requirements.txt`.
+Install **Pandoc**, then install the Python requirements with `pip install -r requirements.txt`.

 ### Android

--- a/default.nix
+++ b/default.nix
@@ -5,6 +5,7 @@ stdenv.mkDerivation {
  buildInputs = with pkgs; [
    python37Full
    python37Packages.virtualenv
+    pandoc
  ];
  src = null;
  shellHook = ''
--- a/spiderss.py
+++ b/spiderss.py
@@ -6,6 +6,7 @@ import html2text
 import os
 import re
 import requests
+import subprocess
 import sys
 import time
 import toml
@@ -79,10 +80,11 @@ def get_articles(feed_url):


 # Write text to file
-def write_to_file(filename, text):
-    file = open(filename, 'w')
-    file.write(text)
-    file.close()
+def write_to_file(filepath, text):
+
+    # Postprocess article with pandoc and write to file
+    pandoc = subprocess.Popen(['pandoc', '-f', 'markdown', '-t', 'markdown', '-o', filepath], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
+    pandoc.communicate(input = text.encode())


 # Get filename from a date and a title
@@ -129,7 +131,7 @@ def get_article(article, scrape):
    # Construct head of article
    image_url = get_article_image(article)
    date = datetime.fromtimestamp(mktime(article.published_parsed)).strftime(datetime_format)
-    head = '# {}\n\n{}{}{} - [Link]({})'.format(article.title, image_url, get_article_summary(article), date, article.link)
+    head = '# {}\n\n{}{}{}\n\n[Link]({})'.format(article.title, image_url, get_article_summary(article), date, article.link)

    # Get body of article
    if scrape: