diff --git a/README.md b/README.md
index a8d3532..92ec978 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
 
 ![spiderss logo](images/logo.png)
 
-__spiderss__ is a plaintext RSS crawler, based on [feedparser](https://github.com/kurtmckee/feedparser), [python-readability](https://github.com/buriy/python-readability) and [html2text](https://github.com/Alir3z4/html2text).
+__spiderss__ is a plaintext RSS crawler, based on [feedparser](https://github.com/kurtmckee/feedparser), [python-readability](https://github.com/buriy/python-readability), [html2text](https://github.com/Alir3z4/html2text) and [Pandoc](https://pandoc.org/).
 Actually, it's just a python script.
 
 Read the news you want, the way you want it.
@@ -33,7 +33,7 @@ Call `nix-shell` in the project directory. This will drop you into a python envi
 
 ### Legacy OS
 
-Install the requirements with `pip install -r requirements.txt`.
+Install **Pandoc** and the python requirements with `pip install -r requirements.txt`.
 
 ### Android
 
diff --git a/default.nix b/default.nix
index f1f781e..1c000b4 100644
--- a/default.nix
+++ b/default.nix
@@ -5,6 +5,7 @@ stdenv.mkDerivation {
   buildInputs = with pkgs; [
     python37Full
     python37Packages.virtualenv
+    pandoc
   ];
   src = null;
   shellHook = ''
diff --git a/spiderss.py b/spiderss.py
index 3b2889a..1c0dc06 100755
--- a/spiderss.py
+++ b/spiderss.py
@@ -6,6 +6,7 @@ import html2text
 import os
 import re
 import requests
+import subprocess
 import sys
 import time
 import toml
@@ -79,10 +80,11 @@ def get_articles(feed_url):
 
 
 # Write text to file
-def write_to_file(filename, text):
-    file = open(filename, 'w')
-    file.write(text)
-    file.close()
+def write_to_file(filepath, text):
+
+    # Postprocess article with pandoc and write to file
+    pandoc = subprocess.Popen(['pandoc', '-f', 'markdown', '-t', 'markdown', '-o', filepath], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
+    pandoc.communicate(input = text.encode())
 
 
 # Get filename from a date and a title
@@ -129,7 +131,7 @@ def get_article(article, scrape):
     # Construct head of article
     image_url = get_article_image(article)
     date = datetime.fromtimestamp(mktime(article.published_parsed)).strftime(datetime_format)
-    head = '# {}\n\n{}{}{} - [Link]({})'.format(article.title, image_url, get_article_summary(article), date, article.link)
+    head = '# {}\n\n{}{}{}\n\n[Link]({})'.format(article.title, image_url, get_article_summary(article), date, article.link)
 
     # Get body of article
     if scrape:
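
Note on the new `write_to_file`: the sketch below shows, standalone, what the patched helper does, assuming the `pandoc` binary is on PATH. The article's markdown is piped to pandoc's stdin (`-f markdown -t markdown`), and pandoc writes the normalized markdown to the target file via `-o`. The usage path at the bottom is purely hypothetical.

```python
import subprocess

def write_to_file(filepath, text):
    # Pipe the article's markdown through pandoc (markdown -> markdown)
    # so the output is normalized; pandoc writes the result to `filepath`.
    pandoc = subprocess.Popen(
        ['pandoc', '-f', 'markdown', '-t', 'markdown', '-o', filepath],
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
    )
    pandoc.communicate(input=text.encode())

# Hypothetical usage: write a small article to /tmp/example.md
write_to_file('/tmp/example.md', '# Example\n\nSome *markdown* text.\n')
```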