Compare commits

...

10 commits

5 changed files with 277 additions and 152 deletions


@@ -20,6 +20,7 @@ Use at your own risk!
 - Store articles in categories
 - Delete articles after a few days
+- Filter articles, you're not interested in
 - Distinguish __new__ from __read__ articles
 - Store __loved__ articles forever
 - OPML import
@@ -66,14 +67,18 @@ base_directory = '/home/<user>/rss'
 # Articles older than max_age (days) will be deleted and not be added.
 max_age = 30
+# Date and time format as strftime, to be included in the articles.
+datetime_format = '%d.%m.%Y %H:%M'
 # Postprocessing command of the articles. The article is written to stdin in HTML format and read from stdout.
 postprocessor = 'pandoc -f html -t markdown_strict-raw_html --reference-links --reference-location=document'
 # Fileending for the article files.
 fileending = 'md'
-# Date and time format as strftime to be included in the articles.
-datetime_format = '%d.%m.%Y %H:%M'
+# List of regular expression strings. If any of these matches an lowercase article title, the article won't be saved.
+# E.g. if you wan't to skip news about RSS explicitly, add '(\W|^)rss(\W|$)'.
+filters = []
 # Feeds
 # The category can be empty (''). The feed fill then be stored in the base_directory.
@@ -153,6 +158,6 @@ Just synchronize the base_directory with [Syncthing](https://syncthing.net/), [r
 ## Acknowledgements
-Thanks to all the people, who created the nice libraries this project in based on.
+Thanks to all the people who created the nice software, this project in based on.
 And also thanks to Dieter Steffmann who created the Canterbury font, which is used for the logo.
 You can find it in the `fonts/` directory.


@@ -4,7 +4,7 @@ base_directory = '/home/<user>/rss'
 # Articles older than max_age (days) will be deleted and not be added.
 max_age = 30
-# Date and time format as strftime to be included in the articles.
+# Date and time format as strftime, to be included in the articles.
 datetime_format = '%d.%m.%Y %H:%M'
 # Postprocessing command of the articles. The article is written to stdin in HTML format and read from stdout.
@@ -13,6 +13,10 @@ postprocessor = 'pandoc -f html -t markdown_strict-raw_html+pipe_tables --refere
 # Fileending for the article files.
 fileending = 'md'
+# List of regular expression strings. If any of these matches an lowercase article title, the article won't be saved.
+# E.g. if you wan't to skip news about RSS explicitly, add '(\W|^)rss(\W|$)'.
+filters = []
 # Feeds
 # The category can be empty (''). The feed fill then be stored in the base_directory.
 # The category can also be a path, which will result in subdirectories (e.g. 'technology/hardware').
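For reference, a minimal standalone sketch of how the new `filters` option behaves, mirroring the `re.search(f, a.title.lower())` check added further down in this diff. The patterns and titles here are made-up examples, not part of the commits:

import re

# Hypothetical patterns, matched against the lowercased article title.
filters = [r'(\W|^)rss(\W|$)', r'sponsored']

def is_filtered(title):
    return any(re.search(f, title.lower()) for f in filters)

print(is_filtered("Why RSS still matters"))   # True
print(is_filtered("Sponsored: new gadgets"))  # True
print(is_filtered("Kernel 5.8 released"))     # False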


@@ -1,21 +0,0 @@
-with import <nixpkgs> {};
-
-stdenv.mkDerivation {
-  name = "myPythonEnv";
-  buildInputs = with pkgs; [
-    python37Full
-    python37Packages.virtualenv
-    pandoc
-  ];
-  src = null;
-  shellHook = ''
-    if [ ! -d .venv ]; then
-      python -m venv .venv
-    fi
-    source .venv/bin/activate
-    pip install --upgrade pip
-    if [ -s requirements.txt ]; then
-      pip install -r requirements.txt
-    fi
-  '';
-}

shell.nix (new file, 38 lines)

@@ -0,0 +1,38 @@
+{ pkgs ? import <nixpkgs> {} }:
+
+pkgs.mkShell {
+  name = "python-environment";
+  buildInputs = with pkgs; [
+    pandoc
+    python3
+    python3Packages.virtualenv
+  ];
+  shellHook = ''
+    function log_header {
+      echo -ne "==> \e[32m\e[1m$1\e[0m\n\n"
+    }
+    function log_subheader {
+      echo -ne "--> \e[33m\e[1m$1\e[0m\n\n"
+    }
+    function log {
+      echo -ne " $1\n"
+    }
+    echo ""
+    log_header "python_environment"
+    if [ ! -d .venv ]; then
+      python -m venv .venv
+    fi
+    source .venv/bin/activate
+    log_subheader "upgrading pip"
+    pip install --upgrade pip
+    echo ""
+    if [ -s requirements.txt ]; then
+      log_subheader "found requirements.txt, installing packages"
+      pip install -r requirements.txt
+      echo ""
+    fi
+    log_header "package versions"
+    log "$(python --version)"
+    log "$(pip --version)"
+  '';
+}


@@ -8,7 +8,6 @@ import re
 import requests
 import subprocess
 import sys
-import time
 import toml
 from bs4 import BeautifulSoup
 from datetime import datetime, timedelta
@@ -16,24 +15,25 @@ from readability import Document
 from time import mktime
 from urllib.parse import urlsplit, urlunsplit
-'''
+"""
 Output functions
-'''
+"""
 # Print log message
-def log(text, force = False):
+def log(text, force=False):
     if verbose or force:
-        print('{} | {}'.format(datetime.now().strftime('%d.%m %H:%M'), text))
+        print("{} | {}".format(datetime.now().strftime("%d.%m %H:%M"), text))
 # Print error message and exit
 def error(text):
-    print('{} E {}'.format(datetime.now().strftime('%d.%m %H:%M'), text))
+    print("{} E {}".format(datetime.now().strftime("%d.%m %H:%M"), text))
 # Print spiderss logo
 def print_logo():
-    logo = '''
+    logo = """
 ;:
 .N' ,K:
 ,O .0MWx' lk 0;
@@ -46,54 +46,55 @@ def print_logo():
 ;kddkNKl. XMNkWk, :N0; .'cOW0c. ,lOW0; .:0Nl. okddOW0:. .kdoxXNd,
 WMX
 ;..cc
-    '''
+    """
     print(logo)
-'''
-Utility functions
-'''
-# Get articles of a feed
+"""
+Utility functions
+"""
+# Get articles of a feed
 def get_articles(feed):
-    feed = feedparser.parse(feed['url'])
+    feed = feedparser.parse(feed["url"])
     return feed.entries
 # Write text to file
 def write_to_file(filepath, text):
-    file = open(filepath, 'w')
+    file = open(filepath, "w")
     file.write(text)
     file.close()
-# Get filename from a date and a title
-def get_filename(date, title):
-    # Get date as single block
-    date = date.strftime('%Y%m%d%H%M')
+# Get filename postfix from a title
+def get_filename_postfix(title):
     # Get title as lowercase words concatenated with underscores
-    title = re.sub('[^A-Za-z0-9 ]+', '', title.lower())
-    title = re.sub(' ', '_', title)
-    return '{}_{}.{}'.format(date, title, fileending)
+    title = re.sub("[^A-Za-z0-9 ]+", "", title.lower())
+    title = re.sub(" ", "_", title)
+    return "{}.{}".format(title, fileending)
-# If scraped, use first content image as fallback
-# Get image snippet for an article
-def get_article_image(article):
+# Get HTML image snippet from the first image url in a text
+def get_image_snippet(text):
     try:
-        image_url = re.search('(?P<image>https?://\S+(\.png|\.jpg|\.jpeg))', str(article), re.IGNORECASE).group('image')
+        image_url = re.search(
+            "(?P<image>https?://\S+(\.png|\.jpg|\.jpeg))", text, re.IGNORECASE
+        ).group("image")
         return '<img src="{}" alt="Image">\n\n'.format(image_url)
-    except:
-        return ''
+    except Exception:
+        return ""
-# Get summary snippet for an article
-def get_article_summary(article):
+# Get HTML summary snippet from a HTML text
+def get_summary_snippet(text):
     try:
         h = html2text.HTML2Text()
@@ -101,60 +102,82 @@ def get_article_summary(article):
         h.ignore_links = True
         h.ignore_images = True
         h.body_width = 0
-        return '<p><b>{}</b></p>\n\n'.format(summary)
-    except:
-        return ''
+        summary = h.handle(text).split("\n\n")[0].strip()
+        return "<p><b>{}</b></p>\n\n".format(summary)
+    except Exception:
+        return ""
 # Get article body either from web or its content
-def get_article_body(article, scrape):
-    body = ''
-    # TODO: Include appropriate header?
+def get_article_body(article, feed):
+    body = ""
     # If scrape, get article with readability
-    if scrape:
-        response = requests.get(article.link)
+    if feed["scrape"]:
+        headers = {
+            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36"
+        }
+        response = requests.get(article.link, headers=headers)
         doc = Document(response.text)
         body = doc.summary()
-        # Replace relative site links with absolute ones, using beautifulsoup
-        splitted_url = urlsplit(article.link)
-        soup = BeautifulSoup(body, features = 'lxml')
-        for img in soup.find_all('img', src = True):
-            src = img.get('src')
-            splitted_src = urlsplit(src)
-            constructed_src = [splitted_src.scheme, splitted_src.netloc, splitted_src.path, splitted_src.query, splitted_src.fragment]
-            if constructed_src[0] == '':
-                constructed_src[0] = splitted_url.scheme
-            if constructed_src[1] == '':
-                constructed_src[1] = splitted_url.netloc
-            new_src = urlunsplit(constructed_src)
-            if new_src.startswith('http'):
-                body = body.replace('"{}"'.format(src), '"{}"'.format(new_src), 1)
-        # TODO: catch mailto:
-        for a in soup.find_all('a', href = True):
-            href = a.get('href')
-            splitted_href = urlsplit(href)
-            constructed_href = [splitted_href.scheme, splitted_href.netloc, splitted_href.path, splitted_href.query, splitted_href.fragment]
-            if constructed_href[0] == '':
-                constructed_href[0] = splitted_url.scheme
-            if constructed_href[1] == '':
-                constructed_href[1] = splitted_url.netloc
-            new_href = urlunsplit(constructed_href)
-            if new_href.startswith('http'):
-                body = body.replace('"{}"'.format(href), '"{}"'.format(new_href), 1)
-    # Else construct from article content
+    # Else construct from article object
     else:
-        if hasattr(article, 'content'):
+        # Add all content to body
+        if hasattr(article, "content"):
             for c in article.content:
-                if c.type == 'text/html':
+                if c.type == "text/html" or c.type == "text/plain":
                     body += c.value
+        # Use summary as fallback
+        elif hasattr(article, "summary"):
+            body += article.summary
+    # Replace relative links with absolute ones, using beautifulsoup
+    try:
+        splitted_url = urlsplit(article.link)
+    except Exception:
+        splitted_url = urlsplit(feed["url"])
+    soup = BeautifulSoup(body, features="lxml")
+    for img in soup.find_all("img", src=True):
+        src = img.get("src")
+        splitted_src = urlsplit(src)
+        constructed_src = [
+            splitted_src.scheme,
+            splitted_src.netloc,
+            splitted_src.path,
+            splitted_src.query,
+            splitted_src.fragment,
+        ]
+        if constructed_src[0] == "":
+            constructed_src[0] = splitted_url.scheme
+        if constructed_src[1] == "":
+            constructed_src[1] = splitted_url.netloc
+        new_src = urlunsplit(constructed_src)
+        if new_src.startswith("http"):
+            body = body.replace('"{}"'.format(src), '"{}"'.format(new_src), 1)
+    for a in soup.find_all("a", href=True):
+        href = a.get("href")
+        splitted_href = urlsplit(href)
+        constructed_href = [
+            splitted_href.scheme,
+            splitted_href.netloc,
+            splitted_href.path,
+            splitted_href.query,
+            splitted_href.fragment,
+        ]
+        if constructed_href[0] == "":
+            constructed_href[0] = splitted_url.scheme
+        if constructed_href[1] == "":
+            constructed_href[1] = splitted_url.netloc
+        new_href = urlunsplit(constructed_href)
+        if new_href.startswith("http"):
+            body = body.replace('"{}"'.format(href), '"{}"'.format(new_href), 1)
     return body
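The hunk above moves the relative-to-absolute link rewriting out of the scrape-only branch so it now runs on every article body. A condensed, standalone sketch of the same urlsplit/urlunsplit technique; the page URL and HTML snippet are invented for illustration, and it assumes beautifulsoup4 and lxml, which the project already depends on:

from urllib.parse import urlsplit, urlunsplit
from bs4 import BeautifulSoup

page_url = "https://example.org/blog/post"
html = '<p><img src="/images/cat.png"><a href="//cdn.example.org/doc.pdf">doc</a></p>'

base = urlsplit(page_url)
soup = BeautifulSoup(html, features="lxml")
for tag, attr in (("img", "src"), ("a", "href")):
    for node in soup.find_all(tag, **{attr: True}):
        parts = list(urlsplit(node[attr]))
        if parts[0] == "":  # missing scheme
            parts[0] = base.scheme
        if parts[1] == "":  # missing host
            parts[1] = base.netloc
        node[attr] = urlunsplit(parts)
print(soup)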
@@ -163,32 +186,53 @@ def get_article_body(article, scrape):
 def postprocess(text):
     try:
-        processor = subprocess.Popen(postprocessor.split(' '), stdin = subprocess.PIPE, stdout = subprocess.PIPE, stderr = subprocess.PIPE)
-        (output, err) = processor.communicate(input = text.encode())
+        processor = subprocess.Popen(
+            postprocessor.split(" "),
+            stdin=subprocess.PIPE,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+        )
+        (output, err) = processor.communicate(input=text.encode())
         if err:
             raise Exception(err.decode().strip())
     except Exception as e:
-        error(' while postprocessing: {}'.format(e))
+        error(" while postprocessing: {}".format(e))
         sys.exit(1)
     return output.decode().strip()
 # Get constructed article
-def get_article(article, scrape):
-    # Construct head of article
-    image = get_article_image(article)
-    summary = get_article_summary(article)
-    #TODO: Current time as fallback?
-    date = datetime.fromtimestamp(mktime(article.published_parsed)).strftime(datetime_format)
-    head = '<h1>{}</h1>\n\n{}{}<p>{} - <a href={}>Link</a></p>'.format(article.title, image, summary, date, article.link)
+def get_article(article, feed):
     # Get body of article
-    body = get_article_body(article, scrape)
+    body = get_article_body(article, feed)
+    # Construct head of article
+    image = get_image_snippet(str(article))
+    if image == "":
+        image = get_image_snippet(body)
+    summary = get_summary_snippet(article.summary)
+    if summary == "":
+        summary = get_summary_snippet(body)
+    try:
+        date = datetime.fromtimestamp(mktime(article.published_parsed)).strftime(
+            datetime_format
+        )
+    except Exception:
+        date = datetime.now().strftime(datetime_format)
+    try:
+        link = article.link
+    except Exception:
+        splitted_url = urlsplit(feed["url"])
+        splitted_link = [splitted_url.scheme, splitted_url.netloc, "", "", ""]
+        link = urlunsplit(splitted_link)
+    head = "<h1>{}</h1>\n\n{}{}<p>{} - <a href={}>Link</a></p>".format(
+        article.title, image, summary, date, link
+    )
     # Postprocess article
-    article_text = postprocess('{}\n\n<hr>\n\n{}'.format(head, body)).strip()
+    article_text = postprocess("{}\n\n<hr>\n\n{}".format(head, body)).strip()
     return article_text
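The postprocess() helper above simply pipes the assembled HTML through the configured command via stdin/stdout. A hedged, standalone sketch of that pipe, handy for trying out a postprocessor string outside the crawler; it assumes pandoc is on PATH, and the sample HTML is made up:

import subprocess

# Same pandoc command as in the example config.
postprocessor = "pandoc -f html -t markdown_strict-raw_html --reference-links --reference-location=document"

def run_postprocessor(html):
    processor = subprocess.Popen(
        postprocessor.split(" "),
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    output, err = processor.communicate(input=html.encode())
    if err:
        raise RuntimeError(err.decode().strip())
    return output.decode().strip()

print(run_postprocessor('<h1>Title</h1><p>A <a href="https://example.org">link</a>.</p>'))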
@@ -196,105 +240,160 @@ def get_article(article, scrape):
 # Update feed
 def update_feed(feed):
-    log(' updating feed "{}"'.format(feed['name']))
+    log(' updating feed "{}"'.format(feed["name"]))
     # Set feedpaths
-    feedpath_new = os.path.join(base_directory, feed['category'], feed['name'], 'new')
-    feedpath_read = os.path.join(base_directory, feed['category'], feed['name'], 'read')
+    feedpath_new = os.path.join(base_directory, feed["category"], feed["name"], "new")
+    feedpath_read = os.path.join(base_directory, feed["category"], feed["name"], "read")
     if not os.path.exists(feedpath_new):
         os.makedirs(feedpath_new)
     if not os.path.exists(feedpath_read):
         os.makedirs(feedpath_read)
+    # Get exisiting articles
+    existing_articles = (
+        os.listdir(feedpath_new) + os.listdir(feedpath_read) + os.listdir(lovedpath)
+    )
     # Update articles
     articles = get_articles(feed)
-    threshold_date = datetime.now() - timedelta(days = max_age)
+    threshold_date = datetime.now() - timedelta(days=max_age)
+    if len(articles) == 0:
+        error('no articles received from feed "{}"'.format(feed["name"]))
     for a in articles:
         try:
-            date = datetime.fromtimestamp(mktime(a.published_parsed))
+            # Set fallback if no parseable date found
+            fallback = False
+            try:
+                date = datetime.fromtimestamp(mktime(a.published_parsed))
+            except Exception:
+                date = datetime.now()
+                fallback = True
             if date > threshold_date:
-                filename = get_filename(date, a.title)
-                if not os.path.exists(os.path.join(feedpath_new, filename)) and not os.path.exists(os.path.join(feedpath_read, filename)):
-                    text = get_article(a, feed['scrape'])
-                    write_to_file(os.path.join(feedpath_new, filename), text)
-                    log(' added article "{}"'.format(a.title))
+                # Check if article should be filtered
+                filter = False
+                for f in filters:
+                    if re.search(f, a.title.lower()):
+                        filter = True
+                        log(' filtered article "{}"'.format(a.title))
+                if not filter:
+                    # Construct filename
+                    filename_prefix = date.strftime("%Y%m%d%H%M")
+                    filename_postfix = get_filename_postfix(a.title)
+                    filename = "{}_{}".format(filename_prefix, filename_postfix)
+                    # Check if article exists
+                    article_exists = False
+                    if fallback:
+                        existing_articles_fallback = [a[13:] for a in existing_articles]
+                        if filename_postfix in existing_articles_fallback:
+                            article_exists = True
+                    elif filename in existing_articles:
+                        article_exists = True
+                    if not article_exists:
+                        text = get_article(a, feed)
+                        write_to_file(os.path.join(feedpath_new, filename), text)
+                        log(' added article "{}"'.format(a.title))
         except Exception as e:
-            error('while parsing feed article "{}" from feed "{}": {}'.format(a.title, feed['name'], e))
+            error(
+                'while parsing article "{}" from feed "{}": {}'.format(
+                    a.title, feed["name"], e
+                )
+            )
 # Delete articles older than max_age
 def remove_old_articles():
-    threshold_date = datetime.now() - timedelta(days = max_age)
+    threshold_date = datetime.now() - timedelta(days=max_age)
     count = 0
     for subdir, dirs, files in os.walk(base_directory):
         # Skip 'loved' directory
-        if not os.path.join(base_directory, 'loved') in subdir:
+        if not os.path.join(base_directory, "loved") in subdir:
             for file in files:
-                date = datetime.strptime(file[:12], '%Y%m%d%H%M')
+                date = datetime.strptime(file[:12], "%Y%m%d%H%M")
                 if threshold_date > date:
                     os.remove(os.path.join(subdir, file))
                     count += 1
-    log(' removed {} articles'.format(count))
+    log(" removed {} articles".format(count))
 # Parse config file
 def load_config(filepath):
-    global base_directory, max_age, datetime_format, postprocessor, fileending, feeds
+    global base_directory, max_age, datetime_format, postprocessor, fileending, filters, feeds
     try:
         config = toml.load(filepath)
-        base_directory = config['base_directory']
-        max_age = config['max_age']
-        datetime_format = config['datetime_format']
-        postprocessor = config['postprocessor']
-        fileending = config['fileending']
-        feeds = config['feed']
+        base_directory = config["base_directory"]
+        max_age = config["max_age"]
+        datetime_format = config["datetime_format"]
+        postprocessor = config["postprocessor"]
+        fileending = config["fileending"]
+        filters = config["filters"]
+        feeds = config["feed"]
     except Exception as e:
-        error('while parsing config: {}'.format(e))
+        error("while parsing config: {}".format(e))
         sys.exit(1)
 # Initialize spiderss
 def initialize():
+    global lovedpath
     # Create 'loved' directory if not existent
-    lovedpath = os.path.join(base_directory, 'loved')
+    lovedpath = os.path.join(base_directory, "loved")
     if not os.path.exists(lovedpath):
         os.makedirs(lovedpath)
-# Update all feeds and delete old messages
+# Update all feeds and remove old articles
 def crawl():
-    log('crawling feeds', True)
+    log("crawling feeds", True)
     for feed in feeds:
         update_feed(feed)
-    log('removing old articles', True)
+    log("removing old articles", True)
     remove_old_articles()
-'''
+"""
 Main
-'''
+"""
 def main():
     global verbose
     # Initialize parser
-    parser = argparse.ArgumentParser(description = 'Crawl RSS feeds and store articles as Markdown files.')
-    parser.add_argument('-v', '--verbose', action = 'store_true', help = 'verbose output')
-    parser.add_argument('-c', '--config', default = './config.toml', help = 'config file (default: ./config.toml)')
+    parser = argparse.ArgumentParser(
+        description="Crawl RSS feeds and store articles as Markdown files."
+    )
+    parser.add_argument("-v", "--verbose", action="store_true", help="verbose output")
+    parser.add_argument(
+        "-c",
+        "--config",
+        default="./config.toml",
+        help="config file (default: ./config.toml)",
+    )
     # Get args
     args = parser.parse_args()
@@ -308,5 +407,5 @@ def main():
     crawl()
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
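Finally, the update_feed() changes above replace the per-file os.path.exists() check with a lookup against the filenames already on disk, and fall back to comparing only the title part when a feed supplies no parseable date. A small sketch of that duplicate check, using an invented existing filename of the form '<YYYYmmddHHMM>_<slug>.md':

import re

fileending = "md"

def get_filename_postfix(title):
    # Same slug rule as in the diff: lowercase, strip punctuation, spaces -> underscores.
    title = re.sub("[^A-Za-z0-9 ]+", "", title.lower())
    title = re.sub(" ", "_", title)
    return "{}.{}".format(title, fileending)

existing_articles = ["202005171230_some_article.md"]  # invented example
postfix = get_filename_postfix("Some Article!")

# Fallback case (no feed date): drop the 13-character date prefix 'YYYYmmddHHMM_'
# and compare postfixes only, so a re-crawl does not store the article twice.
print(postfix in [name[13:] for name in existing_articles])  # True -> already stored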