update Nix file

Blacken code
add comma
2021-03-11 18:52:46 +01:00 · 2020-08-29 11:45:59 +02:00 · 2020-04-26 20:29:03 +02:00 · 2020-04-26 20:04:13 +02:00 · 2020-04-26 19:58:47 +02:00 · 2020-04-24 18:51:20 +02:00
5 changed files with 277 additions and 152 deletions
--- a/README.md
+++ b/README.md
@ -20,6 +20,7 @@ Use at your own risk!

 - Store articles in categories
 - Delete articles after a few days
+- Filter out articles you're not interested in
 - Distinguish __new__ from __read__ articles
 - Store __loved__ articles forever
 - OPML import
@ -66,14 +67,18 @@ base_directory = '/home/<user>/rss'
 # Articles older than max_age (days) will be deleted and not be added.
 max_age = 30

+# Date and time format as strftime, to be included in the articles.
+datetime_format = '%d.%m.%Y %H:%M'
+
 # Postprocessing command of the articles. The article is written to stdin in HTML format and read from stdout.
 postprocessor = 'pandoc -f html -t markdown_strict-raw_html --reference-links --reference-location=document'

 # Fileending for the article files.
 fileending = 'md'

-# Date and time format as strftime to be included in the articles.
-datetime_format = '%d.%m.%Y %H:%M'
+# List of regular expression strings. If any of these matches the lowercased article title, the article won't be saved.
+# E.g. if you want to skip news about RSS explicitly, add '(\W|^)rss(\W|$)'.
+filters = []

 # Feeds
 # The category can be empty (''). The feed will then be stored in the base_directory.
@ -153,6 +158,6 @@ Just synchronize the base_directory with [Syncthing](https://syncthing.net/), [r

 ## Acknowledgements

-Thanks to all the people, who created the nice libraries this project in based on.
+Thanks to all the people who created the nice software this project is based on.
 And also thanks to Dieter Steffmann who created the Canterbury font, which is used for the logo.
 You can find it in the `fonts/` directory.
--- a/config.toml
+++ b/config.toml
@ -4,7 +4,7 @@ base_directory = '/home/<user>/rss'
 # Articles older than max_age (days) will be deleted and not be added.
 max_age = 30

-# Date and time format as strftime to be included in the articles.
+# Date and time format as strftime, to be included in the articles.
 datetime_format = '%d.%m.%Y %H:%M'

 # Postprocessing command of the articles. The article is written to stdin in HTML format and read from stdout.
@ -13,6 +13,10 @@ postprocessor = 'pandoc -f html -t markdown_strict-raw_html+pipe_tables --refere
 # Fileending for the article files.
 fileending = 'md'

+# List of regular expression strings. If any of these matches the lowercased article title, the article won't be saved.
+# E.g. if you want to skip news about RSS explicitly, add '(\W|^)rss(\W|$)'.
+filters = []
+
 # Feeds
 # The category can be empty (''). The feed will then be stored in the base_directory.
 # The category can also be a path, which will result in subdirectories (e.g. 'technology/hardware').
--- a/default.nix
+++ b/default.nix
@ -1,21 +0,0 @@
-with import <nixpkgs> {};
-
-stdenv.mkDerivation {
-  name = "myPythonEnv";
-  buildInputs = with pkgs; [
-    python37Full
-    python37Packages.virtualenv
-    pandoc
-  ];
-  src = null;
-  shellHook = ''
-    if [ ! -d .venv ]; then
-      python -m venv .venv
-    fi
-    source .venv/bin/activate
-    pip install --upgrade pip
-    if [ -s requirements.txt ]; then
-      pip install -r requirements.txt
-    fi
-  '';
-}
--- a/shell.nix
+++ b/shell.nix
@ -0,0 +1,38 @@
+{ pkgs ? import <nixpkgs> {} }:
+pkgs.mkShell {
+  name = "python-environment";
+  buildInputs = with pkgs; [
+    pandoc
+    python3
+    python3Packages.virtualenv
+  ];
+  shellHook = ''
+    function log_header {
+      echo -ne "==> \e[32m\e[1m$1\e[0m\n\n"
+    }
+    function log_subheader {
+      echo -ne "--> \e[33m\e[1m$1\e[0m\n\n"
+    }
+    function log {
+      echo -ne "    $1\n"
+    }
+
+    echo ""
+    log_header "python_environment"
+    if [ ! -d .venv ]; then
+      python -m venv .venv
+    fi
+    source .venv/bin/activate
+    log_subheader "upgrading pip"
+    pip install --upgrade pip
+    echo ""
+    if [ -s requirements.txt ]; then
+      log_subheader "found requirements.txt, installing packages"
+      pip install -r requirements.txt
+      echo ""
+    fi
+    log_header "package versions"
+    log "$(python --version)"
+    log "$(pip --version)"
+  '';
+}
--- a/spiderss.py
+++ b/spiderss.py
@ -8,7 +8,6 @@ import re
 import requests
 import subprocess
 import sys
-import time
 import toml
 from bs4 import BeautifulSoup
 from datetime import datetime, timedelta
@ -16,24 +15,25 @@ from readability import Document
 from time import mktime
 from urllib.parse import urlsplit, urlunsplit

-'''
+"""
 Output functions
-'''
+"""
+

 # Print log message
-def log(text, force = False):
+def log(text, force=False):
    if verbose or force:
-        print('{} | {}'.format(datetime.now().strftime('%d.%m %H:%M'), text))
+        print("{} | {}".format(datetime.now().strftime("%d.%m %H:%M"), text))


 # Print error message and exit
 def error(text):
-    print('{} E {}'.format(datetime.now().strftime('%d.%m %H:%M'), text))
+    print("{} E {}".format(datetime.now().strftime("%d.%m %H:%M"), text))


 # Print spiderss logo
 def print_logo():
-    logo = '''
+    logo = """
                          ;:                                                    
                         .N' ,K:                                                
         ,O                  .0MWx'                               lk         0; 
@ -46,54 +46,55 @@ def print_logo():
 ;kddkNKl.   XMNkWk,    :N0;  .'cOW0c.   ,lOW0;   .:0Nl.  okddOW0:. .kdoxXNd,   
             WMX                                                                
            ;..cc                                                               
-    '''
+    """

    print(logo)

-'''
-Utility functions
-'''

-# Get articles of a feed 
+"""
+Utility functions
+"""
+
+
+# Get articles of a feed
 def get_articles(feed):
-    feed = feedparser.parse(feed['url'])
+
+    feed = feedparser.parse(feed["url"])
    return feed.entries


 # Write text to file
 def write_to_file(filepath, text):

-    file = open(filepath, 'w')
+    file = open(filepath, "w")
    file.write(text)
    file.close()


-# Get filename from a date and a title
-def get_filename(date, title):
-
-    # Get date as single block
-    date = date.strftime('%Y%m%d%H%M')
+# Get filename postfix from a title
+def get_filename_postfix(title):

    # Get title as lowercase words concatenated with underscores
-    title = re.sub('[^A-Za-z0-9 ]+', '', title.lower())
-    title = re.sub(' ', '_', title)
-    
-    return '{}_{}.{}'.format(date, title, fileending)
+    title = re.sub("[^A-Za-z0-9 ]+", "", title.lower())
+    title = re.sub(" ", "_", title)
+
+    return "{}.{}".format(title, fileending)


-# If scraped, use first content image as fallback
-# Get image snippet for an article
-def get_article_image(article):
-    
+# Get HTML image snippet from the first image url in a text
+def get_image_snippet(text):
+
    try:
-        image_url = re.search('(?P<image>https?://\S+(\.png|\.jpg|\.jpeg))', str(article), re.IGNORECASE).group('image')
+        image_url = re.search(
+            "(?P<image>https?://\S+(\.png|\.jpg|\.jpeg))", text, re.IGNORECASE
+        ).group("image")
        return '<img src="{}" alt="Image">\n\n'.format(image_url)
-    except:
-        return ''
+    except Exception:
+        return ""


-# Get summary snippet for an article
-def get_article_summary(article):
+# Get HTML summary snippet from a HTML text
+def get_summary_snippet(text):

    try:
        h = html2text.HTML2Text()
@ -101,60 +102,82 @@ def get_article_summary(article):
        h.ignore_links = True
        h.ignore_images = True
        h.body_width = 0
-        return '<p><b>{}</b></p>\n\n'.format(summary)
-    except:
-        return ''
+        summary = h.handle(text).split("\n\n")[0].strip()
+        return "<p><b>{}</b></p>\n\n".format(summary)
+    except Exception:
+        return ""


 # Get article body either from web or its content
-def get_article_body(article, scrape):
+def get_article_body(article, feed):

-    body = ''
+    body = ""

-    # TODO: Include appropriate header?
    # If scrape, get article with readability
-    if scrape:
+    if feed["scrape"]:

-        response = requests.get(article.link)
+        headers = {
+            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36"
+        }
+        response = requests.get(article.link, headers=headers)
        doc = Document(response.text)
        body = doc.summary()

-        # Replace relative site links with absolute ones, using beautifulsoup
-        splitted_url = urlsplit(article.link)
-        soup = BeautifulSoup(body, features = 'lxml')
-        for img in soup.find_all('img', src = True):
-            src = img.get('src')
-            splitted_src = urlsplit(src)
-            constructed_src = [splitted_src.scheme, splitted_src.netloc, splitted_src.path, splitted_src.query, splitted_src.fragment]
-            if constructed_src[0] == '':
-                constructed_src[0] = splitted_url.scheme
-            if constructed_src[1] == '':
-                constructed_src[1] = splitted_url.netloc
-            new_src = urlunsplit(constructed_src)
-            if new_src.startswith('http'):
-                body = body.replace('"{}"'.format(src), '"{}"'.format(new_src), 1)
-            
-        # TODO: catch mailto:
-        for a in soup.find_all('a', href = True):
-            href = a.get('href')
-            splitted_href = urlsplit(href)
-            constructed_href = [splitted_href.scheme, splitted_href.netloc, splitted_href.path, splitted_href.query, splitted_href.fragment]
-            if constructed_href[0] == '':
-                constructed_href[0] = splitted_url.scheme
-            if constructed_href[1] == '':
-                constructed_href[1] = splitted_url.netloc
-            new_href = urlunsplit(constructed_href)
-            if new_href.startswith('http'):
-                body = body.replace('"{}"'.format(href), '"{}"'.format(new_href), 1)
-            
-
-    # Else construct from article content
+    # Else construct from article object
    else:
-        
-        if hasattr(article, 'content'):
+
+        # Add all content to body
+        if hasattr(article, "content"):
            for c in article.content:
-                if c.type == 'text/html':
+                if c.type == "text/html" or c.type == "text/plain":
                    body += c.value
+        # Use summary as fallback
+        elif hasattr(article, "summary"):
+            body += article.summary
+
+    # Replace relative links with absolute ones, using beautifulsoup
+    try:
+        splitted_url = urlsplit(article.link)
+    except Exception:
+        splitted_url = urlsplit(feed["url"])
+
+    soup = BeautifulSoup(body, features="lxml")
+
+    for img in soup.find_all("img", src=True):
+        src = img.get("src")
+        splitted_src = urlsplit(src)
+        constructed_src = [
+            splitted_src.scheme,
+            splitted_src.netloc,
+            splitted_src.path,
+            splitted_src.query,
+            splitted_src.fragment,
+        ]
+        if constructed_src[0] == "":
+            constructed_src[0] = splitted_url.scheme
+        if constructed_src[1] == "":
+            constructed_src[1] = splitted_url.netloc
+        new_src = urlunsplit(constructed_src)
+        if new_src.startswith("http"):
+            body = body.replace('"{}"'.format(src), '"{}"'.format(new_src), 1)
+
+    for a in soup.find_all("a", href=True):
+        href = a.get("href")
+        splitted_href = urlsplit(href)
+        constructed_href = [
+            splitted_href.scheme,
+            splitted_href.netloc,
+            splitted_href.path,
+            splitted_href.query,
+            splitted_href.fragment,
+        ]
+        if constructed_href[0] == "":
+            constructed_href[0] = splitted_url.scheme
+        if constructed_href[1] == "":
+            constructed_href[1] = splitted_url.netloc
+        new_href = urlunsplit(constructed_href)
+        if new_href.startswith("http"):
+            body = body.replace('"{}"'.format(href), '"{}"'.format(new_href), 1)

    return body

@ -163,32 +186,53 @@ def get_article_body(article, scrape):
 def postprocess(text):

    try:
-        processor = subprocess.Popen(postprocessor.split(' '), stdin = subprocess.PIPE, stdout = subprocess.PIPE, stderr = subprocess.PIPE)
-        (output, err) = processor.communicate(input = text.encode())
+        processor = subprocess.Popen(
+            postprocessor.split(" "),
+            stdin=subprocess.PIPE,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+        )
+        (output, err) = processor.communicate(input=text.encode())
        if err:
            raise Exception(err.decode().strip())
    except Exception as e:
-        error('    while postprocessing: {}'.format(e))
+        error("    while postprocessing: {}".format(e))
        sys.exit(1)

    return output.decode().strip()


 # Get constructed article
-def get_article(article, scrape):
-
-    # Construct head of article
-    image = get_article_image(article)
-    summary = get_article_summary(article)
-    #TODO: Current time as fallback?
-    date = datetime.fromtimestamp(mktime(article.published_parsed)).strftime(datetime_format)
-    head = '<h1>{}</h1>\n\n{}{}<p>{} - <a href={}>Link</a></p>'.format(article.title, image, summary, date, article.link)
+def get_article(article, feed):

    # Get body of article
-    body = get_article_body(article, scrape)
+    body = get_article_body(article, feed)
+
+    # Construct head of article
+    image = get_image_snippet(str(article))
+    if image == "":
+        image = get_image_snippet(body)
+    summary = get_summary_snippet(article.summary)
+    if summary == "":
+        summary = get_summary_snippet(body)
+    try:
+        date = datetime.fromtimestamp(mktime(article.published_parsed)).strftime(
+            datetime_format
+        )
+    except Exception:
+        date = datetime.now().strftime(datetime_format)
+    try:
+        link = article.link
+    except Exception:
+        splitted_url = urlsplit(feed["url"])
+        splitted_link = [splitted_url.scheme, splitted_url.netloc, "", "", ""]
+        link = urlunsplit(splitted_link)
+    head = "<h1>{}</h1>\n\n{}{}<p>{} - <a href={}>Link</a></p>".format(
+        article.title, image, summary, date, link
+    )

    # Postprocess article
-    article_text = postprocess('{}\n\n<hr>\n\n{}'.format(head, body)).strip()
+    article_text = postprocess("{}\n\n<hr>\n\n{}".format(head, body)).strip()

    return article_text

@ -196,105 +240,160 @@ def get_article(article, scrape):
 # Update feed
 def update_feed(feed):

-    log('  updating feed "{}"'.format(feed['name']))
+    log('  updating feed "{}"'.format(feed["name"]))

    # Set feedpaths
-    feedpath_new = os.path.join(base_directory, feed['category'], feed['name'], 'new')
-    feedpath_read = os.path.join(base_directory, feed['category'], feed['name'], 'read')
-    
+    feedpath_new = os.path.join(base_directory, feed["category"], feed["name"], "new")
+    feedpath_read = os.path.join(base_directory, feed["category"], feed["name"], "read")
+
    if not os.path.exists(feedpath_new):
        os.makedirs(feedpath_new)
-        
+
    if not os.path.exists(feedpath_read):
        os.makedirs(feedpath_read)

+    # Get exisiting articles
+    existing_articles = (
+        os.listdir(feedpath_new) + os.listdir(feedpath_read) + os.listdir(lovedpath)
+    )
+
    # Update articles
    articles = get_articles(feed)
-    threshold_date = datetime.now() - timedelta(days = max_age)
-    
+    threshold_date = datetime.now() - timedelta(days=max_age)
+
+    if len(articles) == 0:
+        error('no articles received from feed "{}"'.format(feed["name"]))
+
    for a in articles:
-        
+
        try:
-            date = datetime.fromtimestamp(mktime(a.published_parsed))
+
+            # Set fallback if no parseable date found
+            fallback = False
+            try:
+                date = datetime.fromtimestamp(mktime(a.published_parsed))
+            except Exception:
+                date = datetime.now()
+                fallback = True
+
            if date > threshold_date:
-                filename = get_filename(date, a.title)
-                if not os.path.exists(os.path.join(feedpath_new, filename)) and not os.path.exists(os.path.join(feedpath_read, filename)):
-                    text = get_article(a, feed['scrape'])
-                    write_to_file(os.path.join(feedpath_new, filename), text)
-                    log('    added article "{}"'.format(a.title))
+
+                # Check if article should be filtered
+                filter = False
+                for f in filters:
+                    if re.search(f, a.title.lower()):
+                        filter = True
+                        log('    filtered article "{}"'.format(a.title))
+
+                if not filter:
+
+                    # Construct filename
+                    filename_prefix = date.strftime("%Y%m%d%H%M")
+                    filename_postfix = get_filename_postfix(a.title)
+                    filename = "{}_{}".format(filename_prefix, filename_postfix)
+
+                    # Check if article exists
+                    article_exists = False
+                    if fallback:
+                        existing_articles_fallback = [a[13:] for a in existing_articles]
+                        if filename_postfix in existing_articles_fallback:
+                            article_exists = True
+                    elif filename in existing_articles:
+                        article_exists = True
+
+                    if not article_exists:
+                        text = get_article(a, feed)
+                        write_to_file(os.path.join(feedpath_new, filename), text)
+                        log('    added article "{}"'.format(a.title))

        except Exception as e:
-            error('while parsing feed article "{}" from feed "{}": {}'.format(a.title, feed['name'], e))
+            error(
+                'while parsing article "{}" from feed "{}": {}'.format(
+                    a.title, feed["name"], e
+                )
+            )


 # Delete articles older than max_age
 def remove_old_articles():

-    threshold_date = datetime.now() - timedelta(days = max_age)
+    threshold_date = datetime.now() - timedelta(days=max_age)
    count = 0
-    
+
    for subdir, dirs, files in os.walk(base_directory):

        # Skip 'loved' directory
-        if not os.path.join(base_directory, 'loved') in subdir:
+        if not os.path.join(base_directory, "loved") in subdir:
            for file in files:
-                 date = datetime.strptime(file[:12], '%Y%m%d%H%M')
-                 if threshold_date > date:
-                     os.remove(os.path.join(subdir, file))
-                     count += 1
+                date = datetime.strptime(file[:12], "%Y%m%d%H%M")
+                if threshold_date > date:
+                    os.remove(os.path.join(subdir, file))
+                    count += 1

-    log('  removed {} articles'.format(count))
+    log("  removed {} articles".format(count))


 # Parse config file
 def load_config(filepath):

-    global base_directory, max_age, datetime_format, postprocessor, fileending, feeds
+    global base_directory, max_age, datetime_format, postprocessor, fileending, filters, feeds

    try:
        config = toml.load(filepath)
-        base_directory = config['base_directory']
-        max_age = config['max_age']
-        datetime_format = config['datetime_format']
-        postprocessor = config['postprocessor']
-        fileending = config['fileending']
-        feeds = config['feed']
+        base_directory = config["base_directory"]
+        max_age = config["max_age"]
+        datetime_format = config["datetime_format"]
+        postprocessor = config["postprocessor"]
+        fileending = config["fileending"]
+        filters = config["filters"]
+        feeds = config["feed"]
    except Exception as e:
-        error('while parsing config: {}'.format(e))
+        error("while parsing config: {}".format(e))
        sys.exit(1)


 # Initialize spiderss
 def initialize():

+    global lovedpath
+
    # Create 'loved' directory if not existent
-    lovedpath = os.path.join(base_directory, 'loved')
+    lovedpath = os.path.join(base_directory, "loved")
    if not os.path.exists(lovedpath):
        os.makedirs(lovedpath)


-# Update all feeds and delete old messages
+# Update all feeds and remove old articles
 def crawl():

-    log('crawling feeds', True)
+    log("crawling feeds", True)
    for feed in feeds:
        update_feed(feed)

-    log('removing old articles', True)
+    log("removing old articles", True)
    remove_old_articles()

-'''
+
+"""
 Main
-'''
+"""
+

 def main():

    global verbose

    # Initialize parser
-    parser = argparse.ArgumentParser(description = 'Crawl RSS feeds and store articles as Markdown files.')
-    parser.add_argument('-v', '--verbose', action = 'store_true', help = 'verbose output')
-    parser.add_argument('-c', '--config', default = './config.toml', help = 'config file (default: ./config.toml)')
+    parser = argparse.ArgumentParser(
+        description="Crawl RSS feeds and store articles as Markdown files."
+    )
+    parser.add_argument("-v", "--verbose", action="store_true", help="verbose output")
+    parser.add_argument(
+        "-c",
+        "--config",
+        default="./config.toml",
+        help="config file (default: ./config.toml)",
+    )

    # Get args
    args = parser.parse_args()
@ -308,5 +407,5 @@ def main():
    crawl()


-if __name__ == '__main__':
+if __name__ == "__main__":
    main()
Author	SHA1	Message	Date
Denis Lehmann	de2782117a	update Nix file	2021-03-11 18:52:46 +01:00
Denis Lehmann	5726c201b9	Blacken code	2020-08-29 11:45:59 +02:00
Denis Lehmann	3d9531eb8d	add comma	2020-04-26 20:29:03 +02:00
Denis Lehmann	1ddc57c30a	update features	2020-04-26 20:04:13 +02:00
Denis Lehmann	fa542ee56e	add filter feature	2020-04-26 19:58:47 +02:00
Denis Lehmann	d630cc96c4	add handling of missing article links	2020-04-24 18:51:20 +02:00
Denis Lehmann	7d4d1311bb	show error message if no articles were returned from feedparser	2020-04-19 16:04:22 +02:00
Denis Lehmann	d37148ea32	add fallback for articles with unparseable date	2020-04-19 08:56:18 +02:00
Denis Lehmann	e4837d77c6	add fallback for image and summary	2020-04-18 21:24:37 +02:00
Denis Lehmann	8aed1df8c7	add user-agent for web requests	2020-04-18 20:28:51 +02:00