diff --git a/README.md b/README.md
index 36fc654..4b7b3ee 100644
--- a/README.md
+++ b/README.md
@@ -20,6 +20,7 @@ Use at your own risk!
 
 - Store articles in categories
 - Delete articles after a few days
+- Filter articles you're not interested in
 - Distinguish __new__ from __read__ articles
 - Store __loved__ articles forever
 - OPML import
@@ -66,14 +67,18 @@ base_directory = '/home//rss'
 # Articles older than max_age (days) will be deleted and not be added.
 max_age = 30
 
+# Date and time format as strftime, to be included in the articles.
+datetime_format = '%d.%m.%Y %H:%M'
+
 # Postprocessing command of the articles. The article is written to stdin in HTML format and read from stdout.
 postprocessor = 'pandoc -f html -t markdown_strict-raw_html --reference-links --reference-location=document'
 
 # Fileending for the article files.
 fileending = 'md'
 
-# Date and time format as strftime to be included in the articles.
-datetime_format = '%d.%m.%Y %H:%M'
+# List of regular expression strings. If any of these matches a lowercase article title, the article won't be saved.
+# E.g. if you want to skip news about RSS explicitly, add '(\W|^)rss(\W|$)'.
+filters = []
 
 # Feeds
 # The category can be empty (''). The feed fill then be stored in the base_directory.
@@ -153,6 +158,6 @@ Just synchronize the base_directory with [Syncthing](https://syncthing.net/), [r
 
 ## Acknowledgements
 
-Thanks to all the people, who created the nice libraries this project in based on.
+Thanks to all the people who created the nice software this project is based on.
 
 And also thanks to Dieter Steffmann who created the Canterbury font, which is used for the logo. You can find it in the `fonts/` directory.
diff --git a/config.toml b/config.toml
index fc60926..f7780c6 100644
--- a/config.toml
+++ b/config.toml
@@ -4,7 +4,7 @@ base_directory = '/home//rss'
 # Articles older than max_age (days) will be deleted and not be added.
 max_age = 30
 
-# Date and time format as strftime to be included in the articles.
+# Date and time format as strftime, to be included in the articles.
 datetime_format = '%d.%m.%Y %H:%M'
 
 # Postprocessing command of the articles. The article is written to stdin in HTML format and read from stdout.
@@ -13,6 +13,10 @@ postprocessor = 'pandoc -f html -t markdown_strict-raw_html+pipe_tables --refere
 # Fileending for the article files.
 fileending = 'md'
 
+# List of regular expression strings. If any of these matches a lowercase article title, the article won't be saved.
+# E.g. if you want to skip news about RSS explicitly, add '(\W|^)rss(\W|$)'.
+filters = []
+
 # Feeds
 # The category can be empty (''). The feed fill then be stored in the base_directory.
 # The category can also be a path, which will result in subdirectories (e.g. 'technology/hardware').
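
For reference, the new `filters` option amounts to a plain regex check against the lowercased article title, as documented in the config comments above. The sketch below only illustrates that behaviour; the pattern and titles are made-up examples, and the real check lives in `update_feed()` further down in this diff.

```python
import re

# Illustration of the documented `filters` semantics: each entry is a regular
# expression matched against the lowercased title; any hit skips the article.
filters = [r'(\W|^)rss(\W|$)']

def is_filtered(title):
    return any(re.search(f, title.lower()) for f in filters)

print(is_filtered('Why RSS is not dead'))    # True  -> article is skipped
print(is_filtered('A quiet week in tech'))   # False -> article is saved
```
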
diff --git a/default.nix b/default.nix
deleted file mode 100644
index 1c000b4..0000000
--- a/default.nix
+++ /dev/null
@@ -1,21 +0,0 @@
-with import <nixpkgs> {};
-
-stdenv.mkDerivation {
-  name = "myPythonEnv";
-  buildInputs = with pkgs; [
-    python37Full
-    python37Packages.virtualenv
-    pandoc
-  ];
-  src = null;
-  shellHook = ''
-    if [ ! -d .venv ]; then
-      python -m venv .venv
-    fi
-    source .venv/bin/activate
-    pip install --upgrade pip
-    if [ -s requirements.txt ]; then
-      pip install -r requirements.txt
-    fi
-  '';
-}
diff --git a/shell.nix b/shell.nix
new file mode 100644
index 0000000..eb0e5d6
--- /dev/null
+++ b/shell.nix
@@ -0,0 +1,38 @@
+{ pkgs ? import <nixpkgs> {} }:
+pkgs.mkShell {
+  name = "python-environment";
+  buildInputs = with pkgs; [
+    pandoc
+    python3
+    python3Packages.virtualenv
+  ];
+  shellHook = ''
+    function log_header {
+      echo -ne "==> \e[32m\e[1m$1\e[0m\n\n"
+    }
+    function log_subheader {
+      echo -ne "--> \e[33m\e[1m$1\e[0m\n\n"
+    }
+    function log {
+      echo -ne "    $1\n"
+    }
+
+    echo ""
+    log_header "python_environment"
+    if [ ! -d .venv ]; then
+      python -m venv .venv
+    fi
+    source .venv/bin/activate
+    log_subheader "upgrading pip"
+    pip install --upgrade pip
+    echo ""
+    if [ -s requirements.txt ]; then
+      log_subheader "found requirements.txt, installing packages"
+      pip install -r requirements.txt
+      echo ""
+    fi
+    log_header "package versions"
+    log "$(python --version)"
+    log "$(pip --version)"
+  '';
+}
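
As an aside, for readers who don't use Nix, the shellHook above boils down to "create `.venv` if missing, activate it, install `requirements.txt`". A rough stdlib-only equivalent might look like the sketch below — the `.venv` and `requirements.txt` names come from the hook, everything else is an assumption, and pandoc still has to be installed separately.

```python
import subprocess
import venv
from pathlib import Path

# Rough, non-Nix equivalent of the shellHook: create a local virtual
# environment and install the pinned requirements into it.
venv_dir = Path(".venv")

if not venv_dir.exists():
    venv.EnvBuilder(with_pip=True).create(venv_dir)

pip = venv_dir / "bin" / "pip"  # on Windows this would be .venv/Scripts/pip.exe
if Path("requirements.txt").is_file():
    subprocess.run([str(pip), "install", "-r", "requirements.txt"], check=True)
```
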
diff --git a/spiderss.py b/spiderss.py
index afd8a71..0c64051 100755
--- a/spiderss.py
+++ b/spiderss.py
@@ -8,7 +8,6 @@ import re
 import requests
 import subprocess
 import sys
-import time
 import toml
 from bs4 import BeautifulSoup
 from datetime import datetime, timedelta
@@ -16,24 +15,25 @@ from readability import Document
 from time import mktime
 from urllib.parse import urlsplit, urlunsplit
 
-'''
+"""
 Output functions
-'''
+"""
+
 
 # Print log message
-def log(text, force = False):
+def log(text, force=False):
     if verbose or force:
-        print('{} | {}'.format(datetime.now().strftime('%d.%m %H:%M'), text))
+        print("{} | {}".format(datetime.now().strftime("%d.%m %H:%M"), text))
 
 
 # Print error message and exit
 def error(text):
-    print('{} E {}'.format(datetime.now().strftime('%d.%m %H:%M'), text))
+    print("{} E {}".format(datetime.now().strftime("%d.%m %H:%M"), text))
 
 
 # Print spiderss logo
 def print_logo():
-    logo = '''
+    logo = """
 
           ;:   .N'
          ,K:   ,O   .0MWx'  lk  0;
@@ -46,54 +46,55 @@ def print_logo():
     ;kddkNKl.  XMNkWk,   :N0;  .'cOW0c.  ,lOW0;  .:0Nl.  okddOW0:.
       .kdoxXNd,    WMX      ;..cc
-    '''
+    """
 
     print(logo)
 
 
-'''
-Utility functions
-'''
-# Get articles of a feed
+"""
+Utility functions
+"""
+
+
+# Get articles of a feed
 def get_articles(feed):
-    feed = feedparser.parse(feed['url'])
+
+    feed = feedparser.parse(feed["url"])
     return feed.entries
 
 
 # Write text to file
 def write_to_file(filepath, text):
-    file = open(filepath, 'w')
+    file = open(filepath, "w")
     file.write(text)
     file.close()
 
 
-# Get filename from a date and a title
-def get_filename(date, title):
-
-    # Get date as single block
-    date = date.strftime('%Y%m%d%H%M')
+# Get filename postfix from a title
+def get_filename_postfix(title):
 
     # Get title as lowercase words concatenated with underscores
-    title = re.sub('[^A-Za-z0-9 ]+', '', title.lower())
-    title = re.sub(' ', '_', title)
-
-    return '{}_{}.{}'.format(date, title, fileending)
+    title = re.sub("[^A-Za-z0-9 ]+", "", title.lower())
+    title = re.sub(" ", "_", title)
+
+    return "{}.{}".format(title, fileending)
 
 
-# If scraped, use first content image as fallback
-# Get image snippet for an article
-def get_article_image(article):
-
+# Get HTML image snippet from the first image URL in a text
+def get_image_snippet(text):
+
     try:
-        image_url = re.search('(?P<image>https?://\S+(\.png|\.jpg|\.jpeg))', str(article), re.IGNORECASE).group('image')
+        image_url = re.search(
+            "(?P<image>https?://\S+(\.png|\.jpg|\.jpeg))", text, re.IGNORECASE
+        ).group("image")
        return '<img src="{}" alt="Image">\n\n'.format(image_url)
-    except:
-        return ''
+    except Exception:
+        return ""
 
 
-# Get summary snippet for an article
-def get_article_summary(article):
+# Get HTML summary snippet from an HTML text
+def get_summary_snippet(text):
 
     try:
         h = html2text.HTML2Text()
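
For reference, the renaming above splits the old `get_filename()` in two: the title-only postfix built here, and a date prefix that `update_feed()` (further down in this diff) puts in front. A small self-contained illustration of the resulting naming scheme — the date and title are made-up example values:

```python
import re
from datetime import datetime

fileending = "md"

# Mirror of get_filename_postfix() above: lowercase the title, drop everything
# but letters, digits and spaces, and join the words with underscores.
def get_filename_postfix(title):
    title = re.sub("[^A-Za-z0-9 ]+", "", title.lower())
    title = re.sub(" ", "_", title)
    return "{}.{}".format(title, fileending)

# update_feed() prepends the published date as a fixed-width YYYYMMDDHHMM prefix.
published = datetime(2021, 3, 14, 9, 30)
filename = "{}_{}".format(published.strftime("%Y%m%d%H%M"),
                          get_filename_postfix("Hello, World!"))
print(filename)  # 202103140930_hello_world.md
```
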
@@ -101,60 +102,82 @@ def get_article_summary(article)
         h.ignore_links = True
         h.ignore_images = True
         h.body_width = 0
-        return '<p>{}</p>\n\n'.format(summary)
-    except:
-        return ''
+        summary = h.handle(text).split("\n\n")[0].strip()
+        return "<p>{}</p>\n\n".format(summary)
+    except Exception:
+        return ""
 
 
 # Get article body either from web or its content
-def get_article_body(article, scrape):
+def get_article_body(article, feed):
 
-    body = ''
+    body = ""
 
-    # TODO: Include appropriate header?
     # If scrape, get article with readability
-    if scrape:
+    if feed["scrape"]:
 
-        response = requests.get(article.link)
+        headers = {
+            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36"
+        }
+        response = requests.get(article.link, headers=headers)
         doc = Document(response.text)
         body = doc.summary()
 
-        # Replace relative site links with absolute ones, using beautifulsoup
-        splitted_url = urlsplit(article.link)
-        soup = BeautifulSoup(body, features = 'lxml')
-        for img in soup.find_all('img', src = True):
-            src = img.get('src')
-            splitted_src = urlsplit(src)
-            constructed_src = [splitted_src.scheme, splitted_src.netloc, splitted_src.path, splitted_src.query, splitted_src.fragment]
-            if constructed_src[0] == '':
-                constructed_src[0] = splitted_url.scheme
-            if constructed_src[1] == '':
-                constructed_src[1] = splitted_url.netloc
-            new_src = urlunsplit(constructed_src)
-            if new_src.startswith('http'):
-                body = body.replace('"{}"'.format(src), '"{}"'.format(new_src), 1)
-
-        # TODO: catch mailto:
-        for a in soup.find_all('a', href = True):
-            href = a.get('href')
-            splitted_href = urlsplit(href)
-            constructed_href = [splitted_href.scheme, splitted_href.netloc, splitted_href.path, splitted_href.query, splitted_href.fragment]
-            if constructed_href[0] == '':
-                constructed_href[0] = splitted_url.scheme
-            if constructed_href[1] == '':
-                constructed_href[1] = splitted_url.netloc
-            new_href = urlunsplit(constructed_href)
-            if new_href.startswith('http'):
-                body = body.replace('"{}"'.format(href), '"{}"'.format(new_href), 1)
-
-
-    # Else construct from article content
+    # Else construct from article object
     else:
-
-        if hasattr(article, 'content'):
+
+        # Add all content to body
+        if hasattr(article, "content"):
             for c in article.content:
-                if c.type == 'text/html':
+                if c.type == "text/html" or c.type == "text/plain":
                     body += c.value
+        # Use summary as fallback
+        elif hasattr(article, "summary"):
+            body += article.summary
+
+    # Replace relative links with absolute ones, using beautifulsoup
+    try:
+        splitted_url = urlsplit(article.link)
+    except Exception:
+        splitted_url = urlsplit(feed["url"])
+
+    soup = BeautifulSoup(body, features="lxml")
+
+    for img in soup.find_all("img", src=True):
+        src = img.get("src")
+        splitted_src = urlsplit(src)
+        constructed_src = [
+            splitted_src.scheme,
+            splitted_src.netloc,
+            splitted_src.path,
+            splitted_src.query,
+            splitted_src.fragment,
+        ]
+        if constructed_src[0] == "":
+            constructed_src[0] = splitted_url.scheme
+        if constructed_src[1] == "":
+            constructed_src[1] = splitted_url.netloc
+        new_src = urlunsplit(constructed_src)
+        if new_src.startswith("http"):
+            body = body.replace('"{}"'.format(src), '"{}"'.format(new_src), 1)
+
+    for a in soup.find_all("a", href=True):
+        href = a.get("href")
+        splitted_href = urlsplit(href)
+        constructed_href = [
+            splitted_href.scheme,
+            splitted_href.netloc,
+            splitted_href.path,
+            splitted_href.query,
+            splitted_href.fragment,
+        ]
+        if constructed_href[0] == "":
+            constructed_href[0] = splitted_url.scheme
+        if constructed_href[1] == "":
+            constructed_href[1] = splitted_url.netloc
+        new_href = urlunsplit(constructed_href)
+        if new_href.startswith("http"):
+            body = body.replace('"{}"'.format(href), '"{}"'.format(new_href), 1)
 
     return body
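
As an aside, the rewrite above moves the relative-to-absolute link fix-up out of the scrape branch so it also runs for feed-provided content. Reduced to a single URL, the mechanism is just `urlsplit`/`urlunsplit` with the missing scheme and host taken from the article URL — the URLs below are made-up examples:

```python
from urllib.parse import urlsplit, urlunsplit

# Minimal sketch of the relative-to-absolute rewrite used above.
def absolutize(url, base):
    base_parts = urlsplit(base)
    parts = list(urlsplit(url))
    if parts[0] == "":
        parts[0] = base_parts.scheme
    if parts[1] == "":
        parts[1] = base_parts.netloc
    return urlunsplit(parts)

print(absolutize("/images/logo.png", "https://example.org/blog/post-1"))
# https://example.org/images/logo.png
```
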
@@ -163,32 +186,53 @@ def get_article_body(article, scrape):
 def postprocess(text):
 
     try:
-        processor = subprocess.Popen(postprocessor.split(' '), stdin = subprocess.PIPE, stdout = subprocess.PIPE, stderr = subprocess.PIPE)
-        (output, err) = processor.communicate(input = text.encode())
+        processor = subprocess.Popen(
+            postprocessor.split(" "),
+            stdin=subprocess.PIPE,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+        )
+        (output, err) = processor.communicate(input=text.encode())
         if err:
             raise Exception(err.decode().strip())
     except Exception as e:
-        error(' while postprocessing: {}'.format(e))
+        error(" while postprocessing: {}".format(e))
         sys.exit(1)
 
     return output.decode().strip()
 
 
 # Get constructed article
-def get_article(article, scrape):
-
-    # Construct head of article
-    image = get_article_image(article)
-    summary = get_article_summary(article)
-    #TODO: Current time as fallback?
-    date = datetime.fromtimestamp(mktime(article.published_parsed)).strftime(datetime_format)
-    head = '<h1>{}</h1>\n\n{}{}<p>{} - <a href="{}">Link</a></p>'.format(article.title, image, summary, date, article.link)
+def get_article(article, feed):
 
     # Get body of article
-    body = get_article_body(article, scrape)
+    body = get_article_body(article, feed)
+
+    # Construct head of article
+    image = get_image_snippet(str(article))
+    if image == "":
+        image = get_image_snippet(body)
+    summary = get_summary_snippet(article.summary)
+    if summary == "":
+        summary = get_summary_snippet(body)
+    try:
+        date = datetime.fromtimestamp(mktime(article.published_parsed)).strftime(
+            datetime_format
+        )
+    except Exception:
+        date = datetime.now().strftime(datetime_format)
+    try:
+        link = article.link
+    except Exception:
+        splitted_url = urlsplit(feed["url"])
+        splitted_link = [splitted_url.scheme, splitted_url.netloc, "", "", ""]
+        link = urlunsplit(splitted_link)
+    head = "<h1>{}</h1>\n\n{}{}<p>{} - <a href='{}'>Link</a></p>".format(
+        article.title, image, summary, date, link
+    )
 
     # Postprocess article
-    article_text = postprocess('{}\n\n<hr>\n\n{}'.format(head, body)).strip()
+    article_text = postprocess("{}\n\n<hr>\n\n{}".format(head, body)).strip()
 
     return article_text
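
For reference, you can see what `postprocess()` produces by piping a toy head/body pair through the configured pandoc command by hand. The HTML below is a made-up example, pandoc has to be on `PATH`, and `subprocess.run` is just a shorter way to sketch the same stdin/stdout pipe that the `Popen` code above implements:

```python
import subprocess

# Default postprocessor command from the README config (without pipe_tables).
postprocessor = "pandoc -f html -t markdown_strict-raw_html --reference-links --reference-location=document"

# Toy stand-in for an assembled article: head, <hr> separator, body.
html = (
    "<h1>Example article</h1>\n\n"
    "<p>14.03.2021 09:30 - <a href='https://example.org/post'>Link</a></p>\n\n"
    "<hr>\n\n"
    "<p>First paragraph of the example body.</p>"
)

result = subprocess.run(
    postprocessor.split(" "), input=html.encode(), capture_output=True, check=True
)
print(result.stdout.decode().strip())  # Markdown version of the article
```
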
@@ -196,105 +240,160 @@
 # Update feed
 def update_feed(feed):
 
-    log(' updating feed "{}"'.format(feed['name']))
+    log(' updating feed "{}"'.format(feed["name"]))
 
     # Set feedpaths
-    feedpath_new = os.path.join(base_directory, feed['category'], feed['name'], 'new')
-    feedpath_read = os.path.join(base_directory, feed['category'], feed['name'], 'read')
-
+    feedpath_new = os.path.join(base_directory, feed["category"], feed["name"], "new")
+    feedpath_read = os.path.join(base_directory, feed["category"], feed["name"], "read")
+
     if not os.path.exists(feedpath_new):
         os.makedirs(feedpath_new)
-
+
     if not os.path.exists(feedpath_read):
         os.makedirs(feedpath_read)
 
+    # Get existing articles
+    existing_articles = (
+        os.listdir(feedpath_new) + os.listdir(feedpath_read) + os.listdir(lovedpath)
+    )
+
     # Update articles
     articles = get_articles(feed)
-    threshold_date = datetime.now() - timedelta(days = max_age)
-
+    threshold_date = datetime.now() - timedelta(days=max_age)
+
+    if len(articles) == 0:
+        error('no articles received from feed "{}"'.format(feed["name"]))
+
     for a in articles:
-
         try:
-            date = datetime.fromtimestamp(mktime(a.published_parsed))
+
+            # Set fallback if no parseable date found
+            fallback = False
+            try:
+                date = datetime.fromtimestamp(mktime(a.published_parsed))
+            except Exception:
+                date = datetime.now()
+                fallback = True
+
             if date > threshold_date:
-                filename = get_filename(date, a.title)
-                if not os.path.exists(os.path.join(feedpath_new, filename)) and not os.path.exists(os.path.join(feedpath_read, filename)):
-                    text = get_article(a, feed['scrape'])
-                    write_to_file(os.path.join(feedpath_new, filename), text)
-                    log(' added article "{}"'.format(a.title))
+
+                # Check if article should be filtered
+                filter = False
+                for f in filters:
+                    if re.search(f, a.title.lower()):
+                        filter = True
+                        log(' filtered article "{}"'.format(a.title))
+
+                if not filter:
+
+                    # Construct filename
+                    filename_prefix = date.strftime("%Y%m%d%H%M")
+                    filename_postfix = get_filename_postfix(a.title)
+                    filename = "{}_{}".format(filename_prefix, filename_postfix)
+
+                    # Check if article exists
+                    article_exists = False
+                    if fallback:
+                        existing_articles_fallback = [a[13:] for a in existing_articles]
+                        if filename_postfix in existing_articles_fallback:
+                            article_exists = True
+                    elif filename in existing_articles:
+                        article_exists = True
+
+                    if not article_exists:
+                        text = get_article(a, feed)
+                        write_to_file(os.path.join(feedpath_new, filename), text)
+                        log(' added article "{}"'.format(a.title))
 
         except Exception as e:
-            error('while parsing feed article "{}" from feed "{}": {}'.format(a.title, feed['name'], e))
+            error(
+                'while parsing article "{}" from feed "{}": {}'.format(
+                    a.title, feed["name"], e
+                )
+            )
 
 
 # Delete articles older than max_age
 def remove_old_articles():
 
-    threshold_date = datetime.now() - timedelta(days = max_age)
+    threshold_date = datetime.now() - timedelta(days=max_age)
     count = 0
-
+
     for subdir, dirs, files in os.walk(base_directory):
 
         # Skip 'loved' directory
-        if not os.path.join(base_directory, 'loved') in subdir:
+        if not os.path.join(base_directory, "loved") in subdir:
             for file in files:
-                date = datetime.strptime(file[:12], '%Y%m%d%H%M')
-                if threshold_date > date:
-                    os.remove(os.path.join(subdir, file))
-                    count += 1
+                date = datetime.strptime(file[:12], "%Y%m%d%H%M")
+                if threshold_date > date:
+                    os.remove(os.path.join(subdir, file))
+                    count += 1
 
-    log(' removed {} articles'.format(count))
+    log(" removed {} articles".format(count))
 
 
 # Parse config file
 def load_config(filepath):
 
-    global base_directory, max_age, datetime_format, postprocessor, fileending, feeds
+    global base_directory, max_age, datetime_format, postprocessor, fileending, filters, feeds
 
     try:
         config = toml.load(filepath)
-        base_directory = config['base_directory']
-        max_age = config['max_age']
-        datetime_format = config['datetime_format']
-        postprocessor = config['postprocessor']
-        fileending = config['fileending']
-        feeds = config['feed']
+        base_directory = config["base_directory"]
+        max_age = config["max_age"]
+        datetime_format = config["datetime_format"]
+        postprocessor = config["postprocessor"]
+        fileending = config["fileending"]
+        filters = config["filters"]
+        feeds = config["feed"]
     except Exception as e:
-        error('while parsing config: {}'.format(e))
+        error("while parsing config: {}".format(e))
         sys.exit(1)
 
 
 # Initialize spiderss
 def initialize():
 
+    global lovedpath
+
     # Create 'loved' directory if not existent
-    lovedpath = os.path.join(base_directory, 'loved')
+    lovedpath = os.path.join(base_directory, "loved")
     if not os.path.exists(lovedpath):
         os.makedirs(lovedpath)
 
 
-# Update all feeds and delete old messages
+# Update all feeds and remove old articles
 def crawl():
 
-    log('crawling feeds', True)
+    log("crawling feeds", True)
     for feed in feeds:
         update_feed(feed)
 
-    log('removing old articles', True)
+    log("removing old articles", True)
     remove_old_articles()
 
 
-'''
+
+"""
 Main
-'''
+"""
+
 
 def main():
 
     global verbose
 
     # Initialize parser
-    parser = argparse.ArgumentParser(description = 'Crawl RSS feeds and store articles as Markdown files.')
-    parser.add_argument('-v', '--verbose', action = 'store_true', help = 'verbose output')
-    parser.add_argument('-c', '--config', default = './config.toml', help = 'config file (default: ./config.toml)')
+    parser = argparse.ArgumentParser(
+        description="Crawl RSS feeds and store articles as Markdown files."
+    )
+    parser.add_argument("-v", "--verbose", action="store_true", help="verbose output")
+    parser.add_argument(
+        "-c",
+        "--config",
+        default="./config.toml",
+        help="config file (default: ./config.toml)",
+    )
 
     # Get args
     args = parser.parse_args()
@@ -308,5 +407,5 @@
     crawl()
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
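
As a closing aside, several of the changes above lean on the fixed-width filename layout: `remove_old_articles()` parses `file[:12]` to decide what to prune, and the new fallback duplicate check compares `a[13:]`, i.e. everything after the 12-digit date prefix and its underscore. A tiny illustration with a made-up filename:

```python
from datetime import datetime, timedelta

# Example filename following the YYYYMMDDHHMM_<title>.<fileending> scheme.
filename = "202103140930_hello_world.md"

date = datetime.strptime(filename[:12], "%Y%m%d%H%M")  # used by remove_old_articles()
postfix = filename[13:]  # "hello_world.md", used by the fallback duplicate check

max_age = 30
is_expired = datetime.now() - timedelta(days=max_age) > date

print(date, postfix, is_expired)
```
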