From fa542ee56e47806546d3dcb23847bd8352b3b697 Mon Sep 17 00:00:00 2001 From: Denis Lehmann Date: Sun, 26 Apr 2020 19:58:47 +0200 Subject: [PATCH] add filter feature --- README.md | 6 +++++- config.toml | 4 ++++ spiderss.py | 44 +++++++++++++++++++++++++++----------------- 3 files changed, 36 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index 36fc654..66aa225 100644 --- a/README.md +++ b/README.md @@ -72,6 +72,10 @@ postprocessor = 'pandoc -f html -t markdown_strict-raw_html --reference-links -- # Fileending for the article files. fileending = 'md' +# List of regular expression strings. If any of these matches the lowercased article title, the article won't be saved. +# E.g. if you want to skip news about RSS explicitly, add '(\W|^)rss(\W|$)'. +filters = [] + # Date and time format as strftime to be included in the articles. datetime_format = '%d.%m.%Y %H:%M' @@ -153,6 +157,6 @@ Just synchronize the base_directory with [Syncthing](https://syncthing.net/), [r ## Acknowledgements -Thanks to all the people, who created the nice libraries this project in based on. +Thanks to all the people who created the nice software this project is based on. And also thanks to Dieter Steffmann who created the Canterbury font, which is used for the logo. You can find it in the `fonts/` directory. diff --git a/config.toml b/config.toml index fc60926..7d3de98 100644 --- a/config.toml +++ b/config.toml @@ -13,6 +13,10 @@ postprocessor = 'pandoc -f html -t markdown_strict-raw_html+pipe_tables --refere # Fileending for the article files. fileending = 'md' +# List of regular expression strings. If any of these matches the lowercased article title, the article won't be saved. +# E.g. if you want to skip news about RSS explicitly, add '(\W|^)rss(\W|$)'. +filters = [] + # Feeds # The category can be empty (''). The feed fill then be stored in the base_directory. # The category can also be a path, which will result in subdirectories (e.g. 'technology/hardware'). 
diff --git a/spiderss.py b/spiderss.py index 0b1397c..b30ede0 100755 --- a/spiderss.py +++ b/spiderss.py @@ -246,27 +246,36 @@ def update_feed(feed): except: date = datetime.now() fallback = True - + if date > threshold_date: - # Construct filename - filename_prefix = date.strftime('%Y%m%d%H%M') - filename_postfix = get_filename_postfix(a.title) - filename = '{}_{}'.format(filename_prefix, filename_postfix) + # Check if article should be filtered + filter = False + for f in filters: + if re.search(f, a.title.lower()): + filter = True + log(' filtered article "{}"'.format(a.title)) - # Check if article exists - article_exists = False - if fallback: - existing_articles_fallback = [a[13:] for a in existing_articles] - if filename_postfix in existing_articles_fallback: + if not filter: + + # Construct filename + filename_prefix = date.strftime('%Y%m%d%H%M') + filename_postfix = get_filename_postfix(a.title) + filename = '{}_{}'.format(filename_prefix, filename_postfix) + + # Check if article exists + article_exists = False + if fallback: + existing_articles_fallback = [a[13:] for a in existing_articles] + if filename_postfix in existing_articles_fallback: + article_exists = True + elif filename in existing_articles: article_exists = True - elif filename in existing_articles: - article_exists = True - if not article_exists: - text = get_article(a, feed) - write_to_file(os.path.join(feedpath_new, filename), text) - log(' added article "{}"'.format(a.title)) + if not article_exists: + text = get_article(a, feed) + write_to_file(os.path.join(feedpath_new, filename), text) + log(' added article "{}"'.format(a.title)) except Exception as e: error('while parsing article "{}" from feed "{}": {}'.format(a.title, feed['name'], e)) @@ -294,7 +303,7 @@ def remove_old_articles(): # Parse config file def load_config(filepath): - global base_directory, max_age, datetime_format, postprocessor, fileending, feeds + global base_directory, max_age, datetime_format, postprocessor, 
fileending, filters, feeds try: config = toml.load(filepath) @@ -303,6 +312,7 @@ def load_config(filepath): datetime_format = config['datetime_format'] postprocessor = config['postprocessor'] fileending = config['fileending'] + filters = config['filters'] feeds = config['feed'] except Exception as e: error('while parsing config: {}'.format(e))