add filter feature

2020-04-26 19:58:47 +02:00 · 2020-04-26 19:58:47 +02:00 · fa542ee56e
commit fa542ee56e
parent d630cc96c4
3 changed files with 36 additions and 18 deletions
--- a/README.md
+++ b/README.md
@ -72,6 +72,10 @@ postprocessor = 'pandoc -f html -t markdown_strict-raw_html --reference-links --
 # Fileending for the article files.
 fileending = 'md'

+# List of regular expression strings. If any of these matches the lowercased article title, the article won't be saved.
+# E.g. if you want to skip news about RSS explicitly, add '(\W|^)rss(\W|$)'.
+filters = []
+
 # Date and time format as strftime to be included in the articles.
 datetime_format = '%d.%m.%Y %H:%M'

@ -153,6 +157,6 @@ Just synchronize the base_directory with [Syncthing](https://syncthing.net/), [r

 ## Acknowledgements

-Thanks to all the people, who created the nice libraries this project in based on.
+Thanks to all the people who created the nice software this project is based on.
 And also thanks to Dieter Steffmann who created the Canterbury font, which is used for the logo.
 You can find it in the `fonts/` directory.
--- a/config.toml
+++ b/config.toml
@ -13,6 +13,10 @@ postprocessor = 'pandoc -f html -t markdown_strict-raw_html+pipe_tables --refere
 # Fileending for the article files.
 fileending = 'md'

+# List of regular expression strings. If any of these matches the lowercased article title, the article won't be saved.
+# E.g. if you want to skip news about RSS explicitly, add '(\W|^)rss(\W|$)'.
+filters = []
+
 # Feeds
 # The category can be empty (''). The feed fill then be stored in the base_directory.
 # The category can also be a path, which will result in subdirectories (e.g. 'technology/hardware').
--- a/spiderss.py
+++ b/spiderss.py
@ -249,24 +249,33 @@ def update_feed(feed):

            if date > threshold_date:

-                # Construct filename
-                filename_prefix = date.strftime('%Y%m%d%H%M')
-                filename_postfix = get_filename_postfix(a.title)
-                filename = '{}_{}'.format(filename_prefix, filename_postfix)
+                # Check if article should be filtered
+                filter = False
+                for f in filters:
+                    if re.search(f, a.title.lower()):
+                        filter = True
+                        log('    filtered article "{}"'.format(a.title))

-                # Check if article exists
-                article_exists = False
-                if fallback:
-                    existing_articles_fallback = [a[13:] for a in existing_articles]
-                    if filename_postfix in existing_articles_fallback:
+                if not filter:
+
+                    # Construct filename
+                    filename_prefix = date.strftime('%Y%m%d%H%M')
+                    filename_postfix = get_filename_postfix(a.title)
+                    filename = '{}_{}'.format(filename_prefix, filename_postfix)
+
+                    # Check if article exists
+                    article_exists = False
+                    if fallback:
+                        existing_articles_fallback = [a[13:] for a in existing_articles]
+                        if filename_postfix in existing_articles_fallback:
+                            article_exists = True
+                    elif filename in existing_articles:
                        article_exists = True
-                elif filename in existing_articles:
-                    article_exists = True

-                if not article_exists:
-                    text = get_article(a, feed)
-                    write_to_file(os.path.join(feedpath_new, filename), text)
-                    log('    added article "{}"'.format(a.title))
+                    if not article_exists:
+                        text = get_article(a, feed)
+                        write_to_file(os.path.join(feedpath_new, filename), text)
+                        log('    added article "{}"'.format(a.title))

        except Exception as e:
            error('while parsing article "{}" from feed "{}": {}'.format(a.title, feed['name'], e))
@ -294,7 +303,7 @@ def remove_old_articles():
 # Parse config file
 def load_config(filepath):

-    global base_directory, max_age, datetime_format, postprocessor, fileending, feeds
+    global base_directory, max_age, datetime_format, postprocessor, fileending, filters, feeds

    try:
        config = toml.load(filepath)
@ -303,6 +312,7 @@ def load_config(filepath):
        datetime_format = config['datetime_format']
        postprocessor = config['postprocessor']
        fileending = config['fileending']
+        filters = config['filters']
        feeds = config['feed']
    except Exception as e:
        error('while parsing config: {}'.format(e))