From 8aed1df8c748e18762eb9284b5eda9dfc4700d63 Mon Sep 17 00:00:00 2001 From: Denis Lehmann Date: Sat, 18 Apr 2020 20:28:51 +0200 Subject: [PATCH 01/10] add user-agent for web requests --- spiderss.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/spiderss.py b/spiderss.py index afd8a71..bef6b3e 100755 --- a/spiderss.py +++ b/spiderss.py @@ -111,11 +111,11 @@ def get_article_body(article, scrape): body = '' - # TODO: Include appropriate header? # If scrape, get article with readability if scrape: - response = requests.get(article.link) + headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36'} + response = requests.get(article.link, headers = headers) doc = Document(response.text) body = doc.summary() @@ -134,7 +134,6 @@ def get_article_body(article, scrape): if new_src.startswith('http'): body = body.replace('"{}"'.format(src), '"{}"'.format(new_src), 1) - # TODO: catch mailto: for a in soup.find_all('a', href = True): href = a.get('href') splitted_href = urlsplit(href) From e4837d77c641944296cb0c14302f376005bdfb01 Mon Sep 17 00:00:00 2001 From: Denis Lehmann Date: Sat, 18 Apr 2020 21:24:37 +0200 Subject: [PATCH 02/10] add fallback for image and summary --- spiderss.py | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/spiderss.py b/spiderss.py index bef6b3e..9ef859b 100755 --- a/spiderss.py +++ b/spiderss.py @@ -81,19 +81,18 @@ def get_filename(date, title): return '{}_{}.{}'.format(date, title, fileending) -# If scraped, use first content image as fallback -# Get image snippet for an article -def get_article_image(article): +# Get HTML image snippet from the first image url in a text +def get_image_snippet(text): try: - image_url = re.search('(?Phttps?://\S+(\.png|\.jpg|\.jpeg))', str(article), re.IGNORECASE).group('image') + image_url = re.search('(?Phttps?://\S+(\.png|\.jpg|\.jpeg))', text, re.IGNORECASE).group('image') return 'Image\n\n'.format(image_url) except: return '' -# Get summary snippet for an article -def get_article_summary(article): +# Get HTML summary snippet from a HTML text +def get_summary_snippet(text): try: h = html2text.HTML2Text() @@ -101,6 +100,7 @@ def get_article_summary(article): h.ignore_links = True h.ignore_images = True h.body_width = 0 + summary = h.handle(text).split('\n\n')[0].strip() return '

<p>{}</p>
\n\n'.format(summary) except: return '' @@ -176,16 +176,19 @@ def postprocess(text): # Get constructed article def get_article(article, scrape): - # Construct head of article - image = get_article_image(article) - summary = get_article_summary(article) - #TODO: Current time as fallback? - date = datetime.fromtimestamp(mktime(article.published_parsed)).strftime(datetime_format) - head = '

<h1>{}</h1>\n\n{}{}<p>{} - <a href={}>Link</a></p>
'.format(article.title, image, summary, date, article.link) - # Get body of article body = get_article_body(article, scrape) + # Construct head of article + image = get_image_snippet(str(article)) + if image == '': + image = get_image_snippet(body) + summary = get_summary_snippet(article.summary) + if summary == '': + summary = get_summary_snippet(body) + date = datetime.fromtimestamp(mktime(article.published_parsed)).strftime(datetime_format) + head = '

<h1>{}</h1>\n\n{}{}<p>{} - <a href={}>Link</a></p>
'.format(article.title, image, summary, date, article.link) + # Postprocess article article_text = postprocess('{}\n\n
\n\n{}'.format(head, body)).strip() @@ -214,6 +217,7 @@ def update_feed(feed): for a in articles: try: + #TODO: Current time as fallback? date = datetime.fromtimestamp(mktime(a.published_parsed)) if date > threshold_date: filename = get_filename(date, a.title) From d37148ea3267de3b39ad44463a980ab96cbd1f87 Mon Sep 17 00:00:00 2001 From: Denis Lehmann Date: Sun, 19 Apr 2020 08:56:18 +0200 Subject: [PATCH 03/10] add fallback for articles with unparseable date --- spiderss.py | 48 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 37 insertions(+), 11 deletions(-) diff --git a/spiderss.py b/spiderss.py index 9ef859b..99ea317 100755 --- a/spiderss.py +++ b/spiderss.py @@ -68,17 +68,14 @@ def write_to_file(filepath, text): file.close() -# Get filename from a date and a title -def get_filename(date, title): - - # Get date as single block - date = date.strftime('%Y%m%d%H%M') +# Get filename postfix from a title +def get_filename_postfix(title): # Get title as lowercase words concatenated with underscores title = re.sub('[^A-Za-z0-9 ]+', '', title.lower()) title = re.sub(' ', '_', title) - return '{}_{}.{}'.format(date, title, fileending) + return '{}.{}'.format(title, fileending) # Get HTML image snippet from the first image url in a text @@ -186,7 +183,10 @@ def get_article(article, scrape): summary = get_summary_snippet(article.summary) if summary == '': summary = get_summary_snippet(body) - date = datetime.fromtimestamp(mktime(article.published_parsed)).strftime(datetime_format) + try: + date = datetime.fromtimestamp(mktime(article.published_parsed)).strftime(datetime_format) + except: + date = datetime.now().strftime(datetime_format) head = '

<h1>{}</h1>\n\n{}{}<p>{} - <a href={}>Link</a></p>
'.format(article.title, image, summary, date, article.link) # Postprocess article @@ -210,6 +210,9 @@ def update_feed(feed): if not os.path.exists(feedpath_read): os.makedirs(feedpath_read) + # Get exisiting articles + existing_articles = os.listdir(feedpath_new) + os.listdir(feedpath_read) + os.listdir(lovedpath) + # Update articles articles = get_articles(feed) threshold_date = datetime.now() - timedelta(days = max_age) @@ -217,11 +220,32 @@ def update_feed(feed): for a in articles: try: - #TODO: Current time as fallback? - date = datetime.fromtimestamp(mktime(a.published_parsed)) + + # Set fallback if no parseable date found + fallback = False + try: + date = datetime.fromtimestamp(mktime(a.published_parsed)) + except: + date = datetime.now() + fallback = True + if date > threshold_date: - filename = get_filename(date, a.title) - if not os.path.exists(os.path.join(feedpath_new, filename)) and not os.path.exists(os.path.join(feedpath_read, filename)): + + # Construct filename + filename_prefix = date.strftime('%Y%m%d%H%M') + filename_postfix = get_filename_postfix(a.title) + filename = '{}_{}'.format(filename_prefix, filename_postfix) + + # Check if article exists + article_exists = False + if fallback: + existing_articles_fallback = [a[13:] for a in existing_articles] + if filename_postfix in existing_articles_fallback: + article_exists = True + elif filename in existing_articles: + article_exists = True + + if not article_exists: text = get_article(a, feed['scrape']) write_to_file(os.path.join(feedpath_new, filename), text) log(' added article "{}"'.format(a.title)) @@ -270,6 +294,8 @@ def load_config(filepath): # Initialize spiderss def initialize(): + global lovedpath + # Create 'loved' directory if not existent lovedpath = os.path.join(base_directory, 'loved') if not os.path.exists(lovedpath): From 7d4d1311bb8fb96938ad2a15669a0dcf39071611 Mon Sep 17 00:00:00 2001 From: Denis Lehmann Date: Sun, 19 Apr 2020 16:04:22 +0200 Subject: [PATCH 04/10] show error message if no articles were returned from feedparser --- spiderss.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/spiderss.py b/spiderss.py index 99ea317..45ba037 100755 --- a/spiderss.py +++ b/spiderss.py @@ -56,6 +56,7 @@ Utility functions # Get articles of a feed def get_articles(feed): + feed = feedparser.parse(feed['url']) return feed.entries @@ -216,6 +217,9 @@ def update_feed(feed): # Update articles articles = get_articles(feed) threshold_date = datetime.now() - timedelta(days = max_age) + + if len(articles) == 0: + error('no articles received from feed "{}"'.format(feed['name'])) for a in articles: @@ -302,7 +306,7 @@ def initialize(): os.makedirs(lovedpath) -# Update all feeds and delete old messages +# Update all feeds and remove old articles def crawl(): log('crawling feeds', True) From d630cc96c4fc9d41f7e213ab2be099c328873d8b Mon Sep 17 00:00:00 2001 From: Denis Lehmann Date: Fri, 24 Apr 2020 18:51:20 +0200 Subject: [PATCH 05/10] add handling of missing article links --- spiderss.py | 90 +++++++++++++++++++++++++++++++---------------------- 1 file changed, 52 insertions(+), 38 deletions(-) diff --git a/spiderss.py b/spiderss.py index 45ba037..0b1397c 100755 --- a/spiderss.py +++ b/spiderss.py @@ -105,54 +105,62 @@ def get_summary_snippet(text): # Get article body either from web or its content -def get_article_body(article, scrape): +def get_article_body(article, feed): body = '' # If scrape, get article with readability - if scrape: + if feed['scrape']: headers = {'User-Agent': 'Mozilla/5.0 
(X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36'} response = requests.get(article.link, headers = headers) doc = Document(response.text) body = doc.summary() - # Replace relative site links with absolute ones, using beautifulsoup - splitted_url = urlsplit(article.link) - soup = BeautifulSoup(body, features = 'lxml') - for img in soup.find_all('img', src = True): - src = img.get('src') - splitted_src = urlsplit(src) - constructed_src = [splitted_src.scheme, splitted_src.netloc, splitted_src.path, splitted_src.query, splitted_src.fragment] - if constructed_src[0] == '': - constructed_src[0] = splitted_url.scheme - if constructed_src[1] == '': - constructed_src[1] = splitted_url.netloc - new_src = urlunsplit(constructed_src) - if new_src.startswith('http'): - body = body.replace('"{}"'.format(src), '"{}"'.format(new_src), 1) - - for a in soup.find_all('a', href = True): - href = a.get('href') - splitted_href = urlsplit(href) - constructed_href = [splitted_href.scheme, splitted_href.netloc, splitted_href.path, splitted_href.query, splitted_href.fragment] - if constructed_href[0] == '': - constructed_href[0] = splitted_url.scheme - if constructed_href[1] == '': - constructed_href[1] = splitted_url.netloc - new_href = urlunsplit(constructed_href) - if new_href.startswith('http'): - body = body.replace('"{}"'.format(href), '"{}"'.format(new_href), 1) - - - # Else construct from article content + # Else construct from article object else: - + + # Add all content to body if hasattr(article, 'content'): for c in article.content: - if c.type == 'text/html': + if c.type == 'text/html' or c.type == 'text/plain': body += c.value + # Use summary as fallback + elif hasattr(article, 'summary'): + body += article.summary + # Replace relative links with absolute ones, using beautifulsoup + try: + splitted_url = urlsplit(article.link) + except: + splitted_url = urlsplit(feed['url']) + + soup = BeautifulSoup(body, features = 'lxml') + + for img in soup.find_all('img', src = True): + src = img.get('src') + splitted_src = urlsplit(src) + constructed_src = [splitted_src.scheme, splitted_src.netloc, splitted_src.path, splitted_src.query, splitted_src.fragment] + if constructed_src[0] == '': + constructed_src[0] = splitted_url.scheme + if constructed_src[1] == '': + constructed_src[1] = splitted_url.netloc + new_src = urlunsplit(constructed_src) + if new_src.startswith('http'): + body = body.replace('"{}"'.format(src), '"{}"'.format(new_src), 1) + + for a in soup.find_all('a', href = True): + href = a.get('href') + splitted_href = urlsplit(href) + constructed_href = [splitted_href.scheme, splitted_href.netloc, splitted_href.path, splitted_href.query, splitted_href.fragment] + if constructed_href[0] == '': + constructed_href[0] = splitted_url.scheme + if constructed_href[1] == '': + constructed_href[1] = splitted_url.netloc + new_href = urlunsplit(constructed_href) + if new_href.startswith('http'): + body = body.replace('"{}"'.format(href), '"{}"'.format(new_href), 1) + return body @@ -172,10 +180,10 @@ def postprocess(text): # Get constructed article -def get_article(article, scrape): +def get_article(article, feed): # Get body of article - body = get_article_body(article, scrape) + body = get_article_body(article, feed) # Construct head of article image = get_image_snippet(str(article)) @@ -188,7 +196,13 @@ def get_article(article, scrape): date = datetime.fromtimestamp(mktime(article.published_parsed)).strftime(datetime_format) except: date = 
datetime.now().strftime(datetime_format) - head = '

<h1>{}</h1>\n\n{}{}<p>{} - <a href={}>Link</a></p>
'.format(article.title, image, summary, date, article.link) + try: + link = article.link + except: + splitted_url = urlsplit(feed['url']) + splitted_link = [splitted_url.scheme, splitted_url.netloc, '', '', ''] + link = urlunsplit(splitted_link) + head = '

<h1>{}</h1>\n\n{}{}<p>{} - <a href={}>Link</a></p>
'.format(article.title, image, summary, date, link) # Postprocess article article_text = postprocess('{}\n\n
\n\n{}'.format(head, body)).strip() @@ -250,12 +264,12 @@ def update_feed(feed): article_exists = True if not article_exists: - text = get_article(a, feed['scrape']) + text = get_article(a, feed) write_to_file(os.path.join(feedpath_new, filename), text) log(' added article "{}"'.format(a.title)) except Exception as e: - error('while parsing feed article "{}" from feed "{}": {}'.format(a.title, feed['name'], e)) + error('while parsing article "{}" from feed "{}": {}'.format(a.title, feed['name'], e)) # Delete articles older than max_age From fa542ee56e47806546d3dcb23847bd8352b3b697 Mon Sep 17 00:00:00 2001 From: Denis Lehmann Date: Sun, 26 Apr 2020 19:58:47 +0200 Subject: [PATCH 06/10] add filter feature --- README.md | 6 +++++- config.toml | 4 ++++ spiderss.py | 44 +++++++++++++++++++++++++++----------------- 3 files changed, 36 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index 36fc654..66aa225 100644 --- a/README.md +++ b/README.md @@ -72,6 +72,10 @@ postprocessor = 'pandoc -f html -t markdown_strict-raw_html --reference-links -- # Fileending for the article files. fileending = 'md' +# List of regular expression strings. If any of these matches an lowercase article title, the article won't be saved. +# E.g. if you wan't to skip news about RSS explicitly, add '(\W|^)rss(\W|$)'. +filters = [] + # Date and time format as strftime to be included in the articles. datetime_format = '%d.%m.%Y %H:%M' @@ -153,6 +157,6 @@ Just synchronize the base_directory with [Syncthing](https://syncthing.net/), [r ## Acknowledgements -Thanks to all the people, who created the nice libraries this project in based on. +Thanks to all the people who created the nice software, this project in based on. And also thanks to Dieter Steffmann who created the Canterbury font, which is used for the logo. You can find it in the `fonts/` directory. diff --git a/config.toml b/config.toml index fc60926..7d3de98 100644 --- a/config.toml +++ b/config.toml @@ -13,6 +13,10 @@ postprocessor = 'pandoc -f html -t markdown_strict-raw_html+pipe_tables --refere # Fileending for the article files. fileending = 'md' +# List of regular expression strings. If any of these matches an lowercase article title, the article won't be saved. +# E.g. if you wan't to skip news about RSS explicitly, add '(\W|^)rss(\W|$)'. +filters = [] + # Feeds # The category can be empty (''). The feed fill then be stored in the base_directory. # The category can also be a path, which will result in subdirectories (e.g. 'technology/hardware'). 
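
The filters option added by this patch is a list of regular expressions that update_feed matches against the lowercased article title; any match causes the article to be skipped before it is written to disk. A minimal sketch of that check, reusing the regex from the config comment (the filter list and titles below are made-up examples):

    import re

    # Hypothetical filter list, taken from the config comment's own example.
    filters = [r'(\W|^)rss(\W|$)']

    def is_filtered(title):
        # update_feed skips an article when any filter matches its lowercased title.
        return any(re.search(f, title.lower()) for f in filters)

    print(is_filtered('Why RSS Still Matters'))         # True  -> article is skipped
    print(is_filtered('A New Static Site Generator'))   # False -> article is saved
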
diff --git a/spiderss.py b/spiderss.py index 0b1397c..b30ede0 100755 --- a/spiderss.py +++ b/spiderss.py @@ -246,27 +246,36 @@ def update_feed(feed): except: date = datetime.now() fallback = True - + if date > threshold_date: - # Construct filename - filename_prefix = date.strftime('%Y%m%d%H%M') - filename_postfix = get_filename_postfix(a.title) - filename = '{}_{}'.format(filename_prefix, filename_postfix) + # Check if article should be filtered + filter = False + for f in filters: + if re.search(f, a.title.lower()): + filter = True + log(' filtered article "{}"'.format(a.title)) - # Check if article exists - article_exists = False - if fallback: - existing_articles_fallback = [a[13:] for a in existing_articles] - if filename_postfix in existing_articles_fallback: + if not filter: + + # Construct filename + filename_prefix = date.strftime('%Y%m%d%H%M') + filename_postfix = get_filename_postfix(a.title) + filename = '{}_{}'.format(filename_prefix, filename_postfix) + + # Check if article exists + article_exists = False + if fallback: + existing_articles_fallback = [a[13:] for a in existing_articles] + if filename_postfix in existing_articles_fallback: + article_exists = True + elif filename in existing_articles: article_exists = True - elif filename in existing_articles: - article_exists = True - if not article_exists: - text = get_article(a, feed) - write_to_file(os.path.join(feedpath_new, filename), text) - log(' added article "{}"'.format(a.title)) + if not article_exists: + text = get_article(a, feed) + write_to_file(os.path.join(feedpath_new, filename), text) + log(' added article "{}"'.format(a.title)) except Exception as e: error('while parsing article "{}" from feed "{}": {}'.format(a.title, feed['name'], e)) @@ -294,7 +303,7 @@ def remove_old_articles(): # Parse config file def load_config(filepath): - global base_directory, max_age, datetime_format, postprocessor, fileending, feeds + global base_directory, max_age, datetime_format, postprocessor, fileending, filters, feeds try: config = toml.load(filepath) @@ -303,6 +312,7 @@ def load_config(filepath): datetime_format = config['datetime_format'] postprocessor = config['postprocessor'] fileending = config['fileending'] + filters = config['filters'] feeds = config['feed'] except Exception as e: error('while parsing config: {}'.format(e)) From 1ddc57c30a68aced8a456b136ea557cc821204fc Mon Sep 17 00:00:00 2001 From: Denis Lehmann Date: Sun, 26 Apr 2020 20:04:13 +0200 Subject: [PATCH 07/10] update features --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 66aa225..143676f 100644 --- a/README.md +++ b/README.md @@ -20,6 +20,7 @@ Use at your own risk! - Store articles in categories - Delete articles after a few days +- Filter articles, you're not interested in - Distinguish __new__ from __read__ articles - Store __loved__ articles forever - OPML import From 3d9531eb8dc090aa0a450272b9499bdb29ec4023 Mon Sep 17 00:00:00 2001 From: Denis Lehmann Date: Sun, 26 Apr 2020 20:29:03 +0200 Subject: [PATCH 08/10] add comma --- README.md | 6 +++--- config.toml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 143676f..4b7b3ee 100644 --- a/README.md +++ b/README.md @@ -67,6 +67,9 @@ base_directory = '/home//rss' # Articles older than max_age (days) will be deleted and not be added. max_age = 30 +# Date and time format as strftime, to be included in the articles. +datetime_format = '%d.%m.%Y %H:%M' + # Postprocessing command of the articles. 
The article is written to stdin in HTML format and read from stdout. postprocessor = 'pandoc -f html -t markdown_strict-raw_html --reference-links --reference-location=document' @@ -77,9 +80,6 @@ fileending = 'md' # E.g. if you wan't to skip news about RSS explicitly, add '(\W|^)rss(\W|$)'. filters = [] -# Date and time format as strftime to be included in the articles. -datetime_format = '%d.%m.%Y %H:%M' - # Feeds # The category can be empty (''). The feed fill then be stored in the base_directory. # The category can also be a path, which will result in subdirectories (e.g. 'technology/hardware'). diff --git a/config.toml b/config.toml index 7d3de98..f7780c6 100644 --- a/config.toml +++ b/config.toml @@ -4,7 +4,7 @@ base_directory = '/home//rss' # Articles older than max_age (days) will be deleted and not be added. max_age = 30 -# Date and time format as strftime to be included in the articles. +# Date and time format as strftime, to be included in the articles. datetime_format = '%d.%m.%Y %H:%M' # Postprocessing command of the articles. The article is written to stdin in HTML format and read from stdout. From 5726c201b955a1b5dd4f1d3e956fb9afb1df18d6 Mon Sep 17 00:00:00 2001 From: Denis Lehmann Date: Sat, 29 Aug 2020 11:45:59 +0200 Subject: [PATCH 09/10] Blacken code --- spiderss.py | 250 ++++++++++++++++++++++++++++++---------------------- 1 file changed, 146 insertions(+), 104 deletions(-) diff --git a/spiderss.py b/spiderss.py index b30ede0..0c64051 100755 --- a/spiderss.py +++ b/spiderss.py @@ -8,7 +8,6 @@ import re import requests import subprocess import sys -import time import toml from bs4 import BeautifulSoup from datetime import datetime, timedelta @@ -16,24 +15,25 @@ from readability import Document from time import mktime from urllib.parse import urlsplit, urlunsplit -''' +""" Output functions -''' +""" + # Print log message -def log(text, force = False): +def log(text, force=False): if verbose or force: - print('{} | {}'.format(datetime.now().strftime('%d.%m %H:%M'), text)) + print("{} | {}".format(datetime.now().strftime("%d.%m %H:%M"), text)) # Print error message and exit def error(text): - print('{} E {}'.format(datetime.now().strftime('%d.%m %H:%M'), text)) + print("{} E {}".format(datetime.now().strftime("%d.%m %H:%M"), text)) # Print spiderss logo def print_logo(): - logo = ''' + logo = """ ;: .N' ,K: ,O .0MWx' lk 0; @@ -46,25 +46,27 @@ def print_logo(): ;kddkNKl. XMNkWk, :N0; .'cOW0c. ,lOW0; .:0Nl. okddOW0:. 
.kdoxXNd, WMX ;..cc - ''' + """ print(logo) -''' -Utility functions -''' -# Get articles of a feed +""" +Utility functions +""" + + +# Get articles of a feed def get_articles(feed): - feed = feedparser.parse(feed['url']) + feed = feedparser.parse(feed["url"]) return feed.entries # Write text to file def write_to_file(filepath, text): - file = open(filepath, 'w') + file = open(filepath, "w") file.write(text) file.close() @@ -73,23 +75,25 @@ def write_to_file(filepath, text): def get_filename_postfix(title): # Get title as lowercase words concatenated with underscores - title = re.sub('[^A-Za-z0-9 ]+', '', title.lower()) - title = re.sub(' ', '_', title) - - return '{}.{}'.format(title, fileending) + title = re.sub("[^A-Za-z0-9 ]+", "", title.lower()) + title = re.sub(" ", "_", title) + + return "{}.{}".format(title, fileending) # Get HTML image snippet from the first image url in a text def get_image_snippet(text): - + try: - image_url = re.search('(?Phttps?://\S+(\.png|\.jpg|\.jpeg))', text, re.IGNORECASE).group('image') + image_url = re.search( + "(?Phttps?://\S+(\.png|\.jpg|\.jpeg))", text, re.IGNORECASE + ).group("image") return 'Image\n\n'.format(image_url) - except: - return '' + except Exception: + return "" -# Get HTML summary snippet from a HTML text +# Get HTML summary snippet from a HTML text def get_summary_snippet(text): try: @@ -98,22 +102,24 @@ def get_summary_snippet(text): h.ignore_links = True h.ignore_images = True h.body_width = 0 - summary = h.handle(text).split('\n\n')[0].strip() - return '

<p>{}</p>
\n\n'.format(summary) - except: - return '' + summary = h.handle(text).split("\n\n")[0].strip() + return "

<p>{}</p>
\n\n".format(summary) + except Exception: + return "" # Get article body either from web or its content def get_article_body(article, feed): - body = '' + body = "" # If scrape, get article with readability - if feed['scrape']: + if feed["scrape"]: - headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36'} - response = requests.get(article.link, headers = headers) + headers = { + "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36" + } + response = requests.get(article.link, headers=headers) doc = Document(response.text) body = doc.summary() @@ -121,46 +127,58 @@ def get_article_body(article, feed): else: # Add all content to body - if hasattr(article, 'content'): + if hasattr(article, "content"): for c in article.content: - if c.type == 'text/html' or c.type == 'text/plain': + if c.type == "text/html" or c.type == "text/plain": body += c.value # Use summary as fallback - elif hasattr(article, 'summary'): + elif hasattr(article, "summary"): body += article.summary # Replace relative links with absolute ones, using beautifulsoup try: splitted_url = urlsplit(article.link) - except: - splitted_url = urlsplit(feed['url']) - - soup = BeautifulSoup(body, features = 'lxml') + except Exception: + splitted_url = urlsplit(feed["url"]) - for img in soup.find_all('img', src = True): - src = img.get('src') + soup = BeautifulSoup(body, features="lxml") + + for img in soup.find_all("img", src=True): + src = img.get("src") splitted_src = urlsplit(src) - constructed_src = [splitted_src.scheme, splitted_src.netloc, splitted_src.path, splitted_src.query, splitted_src.fragment] - if constructed_src[0] == '': + constructed_src = [ + splitted_src.scheme, + splitted_src.netloc, + splitted_src.path, + splitted_src.query, + splitted_src.fragment, + ] + if constructed_src[0] == "": constructed_src[0] = splitted_url.scheme - if constructed_src[1] == '': + if constructed_src[1] == "": constructed_src[1] = splitted_url.netloc new_src = urlunsplit(constructed_src) - if new_src.startswith('http'): + if new_src.startswith("http"): body = body.replace('"{}"'.format(src), '"{}"'.format(new_src), 1) - - for a in soup.find_all('a', href = True): - href = a.get('href') + + for a in soup.find_all("a", href=True): + href = a.get("href") splitted_href = urlsplit(href) - constructed_href = [splitted_href.scheme, splitted_href.netloc, splitted_href.path, splitted_href.query, splitted_href.fragment] - if constructed_href[0] == '': + constructed_href = [ + splitted_href.scheme, + splitted_href.netloc, + splitted_href.path, + splitted_href.query, + splitted_href.fragment, + ] + if constructed_href[0] == "": constructed_href[0] = splitted_url.scheme - if constructed_href[1] == '': + if constructed_href[1] == "": constructed_href[1] = splitted_url.netloc new_href = urlunsplit(constructed_href) - if new_href.startswith('http'): + if new_href.startswith("http"): body = body.replace('"{}"'.format(href), '"{}"'.format(new_href), 1) - + return body @@ -168,12 +186,17 @@ def get_article_body(article, feed): def postprocess(text): try: - processor = subprocess.Popen(postprocessor.split(' '), stdin = subprocess.PIPE, stdout = subprocess.PIPE, stderr = subprocess.PIPE) - (output, err) = processor.communicate(input = text.encode()) + processor = subprocess.Popen( + postprocessor.split(" "), + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + (output, err) = 
processor.communicate(input=text.encode()) if err: raise Exception(err.decode().strip()) except Exception as e: - error(' while postprocessing: {}'.format(e)) + error(" while postprocessing: {}".format(e)) sys.exit(1) return output.decode().strip() @@ -187,25 +210,29 @@ def get_article(article, feed): # Construct head of article image = get_image_snippet(str(article)) - if image == '': + if image == "": image = get_image_snippet(body) summary = get_summary_snippet(article.summary) - if summary == '': + if summary == "": summary = get_summary_snippet(body) try: - date = datetime.fromtimestamp(mktime(article.published_parsed)).strftime(datetime_format) - except: + date = datetime.fromtimestamp(mktime(article.published_parsed)).strftime( + datetime_format + ) + except Exception: date = datetime.now().strftime(datetime_format) try: link = article.link - except: - splitted_url = urlsplit(feed['url']) - splitted_link = [splitted_url.scheme, splitted_url.netloc, '', '', ''] - link = urlunsplit(splitted_link) - head = '

<h1>{}</h1>\n\n{}{}<p>{} - <a href={}>Link</a></p>
'.format(article.title, image, summary, date, link) + except Exception: + splitted_url = urlsplit(feed["url"]) + splitted_link = [splitted_url.scheme, splitted_url.netloc, "", "", ""] + link = urlunsplit(splitted_link) + head = "

<h1>{}</h1>\n\n{}{}<p>{} - <a href={}>Link</a></p>
".format( + article.title, image, summary, date, link + ) # Postprocess article - article_text = postprocess('{}\n\n
\n\n{}'.format(head, body)).strip() + article_text = postprocess("{}\n\n
\n\n{}".format(head, body)).strip() return article_text @@ -213,37 +240,39 @@ def get_article(article, feed): # Update feed def update_feed(feed): - log(' updating feed "{}"'.format(feed['name'])) + log(' updating feed "{}"'.format(feed["name"])) # Set feedpaths - feedpath_new = os.path.join(base_directory, feed['category'], feed['name'], 'new') - feedpath_read = os.path.join(base_directory, feed['category'], feed['name'], 'read') - + feedpath_new = os.path.join(base_directory, feed["category"], feed["name"], "new") + feedpath_read = os.path.join(base_directory, feed["category"], feed["name"], "read") + if not os.path.exists(feedpath_new): os.makedirs(feedpath_new) - + if not os.path.exists(feedpath_read): os.makedirs(feedpath_read) # Get exisiting articles - existing_articles = os.listdir(feedpath_new) + os.listdir(feedpath_read) + os.listdir(lovedpath) + existing_articles = ( + os.listdir(feedpath_new) + os.listdir(feedpath_read) + os.listdir(lovedpath) + ) # Update articles articles = get_articles(feed) - threshold_date = datetime.now() - timedelta(days = max_age) + threshold_date = datetime.now() - timedelta(days=max_age) if len(articles) == 0: - error('no articles received from feed "{}"'.format(feed['name'])) - + error('no articles received from feed "{}"'.format(feed["name"])) + for a in articles: - + try: # Set fallback if no parseable date found fallback = False try: date = datetime.fromtimestamp(mktime(a.published_parsed)) - except: + except Exception: date = datetime.now() fallback = True @@ -259,9 +288,9 @@ def update_feed(feed): if not filter: # Construct filename - filename_prefix = date.strftime('%Y%m%d%H%M') + filename_prefix = date.strftime("%Y%m%d%H%M") filename_postfix = get_filename_postfix(a.title) - filename = '{}_{}'.format(filename_prefix, filename_postfix) + filename = "{}_{}".format(filename_prefix, filename_postfix) # Check if article exists article_exists = False @@ -278,26 +307,30 @@ def update_feed(feed): log(' added article "{}"'.format(a.title)) except Exception as e: - error('while parsing article "{}" from feed "{}": {}'.format(a.title, feed['name'], e)) + error( + 'while parsing article "{}" from feed "{}": {}'.format( + a.title, feed["name"], e + ) + ) # Delete articles older than max_age def remove_old_articles(): - threshold_date = datetime.now() - timedelta(days = max_age) + threshold_date = datetime.now() - timedelta(days=max_age) count = 0 - + for subdir, dirs, files in os.walk(base_directory): # Skip 'loved' directory - if not os.path.join(base_directory, 'loved') in subdir: + if not os.path.join(base_directory, "loved") in subdir: for file in files: - date = datetime.strptime(file[:12], '%Y%m%d%H%M') - if threshold_date > date: - os.remove(os.path.join(subdir, file)) - count += 1 + date = datetime.strptime(file[:12], "%Y%m%d%H%M") + if threshold_date > date: + os.remove(os.path.join(subdir, file)) + count += 1 - log(' removed {} articles'.format(count)) + log(" removed {} articles".format(count)) # Parse config file @@ -307,15 +340,15 @@ def load_config(filepath): try: config = toml.load(filepath) - base_directory = config['base_directory'] - max_age = config['max_age'] - datetime_format = config['datetime_format'] - postprocessor = config['postprocessor'] - fileending = config['fileending'] - filters = config['filters'] - feeds = config['feed'] + base_directory = config["base_directory"] + max_age = config["max_age"] + datetime_format = config["datetime_format"] + postprocessor = config["postprocessor"] + fileending = config["fileending"] + 
filters = config["filters"] + feeds = config["feed"] except Exception as e: - error('while parsing config: {}'.format(e)) + error("while parsing config: {}".format(e)) sys.exit(1) @@ -325,7 +358,7 @@ def initialize(): global lovedpath # Create 'loved' directory if not existent - lovedpath = os.path.join(base_directory, 'loved') + lovedpath = os.path.join(base_directory, "loved") if not os.path.exists(lovedpath): os.makedirs(lovedpath) @@ -333,25 +366,34 @@ def initialize(): # Update all feeds and remove old articles def crawl(): - log('crawling feeds', True) + log("crawling feeds", True) for feed in feeds: update_feed(feed) - log('removing old articles', True) + log("removing old articles", True) remove_old_articles() -''' + +""" Main -''' +""" + def main(): global verbose # Initialize parser - parser = argparse.ArgumentParser(description = 'Crawl RSS feeds and store articles as Markdown files.') - parser.add_argument('-v', '--verbose', action = 'store_true', help = 'verbose output') - parser.add_argument('-c', '--config', default = './config.toml', help = 'config file (default: ./config.toml)') + parser = argparse.ArgumentParser( + description="Crawl RSS feeds and store articles as Markdown files." + ) + parser.add_argument("-v", "--verbose", action="store_true", help="verbose output") + parser.add_argument( + "-c", + "--config", + default="./config.toml", + help="config file (default: ./config.toml)", + ) # Get args args = parser.parse_args() @@ -365,5 +407,5 @@ def main(): crawl() -if __name__ == '__main__': +if __name__ == "__main__": main() From de2782117a76584d7a5437c7e406112b95687302 Mon Sep 17 00:00:00 2001 From: Denis Lehmann Date: Thu, 11 Mar 2021 18:52:46 +0100 Subject: [PATCH 10/10] update Nix file --- default.nix | 21 --------------------- shell.nix | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 21 deletions(-) delete mode 100644 default.nix create mode 100644 shell.nix diff --git a/default.nix b/default.nix deleted file mode 100644 index 1c000b4..0000000 --- a/default.nix +++ /dev/null @@ -1,21 +0,0 @@ -with import {}; - -stdenv.mkDerivation { - name = "myPythonEnv"; - buildInputs = with pkgs; [ - python37Full - python37Packages.virtualenv - pandoc - ]; - src = null; - shellHook = '' - if [ ! -d .venv ]; then - python -m venv .venv - fi - source .venv/bin/activate - pip install --upgrade pip - if [ -s requirements.txt ]; then - pip install -r requirements.txt - fi - ''; -} diff --git a/shell.nix b/shell.nix new file mode 100644 index 0000000..eb0e5d6 --- /dev/null +++ b/shell.nix @@ -0,0 +1,38 @@ +{ pkgs ? import {} }: +pkgs.mkShell { + name = "python-environment"; + buildInputs = with pkgs; [ + pandoc + python3 + python3Packages.virtualenv + ]; + shellHook = '' + function log_header { + echo -ne "==> \e[32m\e[1m$1\e[0m\n\n" + } + function log_subheader { + echo -ne "--> \e[33m\e[1m$1\e[0m\n\n" + } + function log { + echo -ne " $1\n" + } + + echo "" + log_header "python_environment" + if [ ! -d .venv ]; then + python -m venv .venv + fi + source .venv/bin/activate + log_subheader "upgrading pip" + pip install --upgrade pip + echo "" + if [ -s requirements.txt ]; then + log_subheader "found requirements.txt, installing packages" + pip install -r requirements.txt + echo "" + fi + log_header "package versions" + log "$(python --version)" + log "$(pip --version)" + ''; +}
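
Taken together, the scraping path that patches 01, 05 and 09 keep touching reduces to fetching the page with a browser-like User-Agent and handing the HTML to readability. A minimal sketch of that flow (the article URL is a placeholder; the real get_article_body additionally rewrites relative links with BeautifulSoup and pipes the result through the configured postprocessor):

    import requests
    from readability import Document

    # Browser-like User-Agent string copied from patch 01.
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                             '(KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36'}

    # Placeholder URL, for illustration only.
    response = requests.get('https://example.com/some-article', headers=headers)

    # readability extracts the main article content as an HTML fragment.
    doc = Document(response.text)
    body = doc.summary()
    print(doc.title())
    print(body[:200])
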