Compare commits

...

10 commits

5 changed files with 277 additions and 152 deletions


@@ -20,6 +20,7 @@ Use at your own risk!
 - Store articles in categories
 - Delete articles after a few days
+- Filter articles, you're not interested in
 - Distinguish __new__ from __read__ articles
 - Store __loved__ articles forever
 - OPML import
@@ -66,14 +67,18 @@ base_directory = '/home/<user>/rss'
 # Articles older than max_age (days) will be deleted and not be added.
 max_age = 30
+# Date and time format as strftime, to be included in the articles.
+datetime_format = '%d.%m.%Y %H:%M'
 # Postprocessing command of the articles. The article is written to stdin in HTML format and read from stdout.
 postprocessor = 'pandoc -f html -t markdown_strict-raw_html --reference-links --reference-location=document'
 # Fileending for the article files.
 fileending = 'md'
-# Date and time format as strftime to be included in the articles.
-datetime_format = '%d.%m.%Y %H:%M'
+# List of regular expression strings. If any of these matches an lowercase article title, the article won't be saved.
+# E.g. if you wan't to skip news about RSS explicitly, add '(\W|^)rss(\W|$)'.
+filters = []
 # Feeds
 # The category can be empty (''). The feed fill then be stored in the base_directory.
@@ -153,6 +158,6 @@ Just synchronize the base_directory with [Syncthing](https://syncthing.net/), [r
 ## Acknowledgements
-Thanks to all the people, who created the nice libraries this project in based on.
+Thanks to all the people who created the nice software, this project in based on.
 And also thanks to Dieter Steffmann who created the Canterbury font, which is used for the logo.
 You can find it in the `fonts/` directory.


@@ -4,7 +4,7 @@ base_directory = '/home/<user>/rss'
 # Articles older than max_age (days) will be deleted and not be added.
 max_age = 30
-# Date and time format as strftime to be included in the articles.
+# Date and time format as strftime, to be included in the articles.
 datetime_format = '%d.%m.%Y %H:%M'
 # Postprocessing command of the articles. The article is written to stdin in HTML format and read from stdout.
@@ -13,6 +13,10 @@ postprocessor = 'pandoc -f html -t markdown_strict-raw_html+pipe_tables --refere
 # Fileending for the article files.
 fileending = 'md'
+# List of regular expression strings. If any of these matches an lowercase article title, the article won't be saved.
+# E.g. if you wan't to skip news about RSS explicitly, add '(\W|^)rss(\W|$)'.
+filters = []
 # Feeds
 # The category can be empty (''). The feed fill then be stored in the base_directory.
 # The category can also be a path, which will result in subdirectories (e.g. 'technology/hardware').
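For reference, a minimal standalone sketch of how the new `filters` option behaves, mirroring the `re.search(f, a.title.lower())` check added further down in this diff. The patterns and titles here are made-up examples, not part of the commits:

import re

# Hypothetical patterns, matched against the lowercased article title.
filters = [r'(\W|^)rss(\W|$)', r'sponsored']

def is_filtered(title):
    return any(re.search(f, title.lower()) for f in filters)

print(is_filtered("Why RSS still matters"))   # True
print(is_filtered("Sponsored: new gadgets"))  # True
print(is_filtered("Kernel 5.8 released"))     # False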


@@ -1,21 +0,0 @@
-with import <nixpkgs> {};
-
-stdenv.mkDerivation {
-  name = "myPythonEnv";
-  buildInputs = with pkgs; [
-    python37Full
-    python37Packages.virtualenv
-    pandoc
-  ];
-  src = null;
-  shellHook = ''
-    if [ ! -d .venv ]; then
-      python -m venv .venv
-    fi
-    source .venv/bin/activate
-    pip install --upgrade pip
-    if [ -s requirements.txt ]; then
-      pip install -r requirements.txt
-    fi
-  '';
-}

shell.nix (new file, 38 lines)

@@ -0,0 +1,38 @@
+{ pkgs ? import <nixpkgs> {} }:
+
+pkgs.mkShell {
+  name = "python-environment";
+  buildInputs = with pkgs; [
+    pandoc
+    python3
+    python3Packages.virtualenv
+  ];
+  shellHook = ''
+    function log_header {
+      echo -ne "==> \e[32m\e[1m$1\e[0m\n\n"
+    }
+    function log_subheader {
+      echo -ne "--> \e[33m\e[1m$1\e[0m\n\n"
+    }
+    function log {
+      echo -ne " $1\n"
+    }
+    echo ""
+    log_header "python_environment"
+    if [ ! -d .venv ]; then
+      python -m venv .venv
+    fi
+    source .venv/bin/activate
+    log_subheader "upgrading pip"
+    pip install --upgrade pip
+    echo ""
+    if [ -s requirements.txt ]; then
+      log_subheader "found requirements.txt, installing packages"
+      pip install -r requirements.txt
+      echo ""
+    fi
+    log_header "package versions"
+    log "$(python --version)"
+    log "$(pip --version)"
+  '';
+}


@@ -8,7 +8,6 @@ import re
 import requests
 import subprocess
 import sys
-import time
 import toml
 from bs4 import BeautifulSoup
 from datetime import datetime, timedelta
@@ -16,24 +15,25 @@ from readability import Document
 from time import mktime
 from urllib.parse import urlsplit, urlunsplit
-'''
+"""
 Output functions
-'''
+"""
 # Print log message
-def log(text, force = False):
+def log(text, force=False):
     if verbose or force:
-        print('{} | {}'.format(datetime.now().strftime('%d.%m %H:%M'), text))
+        print("{} | {}".format(datetime.now().strftime("%d.%m %H:%M"), text))
 # Print error message and exit
 def error(text):
-    print('{} E {}'.format(datetime.now().strftime('%d.%m %H:%M'), text))
+    print("{} E {}".format(datetime.now().strftime("%d.%m %H:%M"), text))
 # Print spiderss logo
 def print_logo():
-    logo = '''
+    logo = """
 ;:
 .N' ,K:
 ,O .0MWx' lk 0;
@@ -46,54 +46,55 @@ def print_logo():
 ;kddkNKl. XMNkWk, :N0; .'cOW0c. ,lOW0; .:0Nl. okddOW0:. .kdoxXNd,
 WMX
 ;..cc
-    '''
+    """
     print(logo)
-'''
-Utility functions
-'''
-# Get articles of a feed
+"""
+Utility functions
+"""
+# Get articles of a feed
 def get_articles(feed):
-    feed = feedparser.parse(feed['url'])
+    feed = feedparser.parse(feed["url"])
     return feed.entries
 # Write text to file
 def write_to_file(filepath, text):
-    file = open(filepath, 'w')
+    file = open(filepath, "w")
     file.write(text)
     file.close()
-# Get filename from a date and a title
-def get_filename(date, title):
-    # Get date as single block
-    date = date.strftime('%Y%m%d%H%M')
+# Get filename postfix from a title
+def get_filename_postfix(title):
     # Get title as lowercase words concatenated with underscores
-    title = re.sub('[^A-Za-z0-9 ]+', '', title.lower())
-    title = re.sub(' ', '_', title)
-    return '{}_{}.{}'.format(date, title, fileending)
+    title = re.sub("[^A-Za-z0-9 ]+", "", title.lower())
+    title = re.sub(" ", "_", title)
+    return "{}.{}".format(title, fileending)
-# If scraped, use first content image as fallback
-# Get image snippet for an article
-def get_article_image(article):
+# Get HTML image snippet from the first image url in a text
+def get_image_snippet(text):
     try:
-        image_url = re.search('(?P<image>https?://\S+(\.png|\.jpg|\.jpeg))', str(article), re.IGNORECASE).group('image')
+        image_url = re.search(
+            "(?P<image>https?://\S+(\.png|\.jpg|\.jpeg))", text, re.IGNORECASE
+        ).group("image")
         return '<img src="{}" alt="Image">\n\n'.format(image_url)
-    except:
-        return ''
+    except Exception:
+        return ""
-# Get summary snippet for an article
-def get_article_summary(article):
+# Get HTML summary snippet from a HTML text
+def get_summary_snippet(text):
     try:
         h = html2text.HTML2Text()
@@ -101,60 +102,82 @@ def get_article_summary(article):
         h.ignore_links = True
         h.ignore_images = True
         h.body_width = 0
-        return '<p><b>{}</b></p>\n\n'.format(summary)
-    except:
-        return ''
+        summary = h.handle(text).split("\n\n")[0].strip()
+        return "<p><b>{}</b></p>\n\n".format(summary)
+    except Exception:
+        return ""
 # Get article body either from web or its content
-def get_article_body(article, scrape):
-    body = ''
-    # TODO: Include appropriate header?
+def get_article_body(article, feed):
+    body = ""
     # If scrape, get article with readability
-    if scrape:
-        response = requests.get(article.link)
+    if feed["scrape"]:
+        headers = {
+            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36"
+        }
+        response = requests.get(article.link, headers=headers)
         doc = Document(response.text)
         body = doc.summary()
-        # Replace relative site links with absolute ones, using beautifulsoup
-        splitted_url = urlsplit(article.link)
-        soup = BeautifulSoup(body, features = 'lxml')
-        for img in soup.find_all('img', src = True):
-            src = img.get('src')
-            splitted_src = urlsplit(src)
-            constructed_src = [splitted_src.scheme, splitted_src.netloc, splitted_src.path, splitted_src.query, splitted_src.fragment]
-            if constructed_src[0] == '':
-                constructed_src[0] = splitted_url.scheme
-            if constructed_src[1] == '':
-                constructed_src[1] = splitted_url.netloc
-            new_src = urlunsplit(constructed_src)
-            if new_src.startswith('http'):
-                body = body.replace('"{}"'.format(src), '"{}"'.format(new_src), 1)
-        # TODO: catch mailto:
-        for a in soup.find_all('a', href = True):
-            href = a.get('href')
-            splitted_href = urlsplit(href)
-            constructed_href = [splitted_href.scheme, splitted_href.netloc, splitted_href.path, splitted_href.query, splitted_href.fragment]
-            if constructed_href[0] == '':
-                constructed_href[0] = splitted_url.scheme
-            if constructed_href[1] == '':
-                constructed_href[1] = splitted_url.netloc
-            new_href = urlunsplit(constructed_href)
-            if new_href.startswith('http'):
-                body = body.replace('"{}"'.format(href), '"{}"'.format(new_href), 1)
-    # Else construct from article content
+    # Else construct from article object
     else:
-        if hasattr(article, 'content'):
+        # Add all content to body
+        if hasattr(article, "content"):
             for c in article.content:
-                if c.type == 'text/html':
+                if c.type == "text/html" or c.type == "text/plain":
                     body += c.value
+        # Use summary as fallback
+        elif hasattr(article, "summary"):
+            body += article.summary
+    # Replace relative links with absolute ones, using beautifulsoup
+    try:
+        splitted_url = urlsplit(article.link)
+    except Exception:
+        splitted_url = urlsplit(feed["url"])
+    soup = BeautifulSoup(body, features="lxml")
+    for img in soup.find_all("img", src=True):
+        src = img.get("src")
+        splitted_src = urlsplit(src)
+        constructed_src = [
+            splitted_src.scheme,
+            splitted_src.netloc,
+            splitted_src.path,
+            splitted_src.query,
+            splitted_src.fragment,
+        ]
+        if constructed_src[0] == "":
+            constructed_src[0] = splitted_url.scheme
+        if constructed_src[1] == "":
+            constructed_src[1] = splitted_url.netloc
+        new_src = urlunsplit(constructed_src)
+        if new_src.startswith("http"):
+            body = body.replace('"{}"'.format(src), '"{}"'.format(new_src), 1)
+    for a in soup.find_all("a", href=True):
+        href = a.get("href")
+        splitted_href = urlsplit(href)
+        constructed_href = [
+            splitted_href.scheme,
+            splitted_href.netloc,
+            splitted_href.path,
+            splitted_href.query,
+            splitted_href.fragment,
+        ]
+        if constructed_href[0] == "":
+            constructed_href[0] = splitted_url.scheme
+        if constructed_href[1] == "":
+            constructed_href[1] = splitted_url.netloc
+        new_href = urlunsplit(constructed_href)
+        if new_href.startswith("http"):
+            body = body.replace('"{}"'.format(href), '"{}"'.format(new_href), 1)
     return body
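The hunk above moves the relative-to-absolute link rewriting out of the scrape-only branch so it now runs on every article body. A condensed, standalone sketch of the same urlsplit/urlunsplit technique; the page URL and HTML snippet are invented for illustration, and it assumes beautifulsoup4 and lxml, which the project already depends on:

from urllib.parse import urlsplit, urlunsplit
from bs4 import BeautifulSoup

page_url = "https://example.org/blog/post"
html = '<p><img src="/images/cat.png"><a href="//cdn.example.org/doc.pdf">doc</a></p>'

base = urlsplit(page_url)
soup = BeautifulSoup(html, features="lxml")
for tag, attr in (("img", "src"), ("a", "href")):
    for node in soup.find_all(tag, **{attr: True}):
        parts = list(urlsplit(node[attr]))
        if parts[0] == "":  # missing scheme
            parts[0] = base.scheme
        if parts[1] == "":  # missing host
            parts[1] = base.netloc
        node[attr] = urlunsplit(parts)
print(soup)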
@@ -163,32 +186,53 @@ def get_article_body(article, scrape):
 def postprocess(text):
     try:
-        processor = subprocess.Popen(postprocessor.split(' '), stdin = subprocess.PIPE, stdout = subprocess.PIPE, stderr = subprocess.PIPE)
-        (output, err) = processor.communicate(input = text.encode())
+        processor = subprocess.Popen(
+            postprocessor.split(" "),
+            stdin=subprocess.PIPE,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+        )
+        (output, err) = processor.communicate(input=text.encode())
         if err:
             raise Exception(err.decode().strip())
     except Exception as e:
-        error(' while postprocessing: {}'.format(e))
+        error(" while postprocessing: {}".format(e))
         sys.exit(1)
     return output.decode().strip()
 # Get constructed article
-def get_article(article, scrape):
-    # Construct head of article
-    image = get_article_image(article)
-    summary = get_article_summary(article)
-    #TODO: Current time as fallback?
-    date = datetime.fromtimestamp(mktime(article.published_parsed)).strftime(datetime_format)
-    head = '<h1>{}</h1>\n\n{}{}<p>{} - <a href={}>Link</a></p>'.format(article.title, image, summary, date, article.link)
+def get_article(article, feed):
     # Get body of article
-    body = get_article_body(article, scrape)
+    body = get_article_body(article, feed)
+    # Construct head of article
+    image = get_image_snippet(str(article))
+    if image == "":
+        image = get_image_snippet(body)
+    summary = get_summary_snippet(article.summary)
+    if summary == "":
+        summary = get_summary_snippet(body)
+    try:
+        date = datetime.fromtimestamp(mktime(article.published_parsed)).strftime(
+            datetime_format
+        )
+    except Exception:
+        date = datetime.now().strftime(datetime_format)
+    try:
+        link = article.link
+    except Exception:
+        splitted_url = urlsplit(feed["url"])
+        splitted_link = [splitted_url.scheme, splitted_url.netloc, "", "", ""]
+        link = urlunsplit(splitted_link)
+    head = "<h1>{}</h1>\n\n{}{}<p>{} - <a href={}>Link</a></p>".format(
+        article.title, image, summary, date, link
+    )
     # Postprocess article
-    article_text = postprocess('{}\n\n<hr>\n\n{}'.format(head, body)).strip()
+    article_text = postprocess("{}\n\n<hr>\n\n{}".format(head, body)).strip()
     return article_text
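The postprocess() helper above simply pipes the assembled HTML through the configured command via stdin/stdout. A hedged, standalone sketch of that pipe, handy for trying out a postprocessor string outside the crawler; it assumes pandoc is on PATH, and the sample HTML is made up:

import subprocess

# Same pandoc command as in the example config.
postprocessor = "pandoc -f html -t markdown_strict-raw_html --reference-links --reference-location=document"

def run_postprocessor(html):
    processor = subprocess.Popen(
        postprocessor.split(" "),
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    output, err = processor.communicate(input=html.encode())
    if err:
        raise RuntimeError(err.decode().strip())
    return output.decode().strip()

print(run_postprocessor('<h1>Title</h1><p>A <a href="https://example.org">link</a>.</p>'))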
@@ -196,105 +240,160 @@ def get_article(article, scrape):
 # Update feed
 def update_feed(feed):
-    log(' updating feed "{}"'.format(feed['name']))
+    log(' updating feed "{}"'.format(feed["name"]))
     # Set feedpaths
-    feedpath_new = os.path.join(base_directory, feed['category'], feed['name'], 'new')
-    feedpath_read = os.path.join(base_directory, feed['category'], feed['name'], 'read')
+    feedpath_new = os.path.join(base_directory, feed["category"], feed["name"], "new")
+    feedpath_read = os.path.join(base_directory, feed["category"], feed["name"], "read")
     if not os.path.exists(feedpath_new):
         os.makedirs(feedpath_new)
     if not os.path.exists(feedpath_read):
         os.makedirs(feedpath_read)
+    # Get exisiting articles
+    existing_articles = (
+        os.listdir(feedpath_new) + os.listdir(feedpath_read) + os.listdir(lovedpath)
+    )
     # Update articles
     articles = get_articles(feed)
-    threshold_date = datetime.now() - timedelta(days = max_age)
+    threshold_date = datetime.now() - timedelta(days=max_age)
+    if len(articles) == 0:
+        error('no articles received from feed "{}"'.format(feed["name"]))
     for a in articles:
         try:
-            date = datetime.fromtimestamp(mktime(a.published_parsed))
+            # Set fallback if no parseable date found
+            fallback = False
+            try:
+                date = datetime.fromtimestamp(mktime(a.published_parsed))
+            except Exception:
+                date = datetime.now()
+                fallback = True
             if date > threshold_date:
-                filename = get_filename(date, a.title)
-                if not os.path.exists(os.path.join(feedpath_new, filename)) and not os.path.exists(os.path.join(feedpath_read, filename)):
-                    text = get_article(a, feed['scrape'])
-                    write_to_file(os.path.join(feedpath_new, filename), text)
-                    log(' added article "{}"'.format(a.title))
+                # Check if article should be filtered
+                filter = False
+                for f in filters:
+                    if re.search(f, a.title.lower()):
+                        filter = True
+                        log(' filtered article "{}"'.format(a.title))
+                if not filter:
+                    # Construct filename
+                    filename_prefix = date.strftime("%Y%m%d%H%M")
+                    filename_postfix = get_filename_postfix(a.title)
+                    filename = "{}_{}".format(filename_prefix, filename_postfix)
+                    # Check if article exists
+                    article_exists = False
+                    if fallback:
+                        existing_articles_fallback = [a[13:] for a in existing_articles]
+                        if filename_postfix in existing_articles_fallback:
+                            article_exists = True
+                    elif filename in existing_articles:
+                        article_exists = True
+                    if not article_exists:
+                        text = get_article(a, feed)
+                        write_to_file(os.path.join(feedpath_new, filename), text)
+                        log(' added article "{}"'.format(a.title))
         except Exception as e:
-            error('while parsing feed article "{}" from feed "{}": {}'.format(a.title, feed['name'], e))
+            error(
+                'while parsing article "{}" from feed "{}": {}'.format(
+                    a.title, feed["name"], e
+                )
+            )
 # Delete articles older than max_age
 def remove_old_articles():
-    threshold_date = datetime.now() - timedelta(days = max_age)
+    threshold_date = datetime.now() - timedelta(days=max_age)
     count = 0
     for subdir, dirs, files in os.walk(base_directory):
         # Skip 'loved' directory
-        if not os.path.join(base_directory, 'loved') in subdir:
+        if not os.path.join(base_directory, "loved") in subdir:
             for file in files:
-                date = datetime.strptime(file[:12], '%Y%m%d%H%M')
+                date = datetime.strptime(file[:12], "%Y%m%d%H%M")
                 if threshold_date > date:
                     os.remove(os.path.join(subdir, file))
                     count += 1
-    log(' removed {} articles'.format(count))
+    log(" removed {} articles".format(count))
 # Parse config file
 def load_config(filepath):
-    global base_directory, max_age, datetime_format, postprocessor, fileending, feeds
+    global base_directory, max_age, datetime_format, postprocessor, fileending, filters, feeds
     try:
         config = toml.load(filepath)
-        base_directory = config['base_directory']
-        max_age = config['max_age']
-        datetime_format = config['datetime_format']
-        postprocessor = config['postprocessor']
-        fileending = config['fileending']
-        feeds = config['feed']
+        base_directory = config["base_directory"]
+        max_age = config["max_age"]
+        datetime_format = config["datetime_format"]
+        postprocessor = config["postprocessor"]
+        fileending = config["fileending"]
+        filters = config["filters"]
+        feeds = config["feed"]
     except Exception as e:
-        error('while parsing config: {}'.format(e))
+        error("while parsing config: {}".format(e))
         sys.exit(1)
 # Initialize spiderss
 def initialize():
+    global lovedpath
     # Create 'loved' directory if not existent
-    lovedpath = os.path.join(base_directory, 'loved')
+    lovedpath = os.path.join(base_directory, "loved")
     if not os.path.exists(lovedpath):
         os.makedirs(lovedpath)
-# Update all feeds and delete old messages
+# Update all feeds and remove old articles
 def crawl():
-    log('crawling feeds', True)
+    log("crawling feeds", True)
     for feed in feeds:
         update_feed(feed)
-    log('removing old articles', True)
+    log("removing old articles", True)
     remove_old_articles()
-'''
+"""
 Main
-'''
+"""
 def main():
     global verbose
     # Initialize parser
-    parser = argparse.ArgumentParser(description = 'Crawl RSS feeds and store articles as Markdown files.')
-    parser.add_argument('-v', '--verbose', action = 'store_true', help = 'verbose output')
-    parser.add_argument('-c', '--config', default = './config.toml', help = 'config file (default: ./config.toml)')
+    parser = argparse.ArgumentParser(
+        description="Crawl RSS feeds and store articles as Markdown files."
+    )
+    parser.add_argument("-v", "--verbose", action="store_true", help="verbose output")
+    parser.add_argument(
+        "-c",
+        "--config",
+        default="./config.toml",
+        help="config file (default: ./config.toml)",
+    )
     # Get args
     args = parser.parse_args()
@@ -308,5 +407,5 @@ def main():
     crawl()
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
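Finally, the update_feed() changes above replace the per-file os.path.exists() check with a lookup against the filenames already on disk, and fall back to comparing only the title part when a feed supplies no parseable date. A small sketch of that duplicate check, using an invented existing filename of the form '<YYYYmmddHHMM>_<slug>.md':

import re

fileending = "md"

def get_filename_postfix(title):
    # Same slug rule as in the diff: lowercase, strip punctuation, spaces -> underscores.
    title = re.sub("[^A-Za-z0-9 ]+", "", title.lower())
    title = re.sub(" ", "_", title)
    return "{}.{}".format(title, fileending)

existing_articles = ["202005171230_some_article.md"]  # invented example
postfix = get_filename_postfix("Some Article!")

# Fallback case (no feed date): drop the 13-character date prefix 'YYYYmmddHHMM_'
# and compare postfixes only, so a re-crawl does not store the article twice.
print(postfix in [name[13:] for name in existing_articles])  # True -> already stored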