add config and rename main.py to spiderrss.py

Denis Lehmann 2020-04-14 03:20:10 +02:00
parent 66d9ba1fb5
commit 2e7e3da309
3 changed files with 81 additions and 26 deletions

config.py (new file)

@@ -0,0 +1,17 @@
+# The base directory where the feeds are stored
+base_directory = '/home/denis/spiderrss'
+
+# Update interval in minutes
+update_interval = 15
+
+# Articles older than max_age days will be deleted and will not be added
+max_age = 365
+
+# Enable verbose output
+verbose = True
+
+# Feeds in the form (category, name, url) - the category can be empty ('')
+feeds = [
+    ('News', 'Tagesschau', 'https://www.tagesschau.de/xml/rss2'),
+    ('News', 'Vice', 'https://www.vice.com/de/rss'),
+]
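The new spiderrss.py pulls these names in via a plain import (visible in the diff below). A minimal sketch of how the feed tuples are meant to be consumed, assuming config.py sits next to the script:

    from config import feeds

    # Each feed is a (category, name, url) tuple; category may be ''
    for category, name, url in feeds:
        print('{} / {} -> {}'.format(category or 'uncategorized', name, url))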

requirements.txt

@@ -1,3 +1,4 @@
 feedparser
 readability-lxml
 requests
+html2text
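html2text is the one new dependency; the script's html_to_markdown simply wraps its top-level helper. A minimal sketch (the sample HTML is only illustrative):

    import html2text

    # Convert article HTML to Markdown, as html_to_markdown does below
    html = '<h1>Hello</h1><p>Some <b>bold</b> text.</p>'
    print(html2text.html2text(html))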

main.py → spiderrss.py (renamed)

@@ -4,16 +4,21 @@ import requests
 import html2text
 import re
 import os
+import time
 from time import mktime
 from datetime import datetime, timedelta
+from config import base_directory, update_interval, max_age, verbose, feeds
+import logging
+import sys, getopt
 
-feeds = [('News', 'Tagesschau', 'https://www.tagesschau.de/xml/rss2'),
-         ('Linux', 'NixOS', 'https://nixos.org/blogs.xml'),
-         ('News', 'Vice', 'https://www.vice.com/de/rss')
-         ]
-out_directory = './out'
-delta = 365
+def log(text):
+    if verbose:
+        #logging.info(text)
+        print('{} - {}'.format(datetime.now().strftime('%d.%m %H:%M'), text))
+
+def error(text):
+    #logging.error(text)
+    print('{} - ERROR: {}'.format(datetime.now().strftime('%d.%m %H:%M'), text))
 
 # Get content of a webpage
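With verbose = True, the log and error helpers above produce timestamped lines like these (illustrative output, '%d.%m %H:%M' format):

    14.04 03:20 - updating feed "Tagesschau"
    14.04 03:20 - ERROR: failed to get feed "Vice": <error message>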
@@ -27,10 +32,14 @@ def html_to_markdown(html):
     return html2text.html2text(html)
 
-# Get articles of a RSS feed
-def get_articles(url):
-    feed = feedparser.parse(url)
-    return feed.entries
+# Get articles of a feed
+def get_articles(feed):
+    try:
+        feed = feedparser.parse(feed[2])
+        return feed.entries
+    except Exception as e:
+        error('failed to get feed "{}": {}'.format(feed[1], e))
+        return []
 
 def write_to_file(filename, text):
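For context, feedparser.parse returns an object whose entries carry title, link and published_parsed (a time.struct_time), which update_feed relies on below. A minimal sketch using a feed URL from the config:

    import feedparser
    from time import mktime
    from datetime import datetime

    d = feedparser.parse('https://www.tagesschau.de/xml/rss2')
    for entry in d.entries[:3]:
        # published_parsed is a time.struct_time; convert it as update_feed does
        published = datetime.fromtimestamp(mktime(entry.published_parsed))
        print(published, entry.title, entry.link)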
@@ -54,20 +63,21 @@ def get_filename(date, title):
 
 # Update feed
 def update_feed(feed):
     category = feed[0]
     name = feed[1]
-    url = feed[2]
 
-    feedpath_new = os.path.join(out_directory, category, name, 'new')
-    feedpath_read = os.path.join(out_directory, category, name, 'read')
+    log('updating feed "{}"'.format(name))
+
+    feedpath_new = os.path.join(base_directory, category, name, 'new')
+    feedpath_read = os.path.join(base_directory, category, name, 'read')
 
     if not os.path.exists(feedpath_new):
         os.makedirs(feedpath_new)
     if not os.path.exists(feedpath_read):
         os.makedirs(feedpath_read)
 
-    articles = get_articles(url)
-    threshold_date = datetime.now() - timedelta(days = delta)
+    articles = get_articles(feed)
+    threshold_date = datetime.now() - timedelta(days = max_age)
 
     for a in articles:
         date = datetime.fromtimestamp(mktime(a.published_parsed))
         if date > threshold_date:
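With the example config, update_feed lays articles out on disk roughly like this; filenames begin with a '%Y%m%d%H%M' timestamp (that is what delete_old_articles parses back out), and 'read' is presumably where read articles get moved:

    /home/denis/spiderrss/
    ├── loved/
    └── News/
        ├── Tagesschau/
        │   ├── new/
        │   └── read/
        └── Vice/
            ├── new/
            └── read/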
@@ -77,31 +87,58 @@ def update_feed(feed):
             write_to_file(os.path.join(feedpath_new, filename), text)
 
-# Delete articles older than day delta
+# Delete articles older than max_age
 def delete_old_articles():
-    threshold_date = datetime.now() - timedelta(days = delta)
-    for subdir, dirs, files in os.walk(out_directory):
+    threshold_date = datetime.now() - timedelta(days = max_age)
+    for subdir, dirs, files in os.walk(base_directory):
         # Skip 'loved' directory
-        if not os.path.join(out_directory, 'loved') in subdir:
+        if not os.path.join(base_directory, 'loved') in subdir:
             for file in files:
                 date = datetime.strptime(file[:12], '%Y%m%d%H%M')
                 if threshold_date > date:
                     os.remove(os.path.join(subdir, file))
+    log('deleted old articles')
 
-def main():
-    lovedpath = os.path.join(out_directory, 'loved')
+def initialize():
+    # Create 'loved' directory if not existent
+    lovedpath = os.path.join(base_directory, 'loved')
     if not os.path.exists(lovedpath):
         os.makedirs(lovedpath)
-    for feed in feeds:
-        update_feed(feed)
-    delete_old_articles()
+
+def crawl():
+    # Main loop
+    while True:
+        for feed in feeds:
+            update_feed(feed)
+        delete_old_articles()
+        time.sleep(update_interval * 60)
+
+def get_help_message():
+    return 'spiderrss.py | run'
+
+def main(argv):
+    # Get arguments
+    try:
+        opts, args = getopt.getopt(argv, 'h', ['ifile=', 'ofile='])
+    except getopt.GetoptError:
+        print('spiderrss.py [ run | create_config <file> ]')
+        sys.exit(2)
+    for opt, arg in opts:
+        if opt == '-h':
+            print(get_help_message())
+    #initialize()
+    #crawl()
 
 if __name__ == '__main__':
-    main()
+    main(sys.argv[1:])
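Since initialize() and crawl() are still commented out, the renamed script currently only reacts to -h (a hypothetical session; the run subcommand is not wired up yet in this commit):

    $ python spiderrss.py -h
    spiderrss.py | run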