change config format and add arguments

Denis Lehmann 2020-04-14 21:42:02 +02:00
parent 426e68dce7
commit e97a6bd09b
4 changed files with 95 additions and 63 deletions

config.py Deleted file
@@ -1,17 +0,0 @@
-# This defines the base directory for the feeds relative to this config file
-base_directory = '/home/denis/spiderrss'
-
-# Update interval in minutes
-update_interval = 15
-
-# Articles older than max_age will be deleted and not be added
-max_age = 365
-
-# Enable verbose output
-verbose = True
-
-# Feeds in the form (category, name, url) - the category can be empty ('')
-feeds = [
-    ('News', 'Tagesschau', 'https://www.tagesschau.de/xml/rss2'),
-    ('News', 'Vice', 'https://www.vice.com/de/rss'),
-]

config.toml Normal file
@@ -0,0 +1,17 @@
+# This defines the base directory for the feeds relative to this config file
+base_directory = '~/spiderss'
+
+# Articles older than max_age will be deleted and not be added
+max_age = 30
+
+# Feeds - the category can be empty ('')
+
+[[feed]]
+category = 'News'
+name = 'Tagesschau'
+url = 'https://www.tagesschau.de/xml/rss2'
+
+[[feed]]
+category = 'News'
+name = 'Vice'
+url = 'https://www.vice.com/de/rss'
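
For reference, each [[feed]] table parses into one dict in a plain list, which is exactly what the reworked update_feed() below indexes into. A minimal sketch, assuming the toml package added to requirements.txt and a config.toml in the working directory:

    import toml

    # toml.load accepts a file path and returns a dict;
    # every [[feed]] table becomes one dict in the 'feed' list.
    config = toml.load('config.toml')
    print(config['base_directory'])   # ~/spiderss
    print(config['max_age'])          # 30
    for feed in config['feed']:
        print(feed['category'], feed['name'], feed['url'])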

requirements.txt
@@ -2,3 +2,4 @@ feedparser
 readability-lxml
 requests
 html2text
+toml

spiderss.py
@@ -9,26 +9,33 @@ import os
 import time
 from time import mktime
 from datetime import datetime, timedelta
-from config import base_directory, update_interval, max_age, verbose, feeds
-import logging
-import sys, getopt
+import toml
+import argparse
+import sys
+
+'''
+Static variables
+'''
+version = '0.1'
 
 '''
 Output functions
 '''
-def log(text):
-    if verbose:
-        #logging.info(text)
+# Print log message
+def log(text, force = False):
+    if verbose or force:
         print('{} - {}'.format(datetime.now().strftime('%d.%m %H:%M'), text))
 
+# Print error message and exit
 def error(text):
-    #logging.error(text)
-    print('{} - ERROR: {}'.format(datetime.now().strftime('%d.%m %H:%M'), text))
+    print('{} - ERROR - {}'.format(datetime.now().strftime('%d.%m %H:%M'), text))
+    sys.exit(1)
 
+# Print spiderss logo
 def print_logo():
     logo = '''
               ;:
@@ -51,23 +58,25 @@ def print_logo():
 Utility functions
 '''
-# Get content of a webpage
+# Get HTML content of a webpage
 def get_html_content(url):
     response = requests.get(url)
     doc = Document(response.text)
     return doc.summary()
 
+# Convert HTML to Markdown
 def html_to_markdown(html):
     return html2text.html2text(html)
 
 # Get articles of a feed
-def get_articles(feed):
-    feed = feedparser.parse(feed[2])
+def get_articles(feed_url):
+    feed = feedparser.parse(feed_url)
     return feed.entries
 
+# Write text to file
 def write_to_file(filename, text):
     file = open(filename, 'w')
     file.write(text)
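
get_articles() now takes the feed URL directly instead of pulling it out of the old (category, name, url) tuple. A minimal standalone sketch of what it wraps, using the Tagesschau feed from config.toml as an example:

    import feedparser

    # feedparser.parse fetches and parses the feed in one call;
    # .entries is the list of articles that get_articles() returns.
    feed = feedparser.parse('https://www.tagesschau.de/xml/rss2')
    for entry in feed.entries[:3]:
        print(entry.title, entry.published_parsed)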
@@ -97,19 +106,16 @@ def get_article_text(article):
 # Update feed
 def update_feed(feed):
-    category = feed[0]
-    name = feed[1]
-
-    log('updating feed "{}"'.format(name))
+    log(' updating feed "{}"'.format(feed['name']))
 
-    feedpath_new = os.path.join(base_directory, category, name, 'new')
-    feedpath_read = os.path.join(base_directory, category, name, 'read')
+    feedpath_new = os.path.join(base_directory, feed['category'], feed['name'], 'new')
+    feedpath_read = os.path.join(base_directory, feed['category'], feed['name'], 'read')
 
     if not os.path.exists(feedpath_new):
         os.makedirs(feedpath_new)
     if not os.path.exists(feedpath_read):
         os.makedirs(feedpath_read)
 
-    articles = get_articles(feed)
+    articles = get_articles(feed['url'])
     threshold_date = datetime.now() - timedelta(days = max_age)
 
     for a in articles:
         date = datetime.fromtimestamp(mktime(a.published_parsed))
@@ -118,14 +124,15 @@ def update_feed(feed):
         if not os.path.exists(os.path.join(feedpath_new, filename)) and not os.path.exists(os.path.join(feedpath_read, filename)):
             text = get_article_text(a)
             write_to_file(os.path.join(feedpath_new, filename), text)
+            log(' added article "{}"'.format(a.title))
 
 # Delete articles older than max_age
-def delete_old_articles():
-    log('removing old articles')
+def remove_old_articles():
     threshold_date = datetime.now() - timedelta(days = max_age)
+    count = 0
 
     for subdir, dirs, files in os.walk(base_directory):
         # Skip 'loved' directory
@@ -134,8 +141,25 @@ def delete_old_articles():
             date = datetime.strptime(file[:12], '%Y%m%d%H%M')
             if threshold_date > date:
                 os.remove(os.path.join(subdir, file))
+                count += 1
+
+    log(' removed {} articles'.format(count))
 
+# Parse config file
+def load_config(filepath):
+    global base_directory, max_age, feeds
+
+    try:
+        config = toml.load(filepath)
+        base_directory = config['base_directory']
+        max_age = config['max_age']
+        feeds = config['feed']
+    except Exception as e:
+        error('while parsing config: {}'.format(e))
+
+# Initialize spiderss
 def initialize():
     # Create 'loved' directory if not existent
@@ -144,39 +168,46 @@ def initialize():
         os.makedirs(lovedpath)
 
+# Update all feeds and delete old messages
 def crawl():
-    # Main loop
-    while True:
-        log('starting crawl')
-        for feed in feeds:
-            update_feed(feed)
-        delete_old_articles()
-        time.sleep(update_interval * 60)
-
-def get_help_message():
-    return 'spiderrss.py | run'
+    log('crawling feeds', True)
+    for feed in feeds:
+        update_feed(feed)
+
+    log('removing old articles', True)
+    remove_old_articles()
 
-def main(argv):
+'''
+Main
+'''
+def main():
+    global verbose
+
+    # Initialize parser
+    parser = argparse.ArgumentParser(description = 'Crawl RSS feeds and store articles as Markdown files.')
+    parser.add_argument('-V', '--version', action = 'store_true', help = 'show version and exit')
+    parser.add_argument('-v', '--verbose', action = 'store_true', help = 'verbose output')
+    parser.add_argument('-c', '--config', default = './config.toml', help = 'config file (default: ./config.toml)')
+
+    # Get args
+    args = parser.parse_args()
+    show_version = args.version
+    verbose = args.verbose
+    config = args.config
+
+    if show_version:
+        print('spiderss v{}'.format(version))
+        sys.exit()
+
+    # Main routine
     print_logo()
+    load_config(config)
 
-    ## Get arguments
-    #try:
-    #    opts, args = getopt,getopt(argv, 'h', ['ifile=', 'ofile='])
-    #except:
-    #    print('spiderrss.py [ run | create_config <file> ]')
-
-    #for opt, arg in opts:
-    #    if opt == '-h':
-    #        print('spiderrss.py [ run | create_config <file> ]')
-
     initialize()
     crawl()
 
 if __name__ == '__main__':
-    main(sys.argv[1:])
+    main()
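
The argparse setup above replaces the commented-out getopt stub. A quick standalone check of how the new flags parse, with an illustrative command line passed in place of sys.argv:

    import argparse

    parser = argparse.ArgumentParser(description = 'Crawl RSS feeds and store articles as Markdown files.')
    parser.add_argument('-V', '--version', action = 'store_true', help = 'show version and exit')
    parser.add_argument('-v', '--verbose', action = 'store_true', help = 'verbose output')
    parser.add_argument('-c', '--config', default = './config.toml', help = 'config file (default: ./config.toml)')

    # Parse a sample command line instead of the real sys.argv
    args = parser.parse_args(['--verbose', '--config', '~/spiderss/config.toml'])
    print(args.verbose, args.version, args.config)   # True False ~/spiderss/config.toml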