diff --git a/config.py b/config.py
deleted file mode 100644
index 39a7815..0000000
--- a/config.py
+++ /dev/null
@@ -1,17 +0,0 @@
-# This defines the base directory for the feeds relative to this config file
-base_directory = '/home/denis/spiderrss'
-
-# Update interval in minutes
-update_interval = 15
-
-# Articles older than max_age will be deleted and not be added
-max_age = 365
-
-# Enable verbose output
-verbose = True
-
-# Feeds in the form (category, name, url) - the category can be empty ('')
-feeds = [
-    ('News', 'Tagesschau', 'https://www.tagesschau.de/xml/rss2'),
-    ('News', 'Vice', 'https://www.vice.com/de/rss'),
-]
diff --git a/config.toml b/config.toml
new file mode 100644
index 0000000..f536da1
--- /dev/null
+++ b/config.toml
@@ -0,0 +1,17 @@
+# This defines the base directory for the feeds relative to this config file
+base_directory = '~/spiderss'
+
+# Articles older than max_age will be deleted and not be added
+max_age = 30
+
+# Feeds - the category can be empty ('')
+
+[[feed]]
+category = 'News'
+name = 'Tagesschau'
+url = 'https://www.tagesschau.de/xml/rss2'
+
+[[feed]]
+category = 'News'
+name = 'Vice'
+url = 'https://www.vice.com/de/rss'
diff --git a/requirements.txt b/requirements.txt
index 1a4b02f..5f558e4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,3 +2,4 @@ feedparser
 readability-lxml
 requests
 html2text
+toml
diff --git a/spiderss.py b/spiderss.py
index 6a6aec4..6ff71a0 100755
--- a/spiderss.py
+++ b/spiderss.py
@@ -9,26 +9,33 @@
 import os
 import time
 from time import mktime
 from datetime import datetime, timedelta
-from config import base_directory, update_interval, max_age, verbose, feeds
-import logging
-import sys, getopt
+import toml
+import argparse
+import sys
+
+'''
+Static variables
+'''
+
+version = '0.1'
 
 '''
 Output functions
 '''
 
-def log(text):
-    if verbose:
-        #logging.info(text)
+# Print log message
+def log(text, force = False):
+    if verbose or force:
         print('{} - {}'.format(datetime.now().strftime('%d.%m %H:%M'), text))
 
+# Print error message and exit
 def error(text):
-    #logging.error(text)
-    print('{} - ERROR: {}'.format(datetime.now().strftime('%d.%m %H:%M'), text))
+    print('{} - ERROR - {}'.format(datetime.now().strftime('%d.%m %H:%M'), text))
+    sys.exit(1)
 
+# Print spiderss logo
 def print_logo():
     logo = '''
                           ;:
@@ -51,23 +58,25 @@ def print_logo():
 Utility functions
 '''
 
-# Get content of a webpage
+# Get HTML content of a webpage
 def get_html_content(url):
     response = requests.get(url)
     doc = Document(response.text)
     return doc.summary()
 
+# Convert HTML to Markdown
 def html_to_markdown(html):
     return html2text.html2text(html)
 
 # Get articles of a feed
-def get_articles(feed):
-    feed = feedparser.parse(feed[2])
+def get_articles(feed_url):
+    feed = feedparser.parse(feed_url)
     return feed.entries
 
+# Write text to file
 def write_to_file(filename, text):
     file = open(filename, 'w')
     file.write(text)
@@ -97,19 +106,16 @@ def get_article_text(article):
 
 # Update feed
 def update_feed(feed):
 
-    category = feed[0]
-    name = feed[1]
+    log('  updating feed "{}"'.format(feed['name']))
 
-    log('updating feed "{}"'.format(name))
-
-    feedpath_new = os.path.join(base_directory, category, name, 'new')
-    feedpath_read = os.path.join(base_directory, category, name, 'read')
+    feedpath_new = os.path.join(base_directory, feed['category'], feed['name'], 'new')
+    feedpath_read = os.path.join(base_directory, feed['category'], feed['name'], 'read')
 
     if not os.path.exists(feedpath_new):
         os.makedirs(feedpath_new)
     if not os.path.exists(feedpath_read):
         os.makedirs(feedpath_read)
 
-    articles = get_articles(feed)
+    articles = get_articles(feed['url'])
 
     threshold_date = datetime.now() - timedelta(days = max_age)
     for a in articles:
         date = datetime.fromtimestamp(mktime(a.published_parsed))
@@ -118,14 +124,15 @@ def update_feed(feed):
         if not os.path.exists(os.path.join(feedpath_new, filename)) and not os.path.exists(os.path.join(feedpath_read, filename)):
             text = get_article_text(a)
             write_to_file(os.path.join(feedpath_new, filename), text)
+            log('    added article "{}"'.format(a.title))
 
 # Delete articles older than max_age
-def delete_old_articles():
-
-    log('removing old articles')
+def remove_old_articles():
 
     threshold_date = datetime.now() - timedelta(days = max_age)
+    count = 0
+
     for subdir, dirs, files in os.walk(base_directory):
 
         # Skip 'loved' directory
@@ -134,8 +141,25 @@
             date = datetime.strptime(file[:12], '%Y%m%d%H%M')
             if threshold_date > date:
                 os.remove(os.path.join(subdir, file))
+                count += 1
+
+    log('  removed {} articles'.format(count))
+
+# Parse config file
+def load_config(filepath):
+
+    global base_directory, max_age, feeds
+
+    try:
+        config = toml.load(filepath)
+        base_directory = config['base_directory']
+        max_age = config['max_age']
+        feeds = config['feed']
+    except Exception as e:
+        error('while parsing config: {}'.format(e))
 
+# Initialize spiderss
 def initialize():
 
     # Create 'loved' directory if not existent
@@ -144,39 +168,46 @@
         os.makedirs(lovedpath)
 
+# Update all feeds and remove old articles
 def crawl():
 
-    # Main loop
-    while True:
-        log('starting crawl')
-        for feed in feeds:
-            update_feed(feed)
-        delete_old_articles()
-        time.sleep(update_interval * 60)
+    log('crawling feeds', True)
+    for feed in feeds:
+        update_feed(feed)
 
-def get_help_message():
-    return 'spiderrss.py | run'
+    log('removing old articles', True)
+    remove_old_articles()
 
-def main(argv):
+'''
+Main
+'''
+def main():
+
+    global verbose
+
+    # Initialize parser
+    parser = argparse.ArgumentParser(description = 'Crawl RSS feeds and store articles as Markdown files.')
+    parser.add_argument('-V', '--version', action = 'store_true', help = 'show version and exit')
+    parser.add_argument('-v', '--verbose', action = 'store_true', help = 'verbose output')
+    parser.add_argument('-c', '--config', default = './config.toml', help = 'config file (default: ./config.toml)')
+
+    # Get args
+    args = parser.parse_args()
+    show_version = args.version
+    verbose = args.verbose
+    config = args.config
+
+    if show_version:
+        print('spiderss v{}'.format(version))
+        sys.exit()
+
+    # Main routine
     print_logo()
-
-    ## Get arguments
-    #try:
-    #    opts, args = getopt,getopt(argv, 'h', ['ifile=', 'ofile='])
-    #except:
-    #    print('spiderrss.py [ run | create_config ]')
-
-    #for opt, arg in opts:
-    #    if opt == '-h':
-    #        print('spiderrss.py [ run | create_config ]')
-
-
+    load_config(config)
     initialize()
     crawl()
 
-
 if __name__ == '__main__':
-    main(sys.argv[1:])
-
+    main()
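
Side note for review: the new `load_config` relies on how the `toml` package maps `[[feed]]` tables to Python values. A minimal sketch of that round trip, assuming the `config.toml` added above sits in the working directory (the path is just an example):

```python
import toml

# toml.load accepts a file path and returns a plain dict.
# Every [[feed]] table becomes one dict inside the list stored under
# the 'feed' key. That is why load_config can assign
# feeds = config['feed'], and update_feed can index
# feed['category'], feed['name'] and feed['url'].
config = toml.load('./config.toml')

print(config['base_directory'])   # '~/spiderss'
print(config['max_age'])          # 30
for feed in config['feed']:
    print(feed['category'], feed['name'], feed['url'])
```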