change config format and add arguments
parent 426e68dce7
commit e97a6bd09b
4 changed files with 95 additions and 63 deletions
config.py (deleted, 17 lines)

@@ -1,17 +0,0 @@
-# This defines the base directory for the feeds relative to this config file
-base_directory = '/home/denis/spiderrss'
-
-# Update interval in minutes
-update_interval = 15
-
-# Articles older than max_age will be deleted and not be added
-max_age = 365
-
-# Enable verbose output
-verbose = True
-
-# Feeds in the form (category, name, url) - the category can be empty ('')
-feeds = [
-    ('News', 'Tagesschau', 'https://www.tagesschau.de/xml/rss2'),
-    ('News', 'Vice', 'https://www.vice.com/de/rss'),
-]
config.toml (new file, 17 lines)

@@ -0,0 +1,17 @@
+# This defines the base directory for the feeds relative to this config file
+base_directory = '~/spiderss'
+
+# Articles older than max_age will be deleted and not be added
+max_age = 30
+
+# Feeds - the category can be empty ('')
+
+[[feed]]
+category = 'News'
+name = 'Tagesschau'
+url = 'https://www.tagesschau.de/xml/rss2'
+
+[[feed]]
+category = 'News'
+name = 'Vice'
+url = 'https://www.vice.com/de/rss'
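For reference, a minimal sketch (not part of the commit) of how the new format is consumed, assuming the toml package that this commit adds to the requirements: each [[feed]] table is parsed into a dict with the category, name and url keys that spiderss.py reads. The inline string stands in for the config.toml file above.

import toml

# Stand-in for config.toml; spiderss.py itself uses toml.load(filepath).
sample = '''
base_directory = '~/spiderss'
max_age = 30

[[feed]]
category = 'News'
name = 'Tagesschau'
url = 'https://www.tagesschau.de/xml/rss2'
'''

config = toml.loads(sample)
print(config['base_directory'], config['max_age'])
for feed in config['feed']:          # list of dicts, one per [[feed]] table
    print(feed['category'], feed['name'], feed['url'])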
@@ -2,3 +2,4 @@ feedparser
 readability-lxml
 requests
 html2text
+toml
spiderss.py (123 lines changed)
@@ -9,26 +9,33 @@ import os
 import time
 from time import mktime
 from datetime import datetime, timedelta
-from config import base_directory, update_interval, max_age, verbose, feeds
-import logging
-import sys, getopt
+import toml
+import argparse
+import sys
 
+'''
+Static variables
+'''
+
+version = '0.1'
 
 '''
 Output functions
 '''
 
-def log(text):
-    if verbose:
-        #logging.info(text)
+# Print log message
+def log(text, force = False):
+    if verbose or force:
         print('{} - {}'.format(datetime.now().strftime('%d.%m %H:%M'), text))
 
 
+# Print error message and exit
 def error(text):
-    #logging.error(text)
-    print('{} - ERROR: {}'.format(datetime.now().strftime('%d.%m %H:%M'), text))
+    print('{} - ERROR - {}'.format(datetime.now().strftime('%d.%m %H:%M'), text))
+    sys.exit(1)
 
 
+# Print spiderss logo
 def print_logo():
     logo = '''
               ;:
@@ -51,23 +58,25 @@ def print_logo():
 Utility functions
 '''
 
-# Get content of a webpage
+# Get HTML content of a webpage
 def get_html_content(url):
     response = requests.get(url)
     doc = Document(response.text)
     return doc.summary()
 
 
+# Convert HTML to Markdown
 def html_to_markdown(html):
     return html2text.html2text(html)
 
 
 # Get articles of a feed
-def get_articles(feed):
-    feed = feedparser.parse(feed[2])
+def get_articles(feed_url):
+    feed = feedparser.parse(feed_url)
     return feed.entries
 
 
+# Write text to file
 def write_to_file(filename, text):
     file = open(filename, 'w')
     file.write(text)
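A small sketch of the reworked get_articles(), which now receives the feed URL rather than the whole feed record. It assumes the feedparser package and reuses the Tagesschau URL from config.toml (network access required); the strftime pattern matches the 12-character date prefix that the cleanup code parses out of filenames.

import feedparser
from time import mktime
from datetime import datetime

# After this commit get_articles() is just a thin wrapper around feedparser.
def get_articles(feed_url):
    return feedparser.parse(feed_url).entries

for entry in get_articles('https://www.tagesschau.de/xml/rss2')[:3]:
    # Same conversion update_feed() applies before building the filename.
    date = datetime.fromtimestamp(mktime(entry.published_parsed))
    print(date.strftime('%Y%m%d%H%M'), entry.title)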
@@ -97,19 +106,16 @@ def get_article_text(article):
 # Update feed
 def update_feed(feed):
-
-    category = feed[0]
-    name = feed[1]
+    log(' updating feed "{}"'.format(feed['name']))
 
-    log('updating feed "{}"'.format(name))
-    feedpath_new = os.path.join(base_directory, category, name, 'new')
-    feedpath_read = os.path.join(base_directory, category, name, 'read')
+    feedpath_new = os.path.join(base_directory, feed['category'], feed['name'], 'new')
+    feedpath_read = os.path.join(base_directory, feed['category'], feed['name'], 'read')
 
     if not os.path.exists(feedpath_new):
         os.makedirs(feedpath_new)
     if not os.path.exists(feedpath_read):
         os.makedirs(feedpath_read)
 
-    articles = get_articles(feed)
+    articles = get_articles(feed['url'])
     threshold_date = datetime.now() - timedelta(days = max_age)
     for a in articles:
         date = datetime.fromtimestamp(mktime(a.published_parsed))
@@ -118,14 +124,15 @@ def update_feed(feed):
         if not os.path.exists(os.path.join(feedpath_new, filename)) and not os.path.exists(os.path.join(feedpath_read, filename)):
             text = get_article_text(a)
             write_to_file(os.path.join(feedpath_new, filename), text)
+            log(' added article "{}"'.format(a.title))
 
 
 # Delete articles older than max_age
-def delete_old_articles():
-    log('removing old articles')
+def remove_old_articles():
 
     threshold_date = datetime.now() - timedelta(days = max_age)
+    count = 0
 
     for subdir, dirs, files in os.walk(base_directory):
 
         # Skip 'loved' directory
@@ -134,8 +141,25 @@ def delete_old_articles():
             date = datetime.strptime(file[:12], '%Y%m%d%H%M')
             if threshold_date > date:
                 os.remove(os.path.join(subdir, file))
+                count += 1
+
+    log(' removed {} articles'.format(count))
+
+# Parse config file
+def load_config(filepath):
+
+    global base_directory, max_age, feeds
+
+    try:
+        config = toml.load(filepath)
+        base_directory = config['base_directory']
+        max_age = config['max_age']
+        feeds = config['feed']
+    except Exception as e:
+        error('while parsing config: {}'.format(e))
 
 
+# Initialize spiderss
 def initialize():
 
     # Create 'loved' directory if not existent
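To make the new config handling concrete, a rough sketch of what the single except branch in load_config() catches: a missing key raises KeyError and malformed TOML raises toml.TomlDecodeError, and either one ends up in error(), which now also exits the program. The broken config string below is invented for illustration.

import toml

broken = "base_directory = '~/spiderss'"   # no max_age and no [[feed]] tables

try:
    config = toml.loads(broken)
    base_directory = config['base_directory']
    max_age = config['max_age']            # raises KeyError: 'max_age'
    feeds = config['feed']
except Exception as e:
    print('while parsing config: {}'.format(e))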
@@ -144,39 +168,46 @@ def initialize():
         os.makedirs(lovedpath)
 
 
+# Update all feeds and delete old messages
 def crawl():
 
-    # Main loop
-    while True:
-        log('starting crawl')
-        for feed in feeds:
-            update_feed(feed)
-        delete_old_articles()
-        time.sleep(update_interval * 60)
+    log('crawling feeds', True)
+    for feed in feeds:
+        update_feed(feed)
 
-
-def get_help_message():
-    return 'spiderrss.py | run'
+    log('removing old articles', True)
+    remove_old_articles()
 
-
-def main(argv):
+'''
+Main
+'''
 
+def main():
+    global verbose
+
+    # Initialize parser
+    parser = argparse.ArgumentParser(description = 'Crawl RSS feeds and store articles as Markdown files.')
+    parser.add_argument('-V', '--version', action = 'store_true', help = 'show version and exit')
+    parser.add_argument('-v', '--verbose', action = 'store_true', help = 'verbose output')
+    parser.add_argument('-c', '--config', default = './config.toml', help = 'config file (default: ./config.toml)')
+
+    # Get args
+    args = parser.parse_args()
+    show_version = args.version
+    verbose = args.verbose
+    config = args.config
+
+    if show_version:
+        print('spiderss v{}'.format(version))
+        sys.exit()
+
+    # Main routine
     print_logo()
-
-    ## Get arguments
-    #try:
-    #    opts, args = getopt,getopt(argv, 'h', ['ifile=', 'ofile='])
-    #except:
-    #    print('spiderrss.py [ run | create_config <file> ]')
-
-    #for opt, arg in opts:
-    #    if opt == '-h':
-    #        print('spiderrss.py [ run | create_config <file> ]')
+    load_config(config)
 
     initialize()
     crawl()
 
 
-
 if __name__ == '__main__':
-    main(sys.argv[1:])
+    main()
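Finally, a rough usage sketch of the argument handling added to main(): the parser is rebuilt exactly as in the diff, but parse_args() is fed an explicit argument list here, and the /etc/spiderss path is made up for illustration.

import argparse

parser = argparse.ArgumentParser(description = 'Crawl RSS feeds and store articles as Markdown files.')
parser.add_argument('-V', '--version', action = 'store_true', help = 'show version and exit')
parser.add_argument('-v', '--verbose', action = 'store_true', help = 'verbose output')
parser.add_argument('-c', '--config', default = './config.toml', help = 'config file (default: ./config.toml)')

# Equivalent of running: python spiderss.py -v -c /etc/spiderss/config.toml
args = parser.parse_args(['-v', '-c', '/etc/spiderss/config.toml'])
print(args.version, args.verbose, args.config)   # False True /etc/spiderss/config.toml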