add config and rename main.py to spiderrss.py
This commit is contained in:
parent
66d9ba1fb5
commit
2e7e3da309
3 changed files with 81 additions and 26 deletions
17
config.py
Normal file
17
config.py
Normal file
|
|
@ -0,0 +1,17 @@
|
||||||
|
# This defines the base directory for the feeds relative to this config file
|
||||||
|
base_directory = '/home/denis/spiderrss'
|
||||||
|
|
||||||
|
# Update interval in minutes
|
||||||
|
update_interval = 15
|
||||||
|
|
||||||
|
# Articles older than max_age will be deleted and not be added
|
||||||
|
max_age = 365
|
||||||
|
|
||||||
|
# Enable verbose output
|
||||||
|
verbose = True
|
||||||
|
|
||||||
|
# Feeds in the form (category, name, url) - the category can be empty ('')
|
||||||
|
feeds = [
|
||||||
|
('News', 'Tagesschau', 'https://www.tagesschau.de/xml/rss2'),
|
||||||
|
('News', 'Vice', 'htdtps://www.vice.com/de/rss'),
|
||||||
|
]
|
||||||
|
|
@ -1,3 +1,4 @@
|
||||||
feedparser
|
feedparser
|
||||||
readability-lxml
|
readability-lxml
|
||||||
requests
|
requests
|
||||||
|
html2text
|
||||||
|
|
|
||||||
|
|
@ -4,16 +4,21 @@ import requests
|
||||||
import html2text
|
import html2text
|
||||||
import re
|
import re
|
||||||
import os
|
import os
|
||||||
|
import time
|
||||||
from time import mktime
|
from time import mktime
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
|
from config import base_directory, update_interval, max_age, verbose, feeds
|
||||||
|
import logging
|
||||||
|
import sys, getopt
|
||||||
|
|
||||||
feeds = [('News', 'Tagesschau', 'https://www.tagesschau.de/xml/rss2'),
|
def log(text):
|
||||||
('Linux', 'NixOS', 'https://nixos.org/blogs.xml'),
|
if verbose:
|
||||||
('News', 'Vice', 'https://www.vice.com/de/rss')
|
#logging.info(text)
|
||||||
]
|
print('{} - {}'.format(datetime.now().strftime('%d.%m %H:%M'), text))
|
||||||
|
|
||||||
out_directory = './out'
|
def error(text):
|
||||||
delta = 365
|
#logging.error(text)
|
||||||
|
print('{} - ERROR: {}'.format(datetime.now().strftime('%d.%m %H:%M'), text))
|
||||||
|
|
||||||
|
|
||||||
# Get content of a webpage
|
# Get content of a webpage
|
||||||
|
|
@ -27,10 +32,14 @@ def html_to_markdown(html):
|
||||||
return html2text.html2text(html)
|
return html2text.html2text(html)
|
||||||
|
|
||||||
|
|
||||||
# Get articles of a RSS feed
|
# Get articles of a feed
|
||||||
def get_articles(url):
|
def get_articles(feed):
|
||||||
feed = feedparser.parse(url)
|
try:
|
||||||
return feed.entries
|
feed = feedparser.parse(feed[2])
|
||||||
|
return feed.entries
|
||||||
|
except Exception as e:
|
||||||
|
error('failed to get feed "{}: {}"'.format(feed[1], e.msg))
|
||||||
|
return []
|
||||||
|
|
||||||
|
|
||||||
def write_to_file(filename, text):
|
def write_to_file(filename, text):
|
||||||
|
|
@ -54,20 +63,21 @@ def get_filename(date, title):
|
||||||
|
|
||||||
# Update feed
|
# Update feed
|
||||||
def update_feed(feed):
|
def update_feed(feed):
|
||||||
|
|
||||||
category = feed[0]
|
category = feed[0]
|
||||||
name = feed[1]
|
name = feed[1]
|
||||||
url = feed[2]
|
|
||||||
|
|
||||||
feedpath_new = os.path.join(out_directory, category, name, 'new')
|
log('updating feed "{}"'.format(name))
|
||||||
feedpath_read = os.path.join(out_directory, category, name, 'read')
|
|
||||||
|
feedpath_new = os.path.join(base_directory, category, name, 'new')
|
||||||
|
feedpath_read = os.path.join(base_directory, category, name, 'read')
|
||||||
if not os.path.exists(feedpath_new):
|
if not os.path.exists(feedpath_new):
|
||||||
os.makedirs(feedpath_new)
|
os.makedirs(feedpath_new)
|
||||||
if not os.path.exists(feedpath_read):
|
if not os.path.exists(feedpath_read):
|
||||||
os.makedirs(feedpath_read)
|
os.makedirs(feedpath_read)
|
||||||
|
|
||||||
articles = get_articles(url)
|
articles = get_articles(feed)
|
||||||
threshold_date = datetime.now() - timedelta(days = delta)
|
threshold_date = datetime.now() - timedelta(days = max_age)
|
||||||
for a in articles:
|
for a in articles:
|
||||||
date = datetime.fromtimestamp(mktime(a.published_parsed))
|
date = datetime.fromtimestamp(mktime(a.published_parsed))
|
||||||
if date > threshold_date:
|
if date > threshold_date:
|
||||||
|
|
@ -77,31 +87,58 @@ def update_feed(feed):
|
||||||
write_to_file(os.path.join(feedpath_new, filename), text)
|
write_to_file(os.path.join(feedpath_new, filename), text)
|
||||||
|
|
||||||
|
|
||||||
# Delete articles older than day delta
|
# Delete articles older than max_age
|
||||||
def delete_old_articles():
|
def delete_old_articles():
|
||||||
|
|
||||||
threshold_date = datetime.now() - timedelta(days = delta)
|
threshold_date = datetime.now() - timedelta(days = max_age)
|
||||||
for subdir, dirs, files in os.walk(out_directory):
|
for subdir, dirs, files in os.walk(base_directory):
|
||||||
|
|
||||||
# Skip 'loved' directory
|
# Skip 'loved' directory
|
||||||
if not os.path.join(out_directory, 'loved') in subdir:
|
if not os.path.join(base_directory, 'loved') in subdir:
|
||||||
for file in files:
|
for file in files:
|
||||||
date = datetime.strptime(file[:12], '%Y%m%d%H%M')
|
date = datetime.strptime(file[:12], '%Y%m%d%H%M')
|
||||||
if threshold_date > date:
|
if threshold_date > date:
|
||||||
os.remove(os.path.join(subdir, file))
|
os.remove(os.path.join(subdir, file))
|
||||||
|
log('deleted old articles')
|
||||||
|
|
||||||
|
|
||||||
|
def initialize():
|
||||||
|
|
||||||
def main():
|
# Create 'loved' directory if not existent
|
||||||
lovedpath = os.path.join(out_directory, 'loved')
|
lovedpath = os.path.join(base_directory, 'loved')
|
||||||
if not os.path.exists(lovedpath):
|
if not os.path.exists(lovedpath):
|
||||||
os.makedirs(lovedpath)
|
os.makedirs(lovedpath)
|
||||||
for feed in feeds:
|
|
||||||
update_feed(feed)
|
|
||||||
delete_old_articles()
|
def crawl():
|
||||||
|
|
||||||
|
# Main loop
|
||||||
|
while True:
|
||||||
|
for feed in feeds:
|
||||||
|
update_feed(feed)
|
||||||
|
delete_old_articles()
|
||||||
|
time.sleep(update_interval * 60)
|
||||||
|
|
||||||
|
def get_help_message():
|
||||||
|
return 'spiderrss.py | run'
|
||||||
|
|
||||||
|
def main(argv):
|
||||||
|
|
||||||
|
# Get arguments
|
||||||
|
try:
|
||||||
|
opts, args = getopt,getopt(argv, 'h', ['ifile=', 'ofile='])
|
||||||
|
except:
|
||||||
|
print('spiderrss.py [ run | create_config <file> ]')
|
||||||
|
|
||||||
|
for opt, arg in opts:
|
||||||
|
if opt == '-h'
|
||||||
|
|
||||||
|
|
||||||
|
#initialize()
|
||||||
|
#crawl()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
main()
|
main(sys.argv[1:])
|
||||||
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue