add better logging

Denis Lehmann 2020-04-14 12:44:33 +02:00
parent 4d8b232dcf
commit 426e68dce7
2 changed files with 18 additions and 11 deletions

@@ -13,5 +13,5 @@ verbose = True
 # Feeds in the form (category, name, url) - the category can be empty ('')
 feeds = [
     ('News', 'Tagesschau', 'https://www.tagesschau.de/xml/rss2'),
-    ('News', 'Vice', 'htdtps://www.vice.com/de/rss'),
+    ('News', 'Vice', 'https://www.vice.com/de/rss'),
 ]
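
The change above fixes a typo in the Vice feed URL. A corrected URL can be sanity-checked with feedparser, which spiderss already imports; this is only an illustrative snippet, and bozo is feedparser's own flag for fetch or parse trouble:

    import feedparser

    d = feedparser.parse('https://www.vice.com/de/rss')
    # feedparser traps network and XML errors instead of raising,
    # flagging them on d.bozo with details in d.bozo_exception
    print(d.bozo, len(d.entries))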

spiderss.py (27 changed lines) Normal file → Executable file

@@ -1,3 +1,5 @@
+#!/usr/bin/env python
+
 import feedparser
 from readability import Document
 import requests
@@ -62,12 +64,8 @@ def html_to_markdown(html):
 # Get articles of a feed
 def get_articles(feed):
-    try:
-        feed = feedparser.parse(feed[2])
-        return feed.entries
-    except Exception as e:
-        error('failed to get feed "{}: {}"'.format(feed[1], e.msg))
-        return []
+    feed = feedparser.parse(feed[2])
+    return feed.entries
 
 def write_to_file(filename, text):
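
The try/except removed above rarely fired in practice: feedparser.parse traps download and parse errors internally rather than raising, and most standard exceptions have no msg attribute, so the old handler's e.msg would itself have failed. If per-feed failure logging is still wanted, a sketch along these lines would work (get_articles_checked is a hypothetical name, and error is assumed to match the helper the old code called):

    import feedparser

    def get_articles_checked(feed):
        d = feedparser.parse(feed[2])  # feed is a (category, name, url) tuple
        if d.bozo:  # set by feedparser on fetch/parse problems
            # 'error' is assumed to be spiderss's existing helper
            error('failed to get feed "{}": {}'.format(feed[1], d.bozo_exception))
        return d.entries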
@@ -89,6 +87,13 @@ def get_filename(date, title):
     return '{}_{}.md'.format(date, title)
 
+# Get Markdown text from an article
+def get_article_text(article):
+    head = '# {}\n\n[Link]({})'.format(article.title, article.link)
+    body = html_to_markdown(get_html_content(article.link))
+    return '{}\n\n{}'.format(head, body)
+
 # Update feed
 def update_feed(feed):
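
For a feed entry titled 'Example' that links to https://example.com/a, the new helper yields Markdown shaped like this (illustrative output only):

    # Example

    [Link](https://example.com/a)

    ...article body converted to Markdown by html_to_markdown...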
@@ -111,13 +116,15 @@ def update_feed(feed):
         if date > threshold_date:
             filename = get_filename(date, a.title)
             if not os.path.exists(os.path.join(feedpath_new, filename)) and not os.path.exists(os.path.join(feedpath_read, filename)):
-                text = html_to_markdown(get_html_content(a.link))
+                text = get_article_text(a)
                 write_to_file(os.path.join(feedpath_new, filename), text)
 
 # Delete articles older than max_age
 def delete_old_articles():
+    log('removing old articles')
     threshold_date = datetime.now() - timedelta(days = max_age)
     for subdir, dirs, files in os.walk(base_directory):
@@ -127,7 +134,6 @@ def delete_old_articles():
             date = datetime.strptime(file[:12], '%Y%m%d%H%M')
             if threshold_date > date:
                 os.remove(os.path.join(subdir, file))
-    log('deleted old articles')
 
 def initialize():
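
The file[:12] slice in the deletion loop assumes filenames carry a 12-character YYYYMMDDHHMM prefix, which matches the '%Y%m%d%H%M' format used here; a small round-trip check of that assumption:

    from datetime import datetime

    stamp = datetime(2020, 4, 14, 12, 44).strftime('%Y%m%d%H%M')  # '202004141244'
    filename = '{}_{}.md'.format(stamp, 'some-title')
    assert datetime.strptime(filename[:12], '%Y%m%d%H%M') == datetime(2020, 4, 14, 12, 44)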
@@ -142,6 +148,7 @@ def crawl():
     # Main loop
     while True:
+        log('starting crawl')
         for feed in feeds:
            update_feed(feed)
         delete_old_articles()
@@ -165,8 +172,8 @@ def main(argv):
     # print('spiderrss.py [ run | create_config <file> ]')
-    #initialize()
-    #crawl()
+    initialize()
+    crawl()
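
The log and error helpers used throughout are defined elsewhere in spiderss.py and are outside this diff. Purely as a sketch, a verbose-gated logger in this style might look like the following, where the dependence on the config's verbose flag is an assumption:

    from datetime import datetime

    def log(message):
        # assumption: honor the verbose flag from the config file
        if verbose:
            print('[{}] {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S'), message))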