add better logging
parent 4d8b232dcf
commit 426e68dce7

2 changed files with 18 additions and 11 deletions
@@ -13,5 +13,5 @@ verbose = True

 # Feeds in the form (category, name, url) - the category can be empty ('')
 feeds = [
     ('News', 'Tagesschau', 'https://www.tagesschau.de/xml/rss2'),
-    ('News', 'Vice', 'htdtps://www.vice.com/de/rss'),
+    ('News', 'Vice', 'https://www.vice.com/de/rss'),
 ]
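Note (not part of the commit): a quick sketch of how the (category, name, url) tuples from the config flow through the script; get_articles() below reads the feed name from feed[1] and the URL from feed[2], and feedparser is the library the script actually imports.

import feedparser

# Sketch only: iterate the configured feeds the way spiderss does.
feeds = [
    ('News', 'Tagesschau', 'https://www.tagesschau.de/xml/rss2'),
    ('News', 'Vice', 'https://www.vice.com/de/rss'),
]

for feed in feeds:
    entries = feedparser.parse(feed[2]).entries  # feed[2] is the URL
    print('{}/{}: {} entries'.format(feed[0], feed[1], len(entries)))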
spiderss.py  23  Normal file → Executable file
@@ -1,3 +1,5 @@
+#!/usr/bin/env python
+
 import feedparser
 from readability import Document
 import requests
@@ -62,12 +64,8 @@ def html_to_markdown(html):

 # Get articles of a feed
 def get_articles(feed):
-    try:
-        feed = feedparser.parse(feed[2])
-        return feed.entries
-    except Exception as e:
-        error('failed to get feed "{}: {}"'.format(feed[1], e.msg))
-        return []
+    feed = feedparser.parse(feed[2])
+    return feed.entries


 def write_to_file(filename, text):
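Worth noting about the removed block: Exception instances in Python 3 have no .msg attribute, so the error(...) line would itself crash with AttributeError the first time a feed failed, and the closing quote was misplaced, putting the message inside the quoted feed name. A hedged sketch of a variant that keeps the logging without those bugs, should it ever be reinstated (error() is the script's own helper, referenced in the removed line):

# Sketch only: same error handling as the removed block, but formatting
# the exception itself (str(e)) instead of the non-existent e.msg.
def get_articles(feed):
    try:
        return feedparser.parse(feed[2]).entries
    except Exception as e:
        error('failed to get feed "{}": {}'.format(feed[1], e))
        return []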
@@ -89,6 +87,13 @@ def get_filename(date, title):
     return '{}_{}.md'.format(date, title)


+# Get Markdown text from an article
+def get_article_text(article):
+    head = '# {}\n\n[Link]({})'.format(article.title, article.link)
+    body = html_to_markdown(get_html_content(article.link))
+    return '{}\n\n{}'.format(head, body)
+
+
 # Update feed
 def update_feed(feed):

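get_html_content() is not part of this diff, but given the imports at the top of the file (requests and readability), a plausible sketch of what it does is: fetch the page, then let readability extract the main article HTML. The body below is an assumption, not the commit's code.

import requests
from readability import Document

# Assumed shape of get_html_content(); the real implementation is not
# shown in this commit. Document(...).summary() is readability-lxml's
# main-content extractor and returns cleaned HTML.
def get_html_content(url):
    response = requests.get(url)
    return Document(response.text).summary()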
@@ -111,13 +116,15 @@ def update_feed(feed):
         if date > threshold_date:
             filename = get_filename(date, a.title)
             if not os.path.exists(os.path.join(feedpath_new, filename)) and not os.path.exists(os.path.join(feedpath_read, filename)):
-                text = html_to_markdown(get_html_content(a.link))
+                text = get_article_text(a)
                 write_to_file(os.path.join(feedpath_new, filename), text)


 # Delete articles older than max_age
 def delete_old_articles():
+
+    log('removing old articles')

     threshold_date = datetime.now() - timedelta(days = max_age)
     for subdir, dirs, files in os.walk(base_directory):

@@ -127,7 +134,6 @@ def delete_old_articles():
             date = datetime.strptime(file[:12], '%Y%m%d%H%M')
             if threshold_date > date:
                 os.remove(os.path.join(subdir, file))
-    log('deleted old articles')


 def initialize():
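The strptime call above relies on the filename convention set by get_filename(): the first 12 characters of each article file encode its date as %Y%m%d%H%M. A standalone sketch of the age check (max_age and the example filename here are stand-ins, not values from the commit):

from datetime import datetime, timedelta

max_age = 30  # stand-in; the real value comes from the config
filename = '202101151200_some-article-title.md'  # shape of get_filename() output

date = datetime.strptime(filename[:12], '%Y%m%d%H%M')  # parse the timestamp prefix
threshold_date = datetime.now() - timedelta(days=max_age)
print('delete:', threshold_date > date)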
@@ -142,6 +148,7 @@ def crawl():

     # Main loop
     while True:
+        log('starting crawl')
         for feed in feeds:
             update_feed(feed)
         delete_old_articles()
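The log() helper added here (and the error() helper used elsewhere) are not touched by this diff, so their bodies are unknown. A minimal sketch of what they might look like, assuming they honour the verbose flag from the config:

import sys

verbose = True  # from the config; assumed here

# Assumed shapes for the script's log()/error() helpers; the real
# implementations are not shown in this commit.
def log(message):
    if verbose:
        print('[spiderss] {}'.format(message))

def error(message):
    print('[spiderss] ERROR: {}'.format(message), file=sys.stderr)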
@@ -165,8 +172,8 @@ def main(argv):
     # print('spiderrss.py [ run | create_config <file> ]')


-    #initialize()
-    #crawl()
+    initialize()
+    crawl()

