add better logging
parent 4d8b232dcf
commit 426e68dce7

2 changed files with 18 additions and 11 deletions
@@ -13,5 +13,5 @@ verbose = True
 # Feeds in the form (category, name, url) - the category can be empty ('')
 feeds = [
     ('News', 'Tagesschau', 'https://www.tagesschau.de/xml/rss2'),
-    ('News', 'Vice', 'htdtps://www.vice.com/de/rss'),
+    ('News', 'Vice', 'https://www.vice.com/de/rss'),
 ]
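Each feed entry is a (category, name, url) tuple; the hunks further down read the URL as feed[2] (in get_articles) and the name as feed[1]. A minimal illustration of that convention:

feed = ('News', 'Tagesschau', 'https://www.tagesschau.de/xml/rss2')
category, name, url = feed  # get_articles(feed) parses feed[2], i.e. the url field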
spiderss.py (27 changes)
Normal file → Executable file
@@ -1,3 +1,5 @@
+#!/usr/bin/env python
+
 import feedparser
 from readability import Document
 import requests
@@ -62,12 +64,8 @@ def html_to_markdown(html):
 
 # Get articles of a feed
 def get_articles(feed):
-    try:
-        feed = feedparser.parse(feed[2])
-        return feed.entries
-    except Exception as e:
-        error('failed to get feed "{}: {}"'.format(feed[1], e.msg))
-        return []
+    feed = feedparser.parse(feed[2])
+    return feed.entries
 
 
 def write_to_file(filename, text):
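The removed try/except referenced an error() helper, and later hunks call log(); neither definition appears in this diff. A minimal sketch of what such helpers might look like, assuming plain timestamped prints (names and behavior here are assumptions, not the project's actual code):

from datetime import datetime

def log(message):
    # Timestamped status message on stdout (assumed behavior)
    print('[{}] {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S'), message))

def error(message):
    # Timestamped error message, same channel for simplicity (assumed behavior)
    print('[{}] ERROR: {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S'), message))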
@@ -89,6 +87,13 @@ def get_filename(date, title):
     return '{}_{}.md'.format(date, title)
 
 
+# Get Markdown text from an article
+def get_article_text(article):
+    head = '# {}\n\n[Link]({})'.format(article.title, article.link)
+    body = html_to_markdown(get_html_content(article.link))
+    return '{}\n\n{}'.format(head, body)
+
+
 # Update feed
 def update_feed(feed):
 
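The new get_article_text helper assembles the Markdown that update_feed previously built inline. A small stand-alone illustration of the head format, using types.SimpleNamespace purely as a stand-in for a feedparser entry:

from types import SimpleNamespace

article = SimpleNamespace(title='Example headline', link='https://example.org/article')
head = '# {}\n\n[Link]({})'.format(article.title, article.link)
print(head)
# # Example headline
#
# [Link](https://example.org/article)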
@@ -111,13 +116,15 @@ def update_feed(feed):
         if date > threshold_date:
             filename = get_filename(date, a.title)
             if not os.path.exists(os.path.join(feedpath_new, filename)) and not os.path.exists(os.path.join(feedpath_read, filename)):
-                text = html_to_markdown(get_html_content(a.link))
+                text = get_article_text(a)
                 write_to_file(os.path.join(feedpath_new, filename), text)
 
 
 # Delete articles older than max_age
 def delete_old_articles():
+    log('removing old articles')
+
     threshold_date = datetime.now() - timedelta(days = max_age)
     for subdir, dirs, files in os.walk(base_directory):
 
@@ -127,7 +134,6 @@ def delete_old_articles():
             date = datetime.strptime(file[:12], '%Y%m%d%H%M')
             if threshold_date > date:
                 os.remove(os.path.join(subdir, file))
-    log('deleted old articles')
 
 
 def initialize():
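delete_old_articles recovers each article's timestamp from the first 12 characters of its filename, which works because get_filename (shown above) prefixes the name with a date; assuming that date is formatted as %Y%m%d%H%M, the round trip looks like this:

from datetime import datetime

def get_filename(date, title):
    return '{}_{}.md'.format(date, title)

stamp = datetime(2020, 5, 17, 9, 30).strftime('%Y%m%d%H%M')  # '202005170930', 12 characters
filename = get_filename(stamp, 'Example headline')           # '202005170930_Example headline.md'
assert datetime.strptime(filename[:12], '%Y%m%d%H%M') == datetime(2020, 5, 17, 9, 30)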
@@ -142,6 +148,7 @@ def crawl():
 
     # Main loop
     while True:
+        log('starting crawl')
         for feed in feeds:
             update_feed(feed)
         delete_old_articles()
@@ -165,8 +172,8 @@ def main(argv):
     # print('spiderrss.py [ run | create_config <file> ]')
 
 
-    #initialize()
-    #crawl()
+    initialize()
+    crawl()
 
 