From 66d9ba1fb55b8b9549224fbb8444ef675b282dae Mon Sep 17 00:00:00 2001
From: Denis Lehmann
Date: Tue, 14 Apr 2020 01:36:22 +0200
Subject: [PATCH] implement basic functionality

---
 main.py | 107 ++++++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 93 insertions(+), 14 deletions(-)

diff --git a/main.py b/main.py
index b9149e1..eb4694b 100644
--- a/main.py
+++ b/main.py
@@ -1,25 +1,104 @@
-import bs4
 import feedparser
-import urllib.request
+from readability import Document
+import requests
+import html2text
+import re
+import os
+from time import mktime
+from datetime import datetime, timedelta
+
+feeds = [('News', 'Tagesschau', 'https://www.tagesschau.de/xml/rss2'),
+         ('Linux', 'NixOS', 'https://nixos.org/blogs.xml'),
+         ('News', 'Vice', 'https://www.vice.com/de/rss')
+         ]
+
+out_directory = './out'
+delta = 365
+
 # Get content of a webpage
-def get_content(url):
-    page = urllib.request.Request(url, headers = {'User-Agent': 'Mozilla/5.0'})
-    infile = urllib.request.urlopen(page).read()
-    data = infile.decode('ISO-8859-1')
-    soup = bs4.BeautifulSoup(data,features = 'html.parser')
-    return soup
+def get_html_content(url):
+    response = requests.get(url)
+    doc = Document(response.text)
+    return doc.summary()
 
-# Get entries of a RSS feed
-def get_entries(url):
+
+def html_to_markdown(html):
+    return html2text.html2text(html)
+
+
+# Get articles of a RSS feed
+def get_articles(url):
     feed = feedparser.parse(url)
     return feed.entries
 
+
+def write_to_file(filename, text):
+    file = open(filename, 'w')
+    file.write(text)
+    file.close()
+
+
+# Get filename from feedparser article
+def get_filename(date, title):
+
+    # Get date as single block
+    date = date.strftime('%Y%m%d%H%M')
+
+    # Get title as lowercase words concatenated with underscores
+    title = re.sub('[^A-Za-z0-9 ]+', '', title.lower())
+    title = re.sub(' ', '_', title)
+
+    return '{}_{}.md'.format(date, title)
+
+
+# Update feed
+def update_feed(feed):
+
+    category = feed[0]
+    name = feed[1]
+    url = feed[2]
+
+    feedpath_new = os.path.join(out_directory, category, name, 'new')
+    feedpath_read = os.path.join(out_directory, category, name, 'read')
+    if not os.path.exists(feedpath_new):
+        os.makedirs(feedpath_new)
+    if not os.path.exists(feedpath_read):
+        os.makedirs(feedpath_read)
+
+    articles = get_articles(url)
+    threshold_date = datetime.now() - timedelta(days = delta)
+    for a in articles:
+        date = datetime.fromtimestamp(mktime(a.published_parsed))
+        if date > threshold_date:
+            filename = get_filename(date, a.title)
+            if not os.path.exists(os.path.join(feedpath_new, filename)) and not os.path.exists(os.path.join(feedpath_read, filename)):
+                text = html_to_markdown(get_html_content(a.link))
+                write_to_file(os.path.join(feedpath_new, filename), text)
+
+
+# Delete articles older than day delta
+def delete_old_articles():
+
+    threshold_date = datetime.now() - timedelta(days = delta)
+    for subdir, dirs, files in os.walk(out_directory):
+
+        # Skip 'loved' directory
+        if not os.path.join(out_directory, 'loved') in subdir:
+            for file in files:
+                date = datetime.strptime(file[:12], '%Y%m%d%H%M')
+                if threshold_date > date:
+                    os.remove(os.path.join(subdir, file))
+
+
+
 def main():
-    entries = get_entries("https://nixos.org/blogs.xml")
-    for e in entries:
-        print(e.title)
-    print(get_content(entries[0].link))
+    lovedpath = os.path.join(out_directory, 'loved')
+    if not os.path.exists(lovedpath):
+        os.makedirs(lovedpath)
+    for feed in feeds:
+        update_feed(feed)
+    delete_old_articles()
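
For reference, a minimal sketch (illustration only, not part of the patch) of the filename scheme from get_filename() and the per-feed directory layout created by update_feed(), assuming the default out_directory ('./out') and the 'News'/'Tagesschau' feed entry defined above; the article title and date are hypothetical:

import os
import re
from datetime import datetime

# Hypothetical article metadata, chosen for illustration.
date = datetime(2020, 4, 14, 1, 36)
title = 'Beispiel: Schlagzeile des Tages!'

# Same transformation as get_filename() in the patch.
stamp = date.strftime('%Y%m%d%H%M')                  # '202004140136'
slug = re.sub('[^A-Za-z0-9 ]+', '', title.lower())   # drop punctuation
slug = re.sub(' ', '_', slug)                        # spaces -> underscores
filename = '{}_{}.md'.format(stamp, slug)

# New articles land under <out_directory>/<category>/<name>/new/.
print(os.path.join('./out', 'News', 'Tagesschau', 'new', filename))
# -> ./out/News/Tagesschau/new/202004140136_beispiel_schlagzeile_des_tages.md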