add postprocessing feature

Denis Lehmann 2020-04-18 11:36:31 +02:00
parent 400631fba9
commit 50f54f20c5
3 changed files with 85 additions and 56 deletions


@@ -9,7 +9,8 @@ Read the news you want, the way you want it.
Without advertisements, clickbait and trackers.
Drop unresponsive web interfaces and stop accepting cookies, because plaintext is God.
Articles are scraped as Markdown files from the original article web page and stored in a special folder structure.
Articles are scraped by default as Markdown files from the original article web page and stored in a special folder structure.
You can convert articles into your favourite file format by defining your own postprocessor.
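For example, a hypothetical configuration (assuming pandoc is installed; these are not the shipped defaults) that stores articles as plain text instead of Markdown could look like this:

```toml
# Hypothetical example: convert the scraped HTML to plain text
postprocessor = 'pandoc -f html -t plain'
fileending = 'txt'
```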
__Note:__ This script is under development and far from being complete.
So far it works for most of the feeds I read.
@@ -65,6 +66,12 @@ base_directory = '/home/<user>/rss'
# Articles older than max_age (days) will be deleted and not be added.
max_age = 30
# Postprocessing command for the articles. The article is piped to the command's stdin as HTML and the processed result is read from its stdout.
postprocessor = 'pandoc -f html -t markdown_strict-raw_html --reference-links --reference-location=document'
# File ending (extension) for the article files.
fileending = 'md'
# Date and time format as strftime to be included in the articles.
datetime_format = '%d.%m.%Y %H:%M'
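# As an illustration only (a hypothetical alternative, not part of the defaults above),
# an ISO-style timestamp such as 2020-04-18 11:36 could be configured with:
# datetime_format = '%Y-%m-%d %H:%M'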
@@ -146,6 +153,6 @@ Just synchronize the base_directory with [Syncthing](https://syncthing.net/), [r
## Acknowledgements
Thanks to all the people which created the nice libraries, this project in based on.
Thanks to all the people who created the nice libraries this project is based on.
And also thanks to Dieter Steffmann who created the Canterbury font, which is used for the logo.
You can find it in the `fonts/` directory.


@@ -7,6 +7,12 @@ max_age = 30
# Date and time format as strftime to be included in the articles.
datetime_format = '%d.%m.%Y %H:%M'
# Postprocessing command for the articles. The article is piped to the command's stdin as HTML and the processed result is read from its stdout.
postprocessor = 'pandoc -f html -t markdown_strict-raw_html --reference-links --reference-location=document'
# File ending (extension) for the article files.
fileending = 'md'
# Feeds
# The category can be empty (''). The feed will then be stored in the base_directory.
# The category can also be a path, which will result in subdirectories (e.g. 'technology/hardware').
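# A hypothetical example entry (assuming the [[feed]] table syntax; the keys below are the ones the script reads):
# [[feed]]
# name = 'Example Feed'
# url = 'https://example.com/rss.xml'
# category = 'technology/hardware'
# scrape = true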


@@ -52,39 +52,19 @@ def print_logo():
Utility functions
'''
# Get readable HTML of a webpage
def get_readable_html(url):
    response = requests.get(url)
    doc = Document(response.text)
    return doc.summary()
# Convert HTML to Markdown
def html_to_markdown(html):
    h = html2text.HTML2Text()
    h.unicode_snob = True
    h.ignore_links = True
    h.ignore_images = False
    #h.ignore_anchors = True
    #h.skip_internal_links = True
    #h.protect_links = True
    #h.use_automatic_links = True
    h.body_width = 0
    return h.handle(html).strip()
# Get articles of a feed
def get_articles(feed_url):
    feed = feedparser.parse(feed_url)
def get_articles(feed):
    feed = feedparser.parse(feed['url'])
    return feed.entries
# Write text to file
def write_to_file(filepath, text):
    # Postprocess article with pandoc and write to file
    pandoc = subprocess.Popen(['pandoc', '-f', 'markdown', '-t', 'markdown', '-o', filepath], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    pandoc.communicate(input = text.encode())
    file = open(filepath, 'w')
    file.write(text)
    file.close()
# Get filename from a date and a title
@@ -97,72 +77,104 @@ def get_filename(date, title):
    title = re.sub('[^A-Za-z0-9 ]+', '', title.lower())
    title = re.sub(' ', '_', title)
    return '{}_{}.md'.format(date, title)
    return '{}_{}.{}'.format(date, title, fileending)
# Get image snippet for an article
def get_article_image(article):
    try:
        image_url = re.search('(?P<image>https?://\S+(\.png|\.jpg|\.jpeg))', str(article), re.IGNORECASE).group('image')
        return '<img src="{}" alt="Image">\n\n'.format(image_url)
    except:
        return ''
# Get summary snippet for an article
def get_article_summary(article):
    try:
        h = html2text.HTML2Text()
        h.unicode_snob = True
        h.ignore_links = True
        h.ignore_images = True
        #h.ignore_anchors = True
        #h.skip_internal_links = True
        h.body_width = 0
        summary = h.handle(article.summary).split('\n\n')[0].strip()
        return '**{}**\n\n'.format(summary)
        return '<p><b>{}</b></p>\n\n'.format(summary)
    except:
        return ''
# Get image snippet for an article
def get_article_image(article):
    try:
        image_url = re.search('(?P<image>https?://\S+(\.png|\.jpg|\.jpeg))', str(article), re.IGNORECASE).group('image')
        return '![Image]({})\n\n'.format(image_url)
    except:
        return ''
# Get article body either from web or its content
def get_article_body(article, scrape):
    body = ''
# Get text from an article
def get_article(article, scrape):
    # Construct head of article
    image_url = get_article_image(article)
    date = datetime.fromtimestamp(mktime(article.published_parsed)).strftime(datetime_format)
    head = '# {}\n\n{}{}{}\n\n[Link]({})'.format(article.title, image_url, get_article_summary(article), date, article.link)
    # Get body of article
    # If scrape, get article with readability
    if scrape:
        body_html = get_readable_html(article.link)
        response = requests.get(article.link)
        doc = Document(response.text)
        body = doc.summary()
    # Else construct from article content
    else:
        body_html = ''
        if hasattr(article, 'content'):
            for c in article.content:
                if c.type == 'text/html':
                    body_html += c.value
                    body += c.value
    body = html_to_markdown(body_html)
    return body
# Postprocess HTML
def postprocess(text):
    processor = subprocess.Popen(postprocessor.split(' '), stdin = subprocess.PIPE, stdout = subprocess.PIPE, stderr = subprocess.STDOUT)
    output = processor.communicate(input = text.encode())[0].decode().strip()
    return output
# Get constructed article
def get_article(article, scrape):
    # Construct head of article
    image = get_article_image(article)
    summary = get_article_summary(article)
    date = datetime.fromtimestamp(mktime(article.published_parsed)).strftime(datetime_format)
    head = '<h1>{}</h1>\n\n{}{}<p>{} - <a href={}>Link</a></p>'.format(article.title, image, summary, date, article.link)
    # Get body of article
    body = get_article_body(article, scrape)
    # Postprocess article
    article_text = postprocess('{}\n\n<hr>\n\n{}'.format(head, body)).strip()
    return article_text
    return '{}\n\n---\n\n{}'.format(head, body)
# Update feed
def update_feed(feed):
    log(' updating feed "{}"'.format(feed['name']))
    # Set feedpaths
    feedpath_new = os.path.join(base_directory, feed['category'], feed['name'], 'new')
    feedpath_read = os.path.join(base_directory, feed['category'], feed['name'], 'read')
    if not os.path.exists(feedpath_new):
        os.makedirs(feedpath_new)
    if not os.path.exists(feedpath_read):
        os.makedirs(feedpath_read)
    articles = get_articles(feed['url'])
    # Update articles
    articles = get_articles(feed)
    threshold_date = datetime.now() - timedelta(days = max_age)
    for a in articles:
        try:
            date = datetime.fromtimestamp(mktime(a.published_parsed))
            if date > threshold_date:
@@ -171,6 +183,7 @@ def update_feed(feed):
                text = get_article(a, feed['scrape'])
                write_to_file(os.path.join(feedpath_new, filename), text)
                log(' added article "{}"'.format(a.title))
        except Exception as e:
            error('while parsing feed article "{}" from feed "{}": {}'.format(a.title, feed['name'], e))
@@ -193,16 +206,19 @@ def remove_old_articles():
    log(' removed {} articles'.format(count))
# Parse config file
def load_config(filepath):
    global base_directory, max_age, datetime_format, feeds
    global base_directory, max_age, datetime_format, postprocessor, fileending, feeds
    try:
        config = toml.load(filepath)
        base_directory = config['base_directory']
        max_age = config['max_age']
        datetime_format = config['datetime_format']
        postprocessor = config['postprocessor']
        fileending = config['fileending']
        feeds = config['feed']
    except Exception as e:
        error('while parsing config: {}'.format(e))