add postprocessing feature
parent 400631fba9
commit 50f54f20c5
3 changed files with 85 additions and 56 deletions
README.md (11 changed lines)
@@ -9,7 +9,8 @@ Read the news you want, the way you want it.
 Without advertisements, clickbait and trackers.
 Drop unresponsive web interfaces and stop accepting cookies, because plaintext is God.

-Articles are scraped as Markdown files from the original article web page and stored in a special folder structure.
+Articles are scraped by default as Markdown files from the original article web page and stored in a special folder structure.
+You can convert articles to your favourite file format by defining your own postprocessor.

 __Note:__ This script is under development and far from being complete.
 So far it works for most of the feeds I read.
@@ -65,6 +66,12 @@ base_directory = '/home/<user>/rss'
 # Articles older than max_age (days) will be deleted and not be added.
 max_age = 30

+# Postprocessing command for the articles. The article is written to its stdin in HTML format and the result is read back from its stdout.
+postprocessor = 'pandoc -f html -t markdown_strict-raw_html --reference-links --reference-location=document'
+
+# File ending for the article files.
+fileending = 'md'
+
 # Date and time format as strftime to be included in the articles.
 datetime_format = '%d.%m.%Y %H:%M'
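The postprocessor setting added above is a plain shell command: spiderss writes the assembled article to the command's stdin as HTML and takes whatever the command prints on stdout as the file content, so any filter with that behaviour can be plugged in. A minimal sketch of trying the default command by hand (assumes pandoc is installed; the sample HTML is invented for illustration):

```python
# Sketch only: pipe a small HTML fragment through the configured postprocessor,
# the same way spiderss feeds an article to it (HTML on stdin, result on stdout).
import subprocess

postprocessor = 'pandoc -f html -t markdown_strict-raw_html --reference-links --reference-location=document'
sample_html = '<h1>Example article</h1>\n\n<p>Hello <b>world</b>.</p>'

result = subprocess.run(postprocessor.split(' '),
                        input=sample_html.encode(),
                        stdout=subprocess.PIPE,
                        check=True)
print(result.stdout.decode().strip())
```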
@@ -146,6 +153,6 @@ Just synchronize the base_directory with [Syncthing](https://syncthing.net/), [r

 ## Acknowledgements

-Thanks to all the people which created the nice libraries, this project in based on.
+Thanks to all the people who created the nice libraries this project is based on.
 And also thanks to Dieter Steffmann who created the Canterbury font, which is used for the logo.
 You can find it in the `fonts/` directory.
@@ -7,6 +7,12 @@ max_age = 30
 # Date and time format as strftime to be included in the articles.
 datetime_format = '%d.%m.%Y %H:%M'

+# Postprocessing command for the articles. The article is written to its stdin in HTML format and the result is read back from its stdout.
+postprocessor = 'pandoc -f html -t markdown_strict-raw_html --reference-links --reference-location=document'
+
+# File ending for the article files.
+fileending = 'md'
+
 # Feeds
 # The category can be empty (''). The feed will then be stored in the base_directory.
 # The category can also be a path, which will result in subdirectories (e.g. 'technology/hardware').
spiderss.py (124 changed lines)
@@ -52,39 +52,19 @@ def print_logo():
 Utility functions
 '''

-# Get readable HTML of a webpage
-def get_readable_html(url):
-    response = requests.get(url)
-    doc = Document(response.text)
-    return doc.summary()
-
-
-# Convert HTML to Markdown
-def html_to_markdown(html):
-    h = html2text.HTML2Text()
-    h.unicode_snob = True
-    h.ignore_links = True
-    h.ignore_images = False
-    #h.ignore_anchors = True
-    #h.skip_internal_links = True
-    #h.protect_links = True
-    #h.use_automatic_links = True
-    h.body_width = 0
-    return h.handle(html).strip()
-
-
 # Get articles of a feed
-def get_articles(feed_url):
-    feed = feedparser.parse(feed_url)
+def get_articles(feed):
+    feed = feedparser.parse(feed['url'])
     return feed.entries


 # Write text to file
 def write_to_file(filepath, text):

-    # Postprocess article with pandoc and write to file
-    pandoc = subprocess.Popen(['pandoc', '-f', 'markdown', '-t', 'markdown', '-o', filepath], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
-    pandoc.communicate(input = text.encode())
+    file = open(filepath, 'w')
+    file.write(text)
+    file.close()


 # Get filename from a date and a title
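With this hunk, get_articles() takes the whole feed entry from the config instead of a bare URL, and write_to_file() now just writes the text it is given; the hard-coded pandoc call that used to live here moves into the configurable postprocessor further down. A hedged sketch of the new call shape (the feed dict is hypothetical; its keys mirror the ones update_feed() reads):

```python
# Uses the functions defined above in spiderss.py.
# Hypothetical feed entry with the keys that update_feed() expects from the config.
feed = {
    'name': 'example',
    'category': 'technology',
    'url': 'https://example.org/feed.xml',
    'scrape': False,
}

articles = get_articles(feed)   # feedparser fetches feed['url'] and returns the entries
write_to_file('/tmp/example.md', 'already postprocessed article text')
```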
@@ -97,72 +77,104 @@ def get_filename(date, title):
     title = re.sub('[^A-Za-z0-9 ]+', '', title.lower())
     title = re.sub(' ', '_', title)

-    return '{}_{}.md'.format(date, title)
+    return '{}_{}.{}'.format(date, title, fileending)


+# Get image snippet for an article
+def get_article_image(article):
+
+    try:
+        image_url = re.search('(?P<image>https?://\S+(\.png|\.jpg|\.jpeg))', str(article), re.IGNORECASE).group('image')
+        return '<img src="{}" alt="Image">\n\n'.format(image_url)
+    except:
+        return ''
+
+
 # Get summary snippet for an article
 def get_article_summary(article):

     try:
         h = html2text.HTML2Text()
         h.unicode_snob = True
         h.ignore_links = True
         h.ignore_images = True
-        #h.ignore_anchors = True
-        #h.skip_internal_links = True
         h.body_width = 0
         summary = h.handle(article.summary).split('\n\n')[0].strip()
-        return '**{}**\n\n'.format(summary)
+        return '<p><b>{}</b></p>\n\n'.format(summary)
     except:
         return ''


-# Get image snippet for an article
-def get_article_image(article):
-    try:
-        image_url = re.search('(?P<image>https?://\S+(\.png|\.jpg|\.jpeg))', str(article), re.IGNORECASE).group('image')
-        return '\n\n'.format(image_url)
-    except:
-        return ''
-
-
-# Get text from an article
-def get_article(article, scrape):
-
-    # Construct head of article
-    image_url = get_article_image(article)
-    date = datetime.fromtimestamp(mktime(article.published_parsed)).strftime(datetime_format)
-    head = '# {}\n\n{}{}{}\n\n[Link]({})'.format(article.title, image_url, get_article_summary(article), date, article.link)
-
-    # Get body of article
+# Get article body either from web or its content
+def get_article_body(article, scrape):
+
+    body = ''
+
+    # If scrape, get article with readability
     if scrape:
-        body_html = get_readable_html(article.link)
+        response = requests.get(article.link)
+        doc = Document(response.text)
+        body = doc.summary()
+
+    # Else construct from article content
     else:
-        body_html = ''
         if hasattr(article, 'content'):
             for c in article.content:
                 if c.type == 'text/html':
-                    body_html += c.value
+                    body += c.value

-    body = html_to_markdown(body_html)
+    return body
+
+
+# Postprocess HTML
+def postprocess(text):
+
+    processor = subprocess.Popen(postprocessor.split(' '), stdin = subprocess.PIPE, stdout = subprocess.PIPE, stderr = subprocess.STDOUT)
+    output = processor.communicate(input = text.encode())[0].decode().strip()
+
+    return output
+
+
+# Get constructed article
+def get_article(article, scrape):
+
+    # Construct head of article
+    image = get_article_image(article)
+    summary = get_article_summary(article)
+    date = datetime.fromtimestamp(mktime(article.published_parsed)).strftime(datetime_format)
+    head = '<h1>{}</h1>\n\n{}{}<p>{} - <a href={}>Link</a></p>'.format(article.title, image, summary, date, article.link)
+
+    # Get body of article
+    body = get_article_body(article, scrape)
+
+    # Postprocess article
+    article_text = postprocess('{}\n\n<hr>\n\n{}'.format(head, body)).strip()

-    return '{}\n\n---\n\n{}'.format(head, body)
+    return article_text


 # Update feed
 def update_feed(feed):

     log(' updating feed "{}"'.format(feed['name']))

+    # Set feedpaths
     feedpath_new = os.path.join(base_directory, feed['category'], feed['name'], 'new')
     feedpath_read = os.path.join(base_directory, feed['category'], feed['name'], 'read')

     if not os.path.exists(feedpath_new):
         os.makedirs(feedpath_new)

     if not os.path.exists(feedpath_read):
         os.makedirs(feedpath_read)

-    articles = get_articles(feed['url'])
+    # Update articles
+    articles = get_articles(feed)
     threshold_date = datetime.now() - timedelta(days = max_age)

     for a in articles:

         try:
             date = datetime.fromtimestamp(mktime(a.published_parsed))
             if date > threshold_date:
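The rewritten get_article() now assembles a single HTML fragment (head, an <hr> separator, then the body) and pipes the whole thing through postprocess(), so the on-disk format is decided entirely by the configured command. A rough illustration of that intermediate HTML (all values are invented placeholders, and the real head also carries the image and summary snippets):

```python
# Uses postprocess() as defined above in spiderss.py.
# Illustration only: the shape of the HTML that get_article() hands to postprocess().
head = '<h1>Example title</h1>\n\n<p>01.01.2024 10:00 - <a href=https://example.org/article>Link</a></p>'
body = '<p>Readable article body, scraped with readability or taken from the feed content.</p>'

html_article = '{}\n\n<hr>\n\n{}'.format(head, body)
markdown_article = postprocess(html_article)   # runs the postprocessor command from the config
```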
@@ -171,6 +183,7 @@ def update_feed(feed):
                 text = get_article(a, feed['scrape'])
                 write_to_file(os.path.join(feedpath_new, filename), text)
                 log(' added article "{}"'.format(a.title))

         except Exception as e:
             error('while parsing feed article "{}" from feed "{}": {}'.format(a.title, feed['name'], e))
@@ -193,16 +206,19 @@ def remove_old_articles():

     log(' removed {} articles'.format(count))


 # Parse config file
 def load_config(filepath):

-    global base_directory, max_age, datetime_format, feeds
+    global base_directory, max_age, datetime_format, postprocessor, fileending, feeds

     try:
         config = toml.load(filepath)
         base_directory = config['base_directory']
         max_age = config['max_age']
         datetime_format = config['datetime_format']
+        postprocessor = config['postprocessor']
+        fileending = config['fileending']
         feeds = config['feed']
     except Exception as e:
         error('while parsing config: {}'.format(e))
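Since load_config() now reads the two new keys unconditionally, an existing config file needs postprocessor and fileending entries as well. A hedged sketch of the dictionary toml.load() has to produce for the assignments above (values are placeholders):

```python
# Placeholder values; the key names are exactly the ones load_config() reads.
config = {
    'base_directory': '/home/user/rss',
    'max_age': 30,
    'datetime_format': '%d.%m.%Y %H:%M',
    'postprocessor': 'pandoc -f html -t markdown_strict-raw_html --reference-links --reference-location=document',
    'fileending': 'md',
    'feed': [
        {'name': 'example', 'category': '', 'url': 'https://example.org/feed.xml', 'scrape': False},
    ],
}
```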