add optional article scraping

Denis Lehmann 2020-04-16 01:43:12 +02:00
parent 2c27ab316c
commit a081019f51
3 changed files with 70 additions and 24 deletions

View file

@@ -4,17 +4,23 @@ base_directory = '/home/<user>/rss'
 # Articles older than max_age (days) will be deleted and not be added
 max_age = 30
+# Date and time format as strftime to be included in the articles
+datetime_format = '%d.%m.%Y %H:%M'
 
 # Feeds
 # The category can be empty (''). The feed will then be stored in the base_directory.
 # The category can also be a path, which will result in subdirectories (e.g. 'technology/hardware').
-# The name can also be empty (''). Feeds with the same category will then be stored in the same directory.
+# The name can be empty, too (''). Feeds with the same category will then be stored in the same directory.
+# If scrape is set to true, the article content will be fetched from its link. Otherwise the content of the RSS article is used.
 [[feed]]
 category = 'News'
 name = 'Newssite'
 url = 'https://example.org/feed'
+scrape = false
 
 [[feed]]
 category = 'News'
 name = 'Newssite 2'
 url = 'https://example.org/feed'
+scrape = true
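
As an editorial aside (not part of this commit): the new datetime_format value is a plain strftime pattern, so a quick sketch of how it renders a timestamp:

    from datetime import datetime

    # Sketch only: render a timestamp with the pattern from the example config above
    print(datetime(2020, 4, 16, 1, 43).strftime('%d.%m.%Y %H:%M'))   # 16.04.2020 01:43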

View file

@@ -15,6 +15,7 @@ def print_outline(outline, category):
     print('category = \'{}\''.format(category))
     print('name = \'{}\''.format(outline.text))
     print('url = \'{}\''.format(outline.xmlUrl))
+    print('scrape = false')
     print('')
 '''

View file

@@ -13,12 +13,6 @@ from datetime import datetime, timedelta
 from readability import Document
 from time import mktime
 
-'''
-Static variables
-'''
-version = '0.1'
-
 '''
 Output functions
 '''
@@ -57,8 +51,8 @@ def print_logo():
 Utility functions
 '''
 
-# Get HTML content of a webpage
-def get_html_content(url):
+# Get readable HTML of a webpage
+def get_readable_html(url):
     response = requests.get(url)
     doc = Document(response.text)
     return doc.summary()
@@ -66,7 +60,16 @@ def get_html_content(url):
 # Convert HTML to Markdown
 def html_to_markdown(html):
-    return html2text.html2text(html)
+    h = html2text.HTML2Text()
+    h.unicode_snob = True
+    h.ignore_links = True
+    h.ignore_images = False
+    #h.ignore_anchors = True
+    #h.skip_internal_links = True
+    #h.protect_links = True
+    #h.use_automatic_links = True
+    h.body_width = 0
+    return h.handle(html).strip()
 
 # Get articles of a feed
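
For illustration only (not from the commit), the HTML2Text options set above behave roughly like this on a small, made-up snippet:

    import html2text

    # Sketch: same converter settings as html_to_markdown above, applied to a made-up snippet
    h = html2text.HTML2Text()
    h.unicode_snob = True
    h.ignore_links = True
    h.body_width = 0
    print(h.handle('<p>Hello <a href="https://example.org">world</a></p>').strip())   # Hello world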
@@ -82,7 +85,7 @@ def write_to_file(filename, text):
     file.close()
 
-# Get filename from feedparser article
+# Get filename from a date and a title
 def get_filename(date, title):
 
     # Get date as single block
@@ -95,11 +98,52 @@ def get_filename(date, title):
     return '{}_{}.md'.format(date, title)
 
-# Get Markdown text from an article
-def get_article_text(article):
-    head = '# {}\n\n[Link]({})'.format(article.title, article.link)
-    body = html_to_markdown(get_html_content(article.link))
-    return '{}\n\n{}'.format(head, body)
+# Get summary snippet for an article
+def get_article_summary(article):
+    try:
+        h = html2text.HTML2Text()
+        h.unicode_snob = True
+        h.ignore_links = True
+        h.ignore_images = True
+        #h.ignore_anchors = True
+        #h.skip_internal_links = True
+        h.body_width = 0
+        summary = h.handle(article.summary).split('\n\n')[0].strip()
+        return '**{}**\n\n'.format(summary)
+    except:
+        return ''
+
+# Get image snippet for an article
+def get_article_image(article):
+    try:
+        image_url = re.search('(?P<image>https?://\S+(\.png|\.jpg|\.jpeg))', str(article), re.IGNORECASE).group('image')
+        return '![Image]({})\n\n'.format(image_url)
+    except:
+        return ''
+
+# Get text from an article
+def get_article(article, scrape):
+
+    # Construct head of article
+    image_url = get_article_image(article)
+    date = datetime.fromtimestamp(mktime(article.published_parsed)).strftime(datetime_format)
+    head = '# {}\n\n{}{}{} - [Link]({})'.format(article.title, image_url, get_article_summary(article), date, article.link)
+
+    # Get body of article
+    if scrape:
+        body_html = get_readable_html(article.link)
+    else:
+        body_html = ''
+        if hasattr(article, 'content'):
+            for c in article.content:
+                if c.type == 'text/html':
+                    body_html += c.value
+    body = html_to_markdown(body_html)
+
+    return '{}\n\n---\n\n{}'.format(head, body)
 
 # Update feed
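
As an illustrative aside (not part of the commit), the pattern used in get_article_image simply grabs the first .png/.jpg/.jpeg URL from the serialized article; a made-up sample:

    import re

    # Sketch: the same pattern as in get_article_image, run on a made-up article string
    sample = "<img src='https://example.org/pic.jpg'> some text"
    match = re.search('(?P<image>https?://\S+(\.png|\.jpg|\.jpeg))', sample, re.IGNORECASE)
    print(match.group('image'))   # https://example.org/pic.jpg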
@@ -122,7 +166,7 @@ def update_feed(feed):
             if date > threshold_date:
                 filename = get_filename(date, a.title)
                 if not os.path.exists(os.path.join(feedpath_new, filename)) and not os.path.exists(os.path.join(feedpath_read, filename)):
-                    text = get_article_text(a)
+                    text = get_article(a, feed['scrape'])
                     write_to_file(os.path.join(feedpath_new, filename), text)
                     log(' added article "{}"'.format(a.title))
 
         except Exception as e:
@@ -150,12 +194,13 @@ def remove_old_articles():
 # Parse config file
 def load_config(filepath):
-    global base_directory, max_age, feeds
+    global base_directory, max_age, datetime_format, feeds
     try:
         config = toml.load(filepath)
         base_directory = config['base_directory']
         max_age = config['max_age']
+        datetime_format = config['datetime_format']
         feeds = config['feed']
     except Exception as e:
         error('while parsing config: {}'.format(e))
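
For illustration (not part of the commit), toml parses the new scrape key to a native boolean, which is what update_feed later reads as feed['scrape']:

    import toml

    # Sketch: parsing a minimal feed entry; values mirror the example config, not real data
    cfg = toml.loads("[[feed]]\nname = 'Newssite'\nscrape = true")
    print(cfg['feed'][0]['scrape'])   # True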
@@ -191,20 +236,14 @@ def main():
     # Initialize parser
     parser = argparse.ArgumentParser(description = 'Crawl RSS feeds and store articles as Markdown files.')
-    parser.add_argument('-V', '--version', action = 'store_true', help = 'show version and exit')
     parser.add_argument('-v', '--verbose', action = 'store_true', help = 'verbose output')
     parser.add_argument('-c', '--config', default = './config.toml', help = 'config file (default: ./config.toml)')
 
     # Get args
     args = parser.parse_args()
-    show_version = args.version
     verbose = args.verbose
     config = args.config
 
-    if show_version:
-        print('spiderss v{}'.format(version))
-        sys.exit()
-
     # Main routine
     print_logo()
     load_config(config)