add optional article scraping

parent 2c27ab316c
commit a081019f51

3 changed files with 70 additions and 24 deletions
@@ -4,17 +4,23 @@ base_directory = '/home/<user>/rss'
 # Articles older than max_age (days) will be deleted and not be added
 max_age = 30
 
+# Date and time format as strftime to be included in the articles
+datetime_format = '%d.%m.%Y %H:%M'
+
 # Feeds
 # The category can be empty (''). The feed will then be stored in the base_directory.
 # The category can also be a path, which will result in subdirectories (e.g. 'technology/hardware').
-# The name can also be empty (''). Feeds with the same category will then be stored in the same directory.
+# The name can be empty, too (''). Feeds with the same category will then be stored in the same directory.
+# If scrape is set to true, the article content will be fetched from its link. Otherwise the content of the RSS article is used.
 
 [[feed]]
 category = 'News'
 name = 'Newssite'
 url = 'https://example.org/feed'
+scrape = false
 
 [[feed]]
 category = 'News'
 name = 'Newssite 2'
 url = 'https://example.org/feed'
+scrape = true
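
The new per-feed scrape key is looked up directly as feed['scrape'] in spiderss.py (see the update_feed hunk below), so every [[feed]] table needs to carry it. A minimal sketch of how such a config looks to the consuming code; the loop and variable names here are illustrative, not part of the commit:

import toml  # same parser spiderss.py uses

config = toml.loads("""
base_directory = '/home/user/rss'
max_age = 30
datetime_format = '%d.%m.%Y %H:%M'

[[feed]]
category = 'News'
name = 'Newssite'
url = 'https://example.org/feed'
scrape = false
""")

for feed in config['feed']:
    # update_feed() accesses feed['scrape'] directly, so the key is
    # effectively mandatory on every [[feed]] table after this commit
    print(feed['name'], feed['scrape'])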

@@ -15,6 +15,7 @@ def print_outline(outline, category):
     print('category = \'{}\''.format(category))
     print('name = \'{}\''.format(outline.text))
     print('url = \'{}\''.format(outline.xmlUrl))
+    print('scrape = false')
     print('')
 
 '''
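
With this added print, the OPML converter emits scrape = false for every generated entry, so the configs it produces satisfy the feed['scrape'] lookup that update_feed now performs.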

spiderss.py (85 changed lines)

@@ -13,12 +13,6 @@ from datetime import datetime, timedelta
 from readability import Document
 from time import mktime
 
-'''
-Static variables
-'''
-
-version = '0.1'
-
 '''
 Output functions
 '''

@@ -57,8 +51,8 @@ def print_logo():
 Utility functions
 '''
 
-# Get HTML content of a webpage
-def get_html_content(url):
+# Get readable HTML of a webpage
+def get_readable_html(url):
     response = requests.get(url)
     doc = Document(response.text)
     return doc.summary()
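
The renamed helper builds on readability-lxml, where Document(html).summary() returns just the main-content markup of a page with boilerplate stripped. A small usage sketch; the URL is a placeholder:

import requests
from readability import Document  # readability-lxml

# placeholder URL for illustration
response = requests.get('https://example.org/some-article')
doc = Document(response.text)
print(doc.title())    # extracted page title
print(doc.summary())  # cleaned HTML of the main content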

@@ -66,7 +60,16 @@ def get_html_content(url):
 
 # Convert HTML to Markdown
 def html_to_markdown(html):
-    return html2text.html2text(html)
+    h = html2text.HTML2Text()
+    h.unicode_snob = True
+    h.ignore_links = True
+    h.ignore_images = False
+    #h.ignore_anchors = True
+    #h.skip_internal_links = True
+    #h.protect_links = True
+    #h.use_automatic_links = True
+    h.body_width = 0
+    return h.handle(html).strip()
 
 
 # Get articles of a feed
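
Compared with the old one-shot html2text.html2text(html) call, the configured instance drops hyperlinks and disables the default hard wrap at 78 columns (body_width = 0). A short illustration with made-up HTML:

import html2text

# made-up snippet for illustration
html = '<p>Read <a href="https://example.org">this</a> article.</p>'

h = html2text.HTML2Text()
h.ignore_links = True  # drop hyperlinks from the output
h.body_width = 0       # no hard wrapping (default wraps at 78 columns)
print(h.handle(html).strip())  # -> Read this article.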

@@ -82,7 +85,7 @@ def write_to_file(filename, text):
     file.close()
 
 
-# Get filename from feedparser article
+# Get filename from a date and a title
 def get_filename(date, title):
 
     # Get date as single block

@@ -95,11 +98,52 @@ def get_filename(date, title):
     return '{}_{}.md'.format(date, title)
 
 
-# Get Markdown text from an article
-def get_article_text(article):
-    head = '# {}\n\n[Link]({})'.format(article.title, article.link)
-    body = html_to_markdown(get_html_content(article.link))
-    return '{}\n\n{}'.format(head, body)
+# Get summary snippet for an article
+def get_article_summary(article):
+    try:
+        h = html2text.HTML2Text()
+        h.unicode_snob = True
+        h.ignore_links = True
+        h.ignore_images = True
+        #h.ignore_anchors = True
+        #h.skip_internal_links = True
+        h.body_width = 0
+        summary = h.handle(article.summary).split('\n\n')[0].strip()
+        return '**{}**\n\n'.format(summary)
+    except:
+        return ''
+
+
+# Get image snippet for an article
+def get_article_image(article):
+    try:
+        image_url = re.search('(?P<image>https?://\S+(\.png|\.jpg|\.jpeg))', str(article), re.IGNORECASE).group('image')
+        return '![]({})\n\n'.format(image_url)
+    except:
+        return ''
+
+
+# Get text from an article
+def get_article(article, scrape):
+
+    # Construct head of article
+    image_url = get_article_image(article)
+    date = datetime.fromtimestamp(mktime(article.published_parsed)).strftime(datetime_format)
+    head = '# {}\n\n{}{}{} - [Link]({})'.format(article.title, image_url, get_article_summary(article), date, article.link)
+
+    # Get body of article
+    if scrape:
+        body_html = get_readable_html(article.link)
+    else:
+        body_html = ''
+        if hasattr(article, 'content'):
+            for c in article.content:
+                if c.type == 'text/html':
+                    body_html += c.value
+
+    body = html_to_markdown(body_html)
+
+    return '{}\n\n---\n\n{}'.format(head, body)
 
 
 # Update feed
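
Taken together, an article file written after this commit starts with the title, an optional image and a bolded first summary paragraph, then the formatted date and link, a horizontal rule, and the body. Roughly, with invented values:

# Some headline

![](https://example.org/teaser.jpg)

**First paragraph of the feed summary.**

21.02.2021 14:30 - [Link](https://example.org/article)

---

Article body converted to Markdown ...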

@@ -122,7 +166,7 @@ def update_feed(feed):
         if date > threshold_date:
             filename = get_filename(date, a.title)
             if not os.path.exists(os.path.join(feedpath_new, filename)) and not os.path.exists(os.path.join(feedpath_read, filename)):
-                text = get_article_text(a)
+                text = get_article(a, feed['scrape'])
                 write_to_file(os.path.join(feedpath_new, filename), text)
                 log(' added article "{}"'.format(a.title))
     except Exception as e:

@@ -150,12 +194,13 @@ def remove_old_articles():
 # Parse config file
 def load_config(filepath):
 
-    global base_directory, max_age, feeds
+    global base_directory, max_age, datetime_format, feeds
 
     try:
         config = toml.load(filepath)
         base_directory = config['base_directory']
         max_age = config['max_age']
+        datetime_format = config['datetime_format']
         feeds = config['feed']
     except Exception as e:
         error('while parsing config: {}'.format(e))
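
The datetime_format value from the config is handed straight to strftime in get_article; with the shipped '%d.%m.%Y %H:%M' a timestamp comes out as day.month.year hour:minute. A quick check with an arbitrary date:

from datetime import datetime

# arbitrary example date, not from the commit
dt = datetime(2021, 2, 21, 14, 30)
print(dt.strftime('%d.%m.%Y %H:%M'))  # -> 21.02.2021 14:30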

@@ -191,20 +236,14 @@ def main():
 
     # Initialize parser
     parser = argparse.ArgumentParser(description = 'Crawl RSS feeds and store articles as Markdown files.')
-    parser.add_argument('-V', '--version', action = 'store_true', help = 'show version and exit')
     parser.add_argument('-v', '--verbose', action = 'store_true', help = 'verbose output')
    parser.add_argument('-c', '--config', default = './config.toml', help = 'config file (default: ./config.toml)')
 
     # Get args
     args = parser.parse_args()
-    show_version = args.version
     verbose = args.verbose
     config = args.config
 
-    if show_version:
-        print('spiderss v{}'.format(version))
-        sys.exit()
-
     # Main routine
     print_logo()
     load_config(config)