175 lines
4.8 KiB
Python
175 lines
4.8 KiB
Python
import feedparser
|
|
from readability import Document
|
|
import requests
|
|
import html2text
|
|
import re
|
|
import os
|
|
import time
|
|
from time import mktime
|
|
from datetime import datetime, timedelta
|
|
from config import base_directory, update_interval, max_age, verbose, feeds
|
|
import logging
|
|
import sys, getopt
|
|
|
|
|
|
'''
|
|
Output functions
|
|
'''
|
|
|
|
def log(text):
|
|
if verbose:
|
|
#logging.info(text)
|
|
print('{} - {}'.format(datetime.now().strftime('%d.%m %H:%M'), text))
|
|
|
|
|
|
def error(text):
|
|
#logging.error(text)
|
|
print('{} - ERROR: {}'.format(datetime.now().strftime('%d.%m %H:%M'), text))
|
|
|
|
|
|
def print_logo():
|
|
logo = '''
|
|
;:
|
|
.N' ,K:
|
|
,O .0MWx' lk 0;
|
|
,kl;';O lx. .xc :k, .kMMXl.c .:x. .xl .dl :kl,'oo .xd:,,O.
|
|
kNOokOWc ;WMKccXMX .WMX. :XM,,OMMl oXMcNMd 'oMMk:XW:.OWddO0N, cKNlkONk
|
|
MMo 0c KMK :MM' XMO oMM. MMl cMM. ON: .MMl 'MM, .K' OMX oO
|
|
WMWxOWMN: KMK ;MM' XMO oMM. MMl cMM:c; .MMl .WMXx0MMX. xMMOkNMWk
|
|
'X; .MMo KMK ;MM' XMO oMM. MMl cMM, .MMl :X. ;MM, .0d 0MX
|
|
.Kdc:'MMo.oNMNl;lMW. .WM0 KMMk:'MMl dMM: .. cMMk.' ,Xlc;cMM, xOl:'KMX
|
|
;kddkNKl. XMNkWk, :N0; .'cOW0c. ,lOW0; .:0Nl. okddOW0:. .kdoxXNd,
|
|
WMX
|
|
;..cc
|
|
'''
|
|
|
|
print(logo)
|
|
|
|
'''
|
|
Utility functions
|
|
'''
|
|
|
|
# Get content of a webpage
|
|
def get_html_content(url):
|
|
response = requests.get(url)
|
|
doc = Document(response.text)
|
|
return doc.summary()
|
|
|
|
|
|
def html_to_markdown(html):
|
|
return html2text.html2text(html)
|
|
|
|
|
|
# Get articles of a feed
|
|
def get_articles(feed):
|
|
try:
|
|
feed = feedparser.parse(feed[2])
|
|
return feed.entries
|
|
except Exception as e:
|
|
error('failed to get feed "{}: {}"'.format(feed[1], e.msg))
|
|
return []
|
|
|
|
|
|
def write_to_file(filename, text):
|
|
file = open(filename, 'w')
|
|
file.write(text)
|
|
file.close()
|
|
|
|
|
|
# Get filename from feedparser article
|
|
def get_filename(date, title):
|
|
|
|
# Get date as single block
|
|
date = date.strftime('%Y%m%d%H%M')
|
|
|
|
# Get title as lowercase words concatenated with underscores
|
|
title = re.sub('[^A-Za-z0-9 ]+', '', title.lower())
|
|
title = re.sub(' ', '_', title)
|
|
|
|
return '{}_{}.md'.format(date, title)
|
|
|
|
|
|
# Update feed
|
|
def update_feed(feed):
|
|
|
|
category = feed[0]
|
|
name = feed[1]
|
|
|
|
log('updating feed "{}"'.format(name))
|
|
|
|
feedpath_new = os.path.join(base_directory, category, name, 'new')
|
|
feedpath_read = os.path.join(base_directory, category, name, 'read')
|
|
if not os.path.exists(feedpath_new):
|
|
os.makedirs(feedpath_new)
|
|
if not os.path.exists(feedpath_read):
|
|
os.makedirs(feedpath_read)
|
|
|
|
articles = get_articles(feed)
|
|
threshold_date = datetime.now() - timedelta(days = max_age)
|
|
for a in articles:
|
|
date = datetime.fromtimestamp(mktime(a.published_parsed))
|
|
if date > threshold_date:
|
|
filename = get_filename(date, a.title)
|
|
if not os.path.exists(os.path.join(feedpath_new, filename)) and not os.path.exists(os.path.join(feedpath_read, filename)):
|
|
text = html_to_markdown(get_html_content(a.link))
|
|
write_to_file(os.path.join(feedpath_new, filename), text)
|
|
|
|
|
|
# Delete articles older than max_age
|
|
def delete_old_articles():
|
|
|
|
threshold_date = datetime.now() - timedelta(days = max_age)
|
|
for subdir, dirs, files in os.walk(base_directory):
|
|
|
|
# Skip 'loved' directory
|
|
if not os.path.join(base_directory, 'loved') in subdir:
|
|
for file in files:
|
|
date = datetime.strptime(file[:12], '%Y%m%d%H%M')
|
|
if threshold_date > date:
|
|
os.remove(os.path.join(subdir, file))
|
|
log('deleted old articles')
|
|
|
|
|
|
def initialize():
|
|
|
|
# Create 'loved' directory if not existent
|
|
lovedpath = os.path.join(base_directory, 'loved')
|
|
if not os.path.exists(lovedpath):
|
|
os.makedirs(lovedpath)
|
|
|
|
|
|
def crawl():
|
|
|
|
# Main loop
|
|
while True:
|
|
for feed in feeds:
|
|
update_feed(feed)
|
|
delete_old_articles()
|
|
time.sleep(update_interval * 60)
|
|
|
|
def get_help_message():
|
|
return 'spiderrss.py | run'
|
|
|
|
def main(argv):
|
|
|
|
print_logo()
|
|
|
|
## Get arguments
|
|
#try:
|
|
# opts, args = getopt,getopt(argv, 'h', ['ifile=', 'ofile='])
|
|
#except:
|
|
# print('spiderrss.py [ run | create_config <file> ]')
|
|
|
|
#for opt, arg in opts:
|
|
# if opt == '-h':
|
|
# print('spiderrss.py [ run | create_config <file> ]')
|
|
|
|
|
|
#initialize()
|
|
#crawl()
|
|
|
|
|
|
|
|
if __name__ == '__main__':
|
|
main(sys.argv[1:])
|
|
|