Blacken code

Denis Lehmann 2020-08-29 11:45:59 +02:00
parent 3d9531eb8d
commit 5726c201b9

@@ -8,7 +8,6 @@ import re
 import requests
 import subprocess
 import sys
-import time
 import toml
 from bs4 import BeautifulSoup
 from datetime import datetime, timedelta
@@ -16,24 +15,25 @@ from readability import Document
 from time import mktime
 from urllib.parse import urlsplit, urlunsplit
-'''
+"""
 Output functions
-'''
+"""
 # Print log message
 def log(text, force=False):
     if verbose or force:
-        print('{} | {}'.format(datetime.now().strftime('%d.%m %H:%M'), text))
+        print("{} | {}".format(datetime.now().strftime("%d.%m %H:%M"), text))
 # Print error message and exit
 def error(text):
-    print('{} E {}'.format(datetime.now().strftime('%d.%m %H:%M'), text))
+    print("{} E {}".format(datetime.now().strftime("%d.%m %H:%M"), text))
 # Print spiderss logo
 def print_logo():
-    logo = '''
+    logo = """
 ;:
 .N' ,K:
 ,O .0MWx' lk 0;
@@ -46,25 +46,27 @@ def print_logo():
 ;kddkNKl. XMNkWk, :N0; .'cOW0c. ,lOW0; .:0Nl. okddOW0:. .kdoxXNd,
 WMX
 ;..cc
-    '''
+    """
     print(logo)
-'''
+"""
 Utility functions
-'''
+"""
 # Get articles of a feed
 def get_articles(feed):
-    feed = feedparser.parse(feed['url'])
+    feed = feedparser.parse(feed["url"])
     return feed.entries
 # Write text to file
 def write_to_file(filepath, text):
-    file = open(filepath, 'w')
+    file = open(filepath, "w")
     file.write(text)
     file.close()
@@ -73,20 +75,22 @@ def write_to_file(filepath, text):
 def get_filename_postfix(title):
     # Get title as lowercase words concatenated with underscores
-    title = re.sub('[^A-Za-z0-9 ]+', '', title.lower())
-    title = re.sub(' ', '_', title)
-    return '{}.{}'.format(title, fileending)
+    title = re.sub("[^A-Za-z0-9 ]+", "", title.lower())
+    title = re.sub(" ", "_", title)
+    return "{}.{}".format(title, fileending)
 # Get HTML image snippet from the first image url in a text
 def get_image_snippet(text):
     try:
-        image_url = re.search('(?P<image>https?://\S+(\.png|\.jpg|\.jpeg))', text, re.IGNORECASE).group('image')
+        image_url = re.search(
+            "(?P<image>https?://\S+(\.png|\.jpg|\.jpeg))", text, re.IGNORECASE
+        ).group("image")
         return '<img src="{}" alt="Image">\n\n'.format(image_url)
-    except:
-        return ''
+    except Exception:
+        return ""
 # Get HTML summary snippet from a HTML text
@@ -98,21 +102,23 @@ def get_summary_snippet(text):
         h.ignore_links = True
         h.ignore_images = True
         h.body_width = 0
-        summary = h.handle(text).split('\n\n')[0].strip()
-        return '<p><b>{}</b></p>\n\n'.format(summary)
-    except:
-        return ''
+        summary = h.handle(text).split("\n\n")[0].strip()
+        return "<p><b>{}</b></p>\n\n".format(summary)
+    except Exception:
+        return ""
 # Get article body either from web or its content
 def get_article_body(article, feed):
-    body = ''
+    body = ""
     # If scrape, get article with readability
-    if feed['scrape']:
-        headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36'}
+    if feed["scrape"]:
+        headers = {
+            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36"
+        }
         response = requests.get(article.link, headers=headers)
         doc = Document(response.text)
         body = doc.summary()
@@ -121,44 +127,56 @@ def get_article_body(article, feed):
     else:
         # Add all content to body
-        if hasattr(article, 'content'):
+        if hasattr(article, "content"):
             for c in article.content:
-                if c.type == 'text/html' or c.type == 'text/plain':
+                if c.type == "text/html" or c.type == "text/plain":
                     body += c.value
         # Use summary as fallback
-        elif hasattr(article, 'summary'):
+        elif hasattr(article, "summary"):
             body += article.summary
     # Replace relative links with absolute ones, using beautifulsoup
     try:
         splitted_url = urlsplit(article.link)
-    except:
-        splitted_url = urlsplit(feed['url'])
-    soup = BeautifulSoup(body, features = 'lxml')
-    for img in soup.find_all('img', src = True):
-        src = img.get('src')
+    except Exception:
+        splitted_url = urlsplit(feed["url"])
+    soup = BeautifulSoup(body, features="lxml")
+    for img in soup.find_all("img", src=True):
+        src = img.get("src")
         splitted_src = urlsplit(src)
-        constructed_src = [splitted_src.scheme, splitted_src.netloc, splitted_src.path, splitted_src.query, splitted_src.fragment]
-        if constructed_src[0] == '':
+        constructed_src = [
+            splitted_src.scheme,
+            splitted_src.netloc,
+            splitted_src.path,
+            splitted_src.query,
+            splitted_src.fragment,
+        ]
+        if constructed_src[0] == "":
             constructed_src[0] = splitted_url.scheme
-        if constructed_src[1] == '':
+        if constructed_src[1] == "":
             constructed_src[1] = splitted_url.netloc
         new_src = urlunsplit(constructed_src)
-        if new_src.startswith('http'):
+        if new_src.startswith("http"):
             body = body.replace('"{}"'.format(src), '"{}"'.format(new_src), 1)
-    for a in soup.find_all('a', href = True):
-        href = a.get('href')
+    for a in soup.find_all("a", href=True):
+        href = a.get("href")
         splitted_href = urlsplit(href)
-        constructed_href = [splitted_href.scheme, splitted_href.netloc, splitted_href.path, splitted_href.query, splitted_href.fragment]
-        if constructed_href[0] == '':
+        constructed_href = [
+            splitted_href.scheme,
+            splitted_href.netloc,
+            splitted_href.path,
+            splitted_href.query,
+            splitted_href.fragment,
+        ]
+        if constructed_href[0] == "":
             constructed_href[0] = splitted_url.scheme
-        if constructed_href[1] == '':
+        if constructed_href[1] == "":
             constructed_href[1] = splitted_url.netloc
         new_href = urlunsplit(constructed_href)
-        if new_href.startswith('http'):
+        if new_href.startswith("http"):
             body = body.replace('"{}"'.format(href), '"{}"'.format(new_href), 1)
     return body
@@ -168,12 +186,17 @@ def get_article_body(article, feed):
 def postprocess(text):
     try:
-        processor = subprocess.Popen(postprocessor.split(' '), stdin = subprocess.PIPE, stdout = subprocess.PIPE, stderr = subprocess.PIPE)
+        processor = subprocess.Popen(
+            postprocessor.split(" "),
+            stdin=subprocess.PIPE,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+        )
         (output, err) = processor.communicate(input=text.encode())
         if err:
             raise Exception(err.decode().strip())
     except Exception as e:
-        error(' while postprocessing: {}'.format(e))
+        error(" while postprocessing: {}".format(e))
         sys.exit(1)
     return output.decode().strip()
@@ -187,25 +210,29 @@ def get_article(article, feed):
     # Construct head of article
     image = get_image_snippet(str(article))
-    if image == '':
+    if image == "":
         image = get_image_snippet(body)
     summary = get_summary_snippet(article.summary)
-    if summary == '':
+    if summary == "":
         summary = get_summary_snippet(body)
     try:
-        date = datetime.fromtimestamp(mktime(article.published_parsed)).strftime(datetime_format)
-    except:
+        date = datetime.fromtimestamp(mktime(article.published_parsed)).strftime(
+            datetime_format
+        )
+    except Exception:
         date = datetime.now().strftime(datetime_format)
     try:
         link = article.link
-    except:
-        splitted_url = urlsplit(feed['url'])
-        splitted_link = [splitted_url.scheme, splitted_url.netloc, '', '', '']
+    except Exception:
+        splitted_url = urlsplit(feed["url"])
+        splitted_link = [splitted_url.scheme, splitted_url.netloc, "", "", ""]
         link = urlunsplit(splitted_link)
-    head = '<h1>{}</h1>\n\n{}{}<p>{} - <a href={}>Link</a></p>'.format(article.title, image, summary, date, link)
+    head = "<h1>{}</h1>\n\n{}{}<p>{} - <a href={}>Link</a></p>".format(
+        article.title, image, summary, date, link
+    )
     # Postprocess article
-    article_text = postprocess('{}\n\n<hr>\n\n{}'.format(head, body)).strip()
+    article_text = postprocess("{}\n\n<hr>\n\n{}".format(head, body)).strip()
     return article_text
@@ -213,11 +240,11 @@ def get_article(article, feed):
 # Update feed
 def update_feed(feed):
-    log(' updating feed "{}"'.format(feed['name']))
+    log(' updating feed "{}"'.format(feed["name"]))
     # Set feedpaths
-    feedpath_new = os.path.join(base_directory, feed['category'], feed['name'], 'new')
-    feedpath_read = os.path.join(base_directory, feed['category'], feed['name'], 'read')
+    feedpath_new = os.path.join(base_directory, feed["category"], feed["name"], "new")
+    feedpath_read = os.path.join(base_directory, feed["category"], feed["name"], "read")
     if not os.path.exists(feedpath_new):
         os.makedirs(feedpath_new)
@@ -226,14 +253,16 @@ def update_feed(feed):
         os.makedirs(feedpath_read)
     # Get exisiting articles
-    existing_articles = os.listdir(feedpath_new) + os.listdir(feedpath_read) + os.listdir(lovedpath)
+    existing_articles = (
+        os.listdir(feedpath_new) + os.listdir(feedpath_read) + os.listdir(lovedpath)
+    )
     # Update articles
     articles = get_articles(feed)
     threshold_date = datetime.now() - timedelta(days=max_age)
     if len(articles) == 0:
-        error('no articles received from feed "{}"'.format(feed['name']))
+        error('no articles received from feed "{}"'.format(feed["name"]))
     for a in articles:
@@ -243,7 +272,7 @@ def update_feed(feed):
             fallback = False
             try:
                 date = datetime.fromtimestamp(mktime(a.published_parsed))
-            except:
+            except Exception:
                 date = datetime.now()
                 fallback = True
@@ -259,9 +288,9 @@ def update_feed(feed):
             if not filter:
                 # Construct filename
-                filename_prefix = date.strftime('%Y%m%d%H%M')
+                filename_prefix = date.strftime("%Y%m%d%H%M")
                 filename_postfix = get_filename_postfix(a.title)
-                filename = '{}_{}'.format(filename_prefix, filename_postfix)
+                filename = "{}_{}".format(filename_prefix, filename_postfix)
                 # Check if article exists
                 article_exists = False
@@ -278,7 +307,11 @@ def update_feed(feed):
                     log(' added article "{}"'.format(a.title))
         except Exception as e:
-            error('while parsing article "{}" from feed "{}": {}'.format(a.title, feed['name'], e))
+            error(
+                'while parsing article "{}" from feed "{}": {}'.format(
+                    a.title, feed["name"], e
+                )
+            )
 # Delete articles older than max_age
@@ -290,14 +323,14 @@ def remove_old_articles():
     for subdir, dirs, files in os.walk(base_directory):
         # Skip 'loved' directory
-        if not os.path.join(base_directory, 'loved') in subdir:
+        if not os.path.join(base_directory, "loved") in subdir:
             for file in files:
-                date = datetime.strptime(file[:12], '%Y%m%d%H%M')
+                date = datetime.strptime(file[:12], "%Y%m%d%H%M")
                 if threshold_date > date:
                     os.remove(os.path.join(subdir, file))
                     count += 1
-    log(' removed {} articles'.format(count))
+    log(" removed {} articles".format(count))
 # Parse config file
@@ -307,15 +340,15 @@ def load_config(filepath):
     try:
         config = toml.load(filepath)
-        base_directory = config['base_directory']
-        max_age = config['max_age']
-        datetime_format = config['datetime_format']
-        postprocessor = config['postprocessor']
-        fileending = config['fileending']
-        filters = config['filters']
-        feeds = config['feed']
+        base_directory = config["base_directory"]
+        max_age = config["max_age"]
+        datetime_format = config["datetime_format"]
+        postprocessor = config["postprocessor"]
+        fileending = config["fileending"]
+        filters = config["filters"]
+        feeds = config["feed"]
     except Exception as e:
-        error('while parsing config: {}'.format(e))
+        error("while parsing config: {}".format(e))
         sys.exit(1)
@@ -325,7 +358,7 @@ def initialize():
     global lovedpath
     # Create 'loved' directory if not existent
-    lovedpath = os.path.join(base_directory, 'loved')
+    lovedpath = os.path.join(base_directory, "loved")
     if not os.path.exists(lovedpath):
         os.makedirs(lovedpath)
@@ -333,25 +366,34 @@ def initialize():
 # Update all feeds and remove old articles
 def crawl():
-    log('crawling feeds', True)
+    log("crawling feeds", True)
     for feed in feeds:
         update_feed(feed)
-    log('removing old articles', True)
+    log("removing old articles", True)
     remove_old_articles()
-'''
+"""
 Main
-'''
+"""
 def main():
     global verbose
     # Initialize parser
-    parser = argparse.ArgumentParser(description = 'Crawl RSS feeds and store articles as Markdown files.')
-    parser.add_argument('-v', '--verbose', action = 'store_true', help = 'verbose output')
-    parser.add_argument('-c', '--config', default = './config.toml', help = 'config file (default: ./config.toml)')
+    parser = argparse.ArgumentParser(
+        description="Crawl RSS feeds and store articles as Markdown files."
+    )
+    parser.add_argument("-v", "--verbose", action="store_true", help="verbose output")
+    parser.add_argument(
+        "-c",
+        "--config",
+        default="./config.toml",
+        help="config file (default: ./config.toml)",
+    )
     # Get args
     args = parser.parse_args()
@@ -365,5 +407,5 @@ def main():
     crawl()
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()