Compare commits

10 commits (877aff0475 ... de2782117a)

| Author | SHA1 | Date |
|---|---|---|
| | de2782117a | |
| | 5726c201b9 | |
| | 3d9531eb8d | |
| | 1ddc57c30a | |
| | fa542ee56e | |
| | d630cc96c4 | |
| | 7d4d1311bb | |
| | d37148ea32 | |
| | e4837d77c6 | |
| | 8aed1df8c7 | |
5 changed files with 277 additions and 152 deletions
README.md (11 changed lines)
@@ -20,6 +20,7 @@ Use at your own risk!

 - Store articles in categories
 - Delete articles after a few days
+- Filter articles, you're not interested in
 - Distinguish __new__ from __read__ articles
 - Store __loved__ articles forever
 - OPML import

@@ -66,14 +67,18 @@ base_directory = '/home/<user>/rss'
 # Articles older than max_age (days) will be deleted and not be added.
 max_age = 30

+# Date and time format as strftime, to be included in the articles.
+datetime_format = '%d.%m.%Y %H:%M'
+
 # Postprocessing command of the articles. The article is written to stdin in HTML format and read from stdout.
 postprocessor = 'pandoc -f html -t markdown_strict-raw_html --reference-links --reference-location=document'

 # Fileending for the article files.
 fileending = 'md'

-# Date and time format as strftime to be included in the articles.
-datetime_format = '%d.%m.%Y %H:%M'
+# List of regular expression strings. If any of these matches an lowercase article title, the article won't be saved.
+# E.g. if you wan't to skip news about RSS explicitly, add '(\W|^)rss(\W|$)'.
+filters = []

 # Feeds
 # The category can be empty (''). The feed fill then be stored in the base_directory.

@@ -153,6 +158,6 @@ Just synchronize the base_directory with [Syncthing](https://syncthing.net/), [r

 ## Acknowledgements

-Thanks to all the people, who created the nice libraries this project in based on.
+Thanks to all the people who created the nice software, this project in based on.
 And also thanks to Dieter Steffmann who created the Canterbury font, which is used for the logo.
 You can find it in the `fonts/` directory.
Configuration file (6 changed lines; file name not shown)

@@ -4,7 +4,7 @@ base_directory = '/home/<user>/rss'
 # Articles older than max_age (days) will be deleted and not be added.
 max_age = 30

-# Date and time format as strftime to be included in the articles.
+# Date and time format as strftime, to be included in the articles.
 datetime_format = '%d.%m.%Y %H:%M'

 # Postprocessing command of the articles. The article is written to stdin in HTML format and read from stdout.

@@ -13,6 +13,10 @@ postprocessor = 'pandoc -f html -t markdown_strict-raw_html+pipe_tables --refere
 # Fileending for the article files.
 fileending = 'md'

+# List of regular expression strings. If any of these matches an lowercase article title, the article won't be saved.
+# E.g. if you wan't to skip news about RSS explicitly, add '(\W|^)rss(\W|$)'.
+filters = []
+
 # Feeds
 # The category can be empty (''). The feed fill then be stored in the base_directory.
 # The category can also be a path, which will result in subdirectories (e.g. 'technology/hardware').
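The new `filters` option is a list of regular-expression strings matched against the lowercased article title; any match causes the article to be skipped. A minimal sketch of that rule in Python (the helper name `should_skip` is illustrative, not part of the project):

```python
import re

def should_skip(title, filters):
    # Regexes are matched against the lowercased title, as in the crawler's update loop.
    lowered = title.lower()
    return any(re.search(pattern, lowered) for pattern in filters)

# Example with the regex suggested in the config comment:
print(should_skip("Why RSS still matters", [r"(\W|^)rss(\W|$)"]))  # True
print(should_skip("A quiet week in tech", [r"(\W|^)rss(\W|$)"]))   # False
```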
default.nix (21 changed lines, file deleted)
@@ -1,21 +0,0 @@
-with import <nixpkgs> {};
-
-stdenv.mkDerivation {
-  name = "myPythonEnv";
-  buildInputs = with pkgs; [
-    python37Full
-    python37Packages.virtualenv
-    pandoc
-  ];
-  src = null;
-  shellHook = ''
-    if [ ! -d .venv ]; then
-      python -m venv .venv
-    fi
-    source .venv/bin/activate
-    pip install --upgrade pip
-    if [ -s requirements.txt ]; then
-      pip install -r requirements.txt
-    fi
-  '';
-}
shell.nix (38 changed lines, new file)
@@ -0,0 +1,38 @@
+{ pkgs ? import <nixpkgs> {} }:
+pkgs.mkShell {
+  name = "python-environment";
+  buildInputs = with pkgs; [
+    pandoc
+    python3
+    python3Packages.virtualenv
+  ];
+  shellHook = ''
+    function log_header {
+      echo -ne "==> \e[32m\e[1m$1\e[0m\n\n"
+    }
+    function log_subheader {
+      echo -ne "--> \e[33m\e[1m$1\e[0m\n\n"
+    }
+    function log {
+      echo -ne " $1\n"
+    }
+
+    echo ""
+    log_header "python_environment"
+    if [ ! -d .venv ]; then
+      python -m venv .venv
+    fi
+    source .venv/bin/activate
+    log_subheader "upgrading pip"
+    pip install --upgrade pip
+    echo ""
+    if [ -s requirements.txt ]; then
+      log_subheader "found requirements.txt, installing packages"
+      pip install -r requirements.txt
+      echo ""
+    fi
+    log_header "package versions"
+    log "$(python --version)"
+    log "$(pip --version)"
+  '';
+}
spiderss.py (353 changed lines)
@@ -8,7 +8,6 @@ import re
 import requests
 import subprocess
 import sys
-import time
 import toml
 from bs4 import BeautifulSoup
 from datetime import datetime, timedelta
@@ -16,24 +15,25 @@ from readability import Document
 from time import mktime
 from urllib.parse import urlsplit, urlunsplit

-'''
+"""
 Output functions
-'''
+"""
+

 # Print log message
-def log(text, force = False):
+def log(text, force=False):
     if verbose or force:
-        print('{} | {}'.format(datetime.now().strftime('%d.%m %H:%M'), text))
+        print("{} | {}".format(datetime.now().strftime("%d.%m %H:%M"), text))


 # Print error message and exit
 def error(text):
-    print('{} E {}'.format(datetime.now().strftime('%d.%m %H:%M'), text))
+    print("{} E {}".format(datetime.now().strftime("%d.%m %H:%M"), text))


 # Print spiderss logo
 def print_logo():
-    logo = '''
+    logo = """
 ;:
 .N' ,K:
 ,O .0MWx' lk 0;
@@ -46,54 +46,55 @@ def print_logo():
 ;kddkNKl. XMNkWk, :N0; .'cOW0c. ,lOW0; .:0Nl. okddOW0:. .kdoxXNd,
 WMX
 ;..cc
-'''
+"""

     print(logo)

-'''
-Utility functions
-'''
-
-# Get articles of a feed
+
+"""
+Utility functions
+"""
+
+
+# Get articles of a feed
 def get_articles(feed):
-    feed = feedparser.parse(feed['url'])
+    feed = feedparser.parse(feed["url"])
     return feed.entries


 # Write text to file
 def write_to_file(filepath, text):
-
-    file = open(filepath, 'w')
+    file = open(filepath, "w")
     file.write(text)
     file.close()


-# Get filename from a date and a title
-def get_filename(date, title):
-
-    # Get date as single block
-    date = date.strftime('%Y%m%d%H%M')
-
+# Get filename postfix from a title
+def get_filename_postfix(title):
     # Get title as lowercase words concatenated with underscores
-    title = re.sub('[^A-Za-z0-9 ]+', '', title.lower())
-    title = re.sub(' ', '_', title)
-
-    return '{}_{}.{}'.format(date, title, fileending)
+    title = re.sub("[^A-Za-z0-9 ]+", "", title.lower())
+    title = re.sub(" ", "_", title)
+    return "{}.{}".format(title, fileending)


-# If scraped, use first content image as fallback
-# Get image snippet for an article
-def get_article_image(article):
-
+# Get HTML image snippet from the first image url in a text
+def get_image_snippet(text):
     try:
-        image_url = re.search('(?P<image>https?://\S+(\.png|\.jpg|\.jpeg))', str(article), re.IGNORECASE).group('image')
+        image_url = re.search(
+            "(?P<image>https?://\S+(\.png|\.jpg|\.jpeg))", text, re.IGNORECASE
+        ).group("image")
         return '<img src="{}" alt="Image">\n\n'.format(image_url)
-    except:
-        return ''
+    except Exception:
+        return ""


-# Get summary snippet for an article
-def get_article_summary(article):
-
+# Get HTML summary snippet from a HTML text
+def get_summary_snippet(text):
     try:
         h = html2text.HTML2Text()
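With `get_filename` split into a date prefix and a title postfix, the stored filename is the strftime block joined to a slug of the title. A small standalone sketch of the same construction (the function name and the hard-coded `fileending` are illustrative):

```python
import re
from datetime import datetime

def filename_for(date, title, fileending="md"):
    # Date as a single 12-character block, e.g. 202001311830
    prefix = date.strftime("%Y%m%d%H%M")
    # Title as lowercase words joined with underscores
    slug = re.sub("[^A-Za-z0-9 ]+", "", title.lower())
    slug = re.sub(" ", "_", slug)
    return "{}_{}.{}".format(prefix, slug, fileending)

print(filename_for(datetime(2020, 1, 31, 18, 30), "Hello, World!"))
# 202001311830_hello_world.md
```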
@@ -101,60 +102,82 @@ def get_article_summary(article):
         h.ignore_links = True
         h.ignore_images = True
         h.body_width = 0
-        return '<p><b>{}</b></p>\n\n'.format(summary)
-    except:
-        return ''
+        summary = h.handle(text).split("\n\n")[0].strip()
+        return "<p><b>{}</b></p>\n\n".format(summary)
+    except Exception:
+        return ""


 # Get article body either from web or its content
-def get_article_body(article, scrape):
-
-    body = ''
+def get_article_body(article, feed):
+    body = ""

-    # TODO: Include appropriate header?
     # If scrape, get article with readability
-    if scrape:
-
-        response = requests.get(article.link)
+    if feed["scrape"]:
+        headers = {
+            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36"
+        }
+        response = requests.get(article.link, headers=headers)
         doc = Document(response.text)
         body = doc.summary()

-        # Replace relative site links with absolute ones, using beautifulsoup
-        splitted_url = urlsplit(article.link)
-        soup = BeautifulSoup(body, features = 'lxml')
-        for img in soup.find_all('img', src = True):
-            src = img.get('src')
-            splitted_src = urlsplit(src)
-            constructed_src = [splitted_src.scheme, splitted_src.netloc, splitted_src.path, splitted_src.query, splitted_src.fragment]
-            if constructed_src[0] == '':
-                constructed_src[0] = splitted_url.scheme
-            if constructed_src[1] == '':
-                constructed_src[1] = splitted_url.netloc
-            new_src = urlunsplit(constructed_src)
-            if new_src.startswith('http'):
-                body = body.replace('"{}"'.format(src), '"{}"'.format(new_src), 1)
-
-        # TODO: catch mailto:
-        for a in soup.find_all('a', href = True):
-            href = a.get('href')
-            splitted_href = urlsplit(href)
-            constructed_href = [splitted_href.scheme, splitted_href.netloc, splitted_href.path, splitted_href.query, splitted_href.fragment]
-            if constructed_href[0] == '':
-                constructed_href[0] = splitted_url.scheme
-            if constructed_href[1] == '':
-                constructed_href[1] = splitted_url.netloc
-            new_href = urlunsplit(constructed_href)
-            if new_href.startswith('http'):
-                body = body.replace('"{}"'.format(href), '"{}"'.format(new_href), 1)
-
-
-    # Else construct from article content
+    # Else construct from article object
     else:
-
-        if hasattr(article, 'content'):
+        # Add all content to body
+        if hasattr(article, "content"):
             for c in article.content:
-                if c.type == 'text/html':
+                if c.type == "text/html" or c.type == "text/plain":
                     body += c.value
+        # Use summary as fallback
+        elif hasattr(article, "summary"):
+            body += article.summary
+
+    # Replace relative links with absolute ones, using beautifulsoup
+    try:
+        splitted_url = urlsplit(article.link)
+    except Exception:
+        splitted_url = urlsplit(feed["url"])
+
+    soup = BeautifulSoup(body, features="lxml")
+
+    for img in soup.find_all("img", src=True):
+        src = img.get("src")
+        splitted_src = urlsplit(src)
+        constructed_src = [
+            splitted_src.scheme,
+            splitted_src.netloc,
+            splitted_src.path,
+            splitted_src.query,
+            splitted_src.fragment,
+        ]
+        if constructed_src[0] == "":
+            constructed_src[0] = splitted_url.scheme
+        if constructed_src[1] == "":
+            constructed_src[1] = splitted_url.netloc
+        new_src = urlunsplit(constructed_src)
+        if new_src.startswith("http"):
+            body = body.replace('"{}"'.format(src), '"{}"'.format(new_src), 1)
+
+    for a in soup.find_all("a", href=True):
+        href = a.get("href")
+        splitted_href = urlsplit(href)
+        constructed_href = [
+            splitted_href.scheme,
+            splitted_href.netloc,
+            splitted_href.path,
+            splitted_href.query,
+            splitted_href.fragment,
+        ]
+        if constructed_href[0] == "":
+            constructed_href[0] = splitted_url.scheme
+        if constructed_href[1] == "":
+            constructed_href[1] = splitted_url.netloc
+        new_href = urlunsplit(constructed_href)
+        if new_href.startswith("http"):
+            body = body.replace('"{}"'.format(href), '"{}"'.format(new_href), 1)

     return body
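The relocated link-rewriting block fills in missing scheme and netloc of relative `src`/`href` values from the article (or feed) URL, using `urlsplit`/`urlunsplit`. A reduced sketch of that step for a single URL (function name and example URL are illustrative):

```python
from urllib.parse import urlsplit, urlunsplit

def absolutize(url, base):
    # Borrow scheme and netloc from the base URL when the link lacks them.
    base_parts = urlsplit(base)
    parts = urlsplit(url)
    scheme = parts.scheme or base_parts.scheme
    netloc = parts.netloc or base_parts.netloc
    return urlunsplit((scheme, netloc, parts.path, parts.query, parts.fragment))

print(absolutize("/images/logo.png", "https://example.org/feed.xml"))
# https://example.org/images/logo.png
```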
@@ -163,32 +186,53 @@ def get_article_body(article, scrape):
 def postprocess(text):
-
     try:
-        processor = subprocess.Popen(postprocessor.split(' '), stdin = subprocess.PIPE, stdout = subprocess.PIPE, stderr = subprocess.PIPE)
-        (output, err) = processor.communicate(input = text.encode())
+        processor = subprocess.Popen(
+            postprocessor.split(" "),
+            stdin=subprocess.PIPE,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+        )
+        (output, err) = processor.communicate(input=text.encode())
         if err:
             raise Exception(err.decode().strip())
     except Exception as e:
-        error(' while postprocessing: {}'.format(e))
+        error(" while postprocessing: {}".format(e))
         sys.exit(1)

     return output.decode().strip()


 # Get constructed article
-def get_article(article, scrape):
-
-    # Construct head of article
-    image = get_article_image(article)
-    summary = get_article_summary(article)
-    #TODO: Current time as fallback?
-    date = datetime.fromtimestamp(mktime(article.published_parsed)).strftime(datetime_format)
-    head = '<h1>{}</h1>\n\n{}{}<p>{} - <a href={}>Link</a></p>'.format(article.title, image, summary, date, article.link)
-
+def get_article(article, feed):
     # Get body of article
-    body = get_article_body(article, scrape)
+    body = get_article_body(article, feed)

+    # Construct head of article
+    image = get_image_snippet(str(article))
+    if image == "":
+        image = get_image_snippet(body)
+    summary = get_summary_snippet(article.summary)
+    if summary == "":
+        summary = get_summary_snippet(body)
+    try:
+        date = datetime.fromtimestamp(mktime(article.published_parsed)).strftime(
+            datetime_format
+        )
+    except Exception:
+        date = datetime.now().strftime(datetime_format)
+    try:
+        link = article.link
+    except Exception:
+        splitted_url = urlsplit(feed["url"])
+        splitted_link = [splitted_url.scheme, splitted_url.netloc, "", "", ""]
+        link = urlunsplit(splitted_link)
+    head = "<h1>{}</h1>\n\n{}{}<p>{} - <a href={}>Link</a></p>".format(
+        article.title, image, summary, date, link
+    )
+
     # Postprocess article
-    article_text = postprocess('{}\n\n<hr>\n\n{}'.format(head, body)).strip()
+    article_text = postprocess("{}\n\n<hr>\n\n{}".format(head, body)).strip()

     return article_text
@@ -196,105 +240,160 @@ def get_article(article, scrape):
 # Update feed
 def update_feed(feed):
-
-    log(' updating feed "{}"'.format(feed['name']))
+    log(' updating feed "{}"'.format(feed["name"]))

     # Set feedpaths
-    feedpath_new = os.path.join(base_directory, feed['category'], feed['name'], 'new')
-    feedpath_read = os.path.join(base_directory, feed['category'], feed['name'], 'read')
+    feedpath_new = os.path.join(base_directory, feed["category"], feed["name"], "new")
+    feedpath_read = os.path.join(base_directory, feed["category"], feed["name"], "read")

     if not os.path.exists(feedpath_new):
         os.makedirs(feedpath_new)

     if not os.path.exists(feedpath_read):
         os.makedirs(feedpath_read)

+    # Get exisiting articles
+    existing_articles = (
+        os.listdir(feedpath_new) + os.listdir(feedpath_read) + os.listdir(lovedpath)
+    )
+
     # Update articles
     articles = get_articles(feed)
-    threshold_date = datetime.now() - timedelta(days = max_age)
+    threshold_date = datetime.now() - timedelta(days=max_age)

+    if len(articles) == 0:
+        error('no articles received from feed "{}"'.format(feed["name"]))
+
     for a in articles:

         try:
-            date = datetime.fromtimestamp(mktime(a.published_parsed))
+            # Set fallback if no parseable date found
+            fallback = False
+            try:
+                date = datetime.fromtimestamp(mktime(a.published_parsed))
+            except Exception:
+                date = datetime.now()
+                fallback = True

             if date > threshold_date:
-                filename = get_filename(date, a.title)
-                if not os.path.exists(os.path.join(feedpath_new, filename)) and not os.path.exists(os.path.join(feedpath_read, filename)):
-                    text = get_article(a, feed['scrape'])
-                    write_to_file(os.path.join(feedpath_new, filename), text)
-                    log(' added article "{}"'.format(a.title))
+                # Check if article should be filtered
+                filter = False
+                for f in filters:
+                    if re.search(f, a.title.lower()):
+                        filter = True
+                        log(' filtered article "{}"'.format(a.title))
+
+                if not filter:
+
+                    # Construct filename
+                    filename_prefix = date.strftime("%Y%m%d%H%M")
+                    filename_postfix = get_filename_postfix(a.title)
+                    filename = "{}_{}".format(filename_prefix, filename_postfix)
+
+                    # Check if article exists
+                    article_exists = False
+                    if fallback:
+                        existing_articles_fallback = [a[13:] for a in existing_articles]
+                        if filename_postfix in existing_articles_fallback:
+                            article_exists = True
+                    elif filename in existing_articles:
+                        article_exists = True
+
+                    if not article_exists:
+                        text = get_article(a, feed)
+                        write_to_file(os.path.join(feedpath_new, filename), text)
+                        log(' added article "{}"'.format(a.title))

         except Exception as e:
-            error('while parsing feed article "{}" from feed "{}": {}'.format(a.title, feed['name'], e))
+            error(
+                'while parsing article "{}" from feed "{}": {}'.format(
+                    a.title, feed["name"], e
+                )
+            )


 # Delete articles older than max_age
 def remove_old_articles():
-
-    threshold_date = datetime.now() - timedelta(days = max_age)
+    threshold_date = datetime.now() - timedelta(days=max_age)
     count = 0

     for subdir, dirs, files in os.walk(base_directory):

         # Skip 'loved' directory
-        if not os.path.join(base_directory, 'loved') in subdir:
+        if not os.path.join(base_directory, "loved") in subdir:
             for file in files:
-                date = datetime.strptime(file[:12], '%Y%m%d%H%M')
+                date = datetime.strptime(file[:12], "%Y%m%d%H%M")
                 if threshold_date > date:
                     os.remove(os.path.join(subdir, file))
                     count += 1

-    log(' removed {} articles'.format(count))
+    log(" removed {} articles".format(count))


 # Parse config file
 def load_config(filepath):
-
-    global base_directory, max_age, datetime_format, postprocessor, fileending, feeds
+    global base_directory, max_age, datetime_format, postprocessor, fileending, filters, feeds

     try:
         config = toml.load(filepath)
-        base_directory = config['base_directory']
-        max_age = config['max_age']
-        datetime_format = config['datetime_format']
-        postprocessor = config['postprocessor']
-        fileending = config['fileending']
-        feeds = config['feed']
+        base_directory = config["base_directory"]
+        max_age = config["max_age"]
+        datetime_format = config["datetime_format"]
+        postprocessor = config["postprocessor"]
+        fileending = config["fileending"]
+        filters = config["filters"]
+        feeds = config["feed"]
     except Exception as e:
-        error('while parsing config: {}'.format(e))
+        error("while parsing config: {}".format(e))
         sys.exit(1)


 # Initialize spiderss
 def initialize():
-
+    global lovedpath
+
     # Create 'loved' directory if not existent
-    lovedpath = os.path.join(base_directory, 'loved')
+    lovedpath = os.path.join(base_directory, "loved")
     if not os.path.exists(lovedpath):
         os.makedirs(lovedpath)


-# Update all feeds and delete old messages
+# Update all feeds and remove old articles
 def crawl():
-
-    log('crawling feeds', True)
+    log("crawling feeds", True)
     for feed in feeds:
         update_feed(feed)

-    log('removing old articles', True)
+    log("removing old articles", True)
     remove_old_articles()


-'''
+
+"""
 Main
-'''
+"""


 def main():
-
     global verbose

     # Initialize parser
-    parser = argparse.ArgumentParser(description = 'Crawl RSS feeds and store articles as Markdown files.')
-    parser.add_argument('-v', '--verbose', action = 'store_true', help = 'verbose output')
-    parser.add_argument('-c', '--config', default = './config.toml', help = 'config file (default: ./config.toml)')
+    parser = argparse.ArgumentParser(
+        description="Crawl RSS feeds and store articles as Markdown files."
+    )
+    parser.add_argument("-v", "--verbose", action="store_true", help="verbose output")
+    parser.add_argument(
+        "-c",
+        "--config",
+        default="./config.toml",
+        help="config file (default: ./config.toml)",
+    )

     # Get args
     args = parser.parse_args()
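When no parseable publication date is found, the current time becomes the filename prefix, so the same article would get a different name on every run; the fallback path therefore compares only the title postfix, stripping the 12-digit date block plus the underscore (13 characters) from existing filenames. A small sketch of that comparison (names are illustrative):

```python
def article_exists(filename, filename_postfix, existing, fallback):
    if fallback:
        # Drop the "YYYYMMDDHHMM_" prefix (12 digits + "_") and compare postfixes only.
        return filename_postfix in (name[13:] for name in existing)
    return filename in existing

existing = ["202001311830_hello_world.md"]
print(article_exists("202002011200_hello_world.md", "hello_world.md", existing, fallback=True))   # True
print(article_exists("202002011200_hello_world.md", "hello_world.md", existing, fallback=False))  # False
```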
@@ -308,5 +407,5 @@ def main():
     crawl()


-if __name__ == '__main__':
+if __name__ == "__main__":
     main()