From 877aff0475e695f7df946caa52b1dd3d0a5c0c14 Mon Sep 17 00:00:00 2001 From: Denis Lehmann Date: Sat, 18 Apr 2020 20:20:09 +0200 Subject: [PATCH] only replace links if they start with http --- spiderss.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/spiderss.py b/spiderss.py index c1acc89..afd8a71 100755 --- a/spiderss.py +++ b/spiderss.py @@ -81,6 +81,7 @@ def get_filename(date, title): return '{}_{}.{}'.format(date, title, fileending) +# If scraped, use first content image as fallback # Get image snippet for an article def get_article_image(article): @@ -110,6 +111,7 @@ def get_article_body(article, scrape): body = '' + # TODO: Include appropriate header? # If scrape, get article with readability if scrape: @@ -129,8 +131,10 @@ def get_article_body(article, scrape): if constructed_src[1] == '': constructed_src[1] = splitted_url.netloc new_src = urlunsplit(constructed_src) - body = body.replace('"{}"'.format(src), '"{}"'.format(new_src), 1) + if new_src.startswith('http'): + body = body.replace('"{}"'.format(src), '"{}"'.format(new_src), 1) + # TODO: catch mailto: for a in soup.find_all('a', href = True): href = a.get('href') splitted_href = urlsplit(href) @@ -140,7 +144,8 @@ def get_article_body(article, scrape): if constructed_href[1] == '': constructed_href[1] = splitted_url.netloc new_href = urlunsplit(constructed_href) - body = body.replace('"{}"'.format(href), '"{}"'.format(new_href), 1) + if new_href.startswith('http'): + body = body.replace('"{}"'.format(href), '"{}"'.format(new_href), 1) # Else construct from article content @@ -175,6 +180,7 @@ def get_article(article, scrape): # Construct head of article image = get_article_image(article) summary = get_article_summary(article) + #TODO: Current time as fallback? date = datetime.fromtimestamp(mktime(article.published_parsed)).strftime(datetime_format) head = '

{}

\n\n{}{}

{} - Link

'.format(article.title, image, summary, date, article.link)