only replace links if they start with http

This commit is contained in:
Denis Lehmann 2020-04-18 20:20:09 +02:00
parent 6b0d0f05be
commit 877aff0475

View file

@@ -81,6 +81,7 @@ def get_filename(date, title):
return '{}_{}.{}'.format(date, title, fileending) return '{}_{}.{}'.format(date, title, fileending)
# If scraped, use first content image as fallback
# Get image snippet for an article # Get image snippet for an article
def get_article_image(article): def get_article_image(article):
@@ -110,6 +111,7 @@ def get_article_body(article, scrape):
body = '' body = ''
# TODO: Include appropriate header?
# If scrape, get article with readability # If scrape, get article with readability
if scrape: if scrape:
@@ -129,8 +131,10 @@ def get_article_body(article, scrape):
if constructed_src[1] == '': if constructed_src[1] == '':
constructed_src[1] = splitted_url.netloc constructed_src[1] = splitted_url.netloc
new_src = urlunsplit(constructed_src) new_src = urlunsplit(constructed_src)
body = body.replace('"{}"'.format(src), '"{}"'.format(new_src), 1) if new_src.startswith('http'):
body = body.replace('"{}"'.format(src), '"{}"'.format(new_src), 1)
# TODO: catch mailto:
for a in soup.find_all('a', href = True): for a in soup.find_all('a', href = True):
href = a.get('href') href = a.get('href')
splitted_href = urlsplit(href) splitted_href = urlsplit(href)
@@ -140,7 +144,8 @@ def get_article_body(article, scrape):
if constructed_href[1] == '': if constructed_href[1] == '':
constructed_href[1] = splitted_url.netloc constructed_href[1] = splitted_url.netloc
new_href = urlunsplit(constructed_href) new_href = urlunsplit(constructed_href)
body = body.replace('"{}"'.format(href), '"{}"'.format(new_href), 1) if new_href.startswith('http'):
body = body.replace('"{}"'.format(href), '"{}"'.format(new_href), 1)
# Else construct from article content # Else construct from article content
@@ -175,6 +180,7 @@ def get_article(article, scrape):
# Construct head of article # Construct head of article
image = get_article_image(article) image = get_article_image(article)
summary = get_article_summary(article) summary = get_article_summary(article)
#TODO: Current time as fallback?
date = datetime.fromtimestamp(mktime(article.published_parsed)).strftime(datetime_format) date = datetime.fromtimestamp(mktime(article.published_parsed)).strftime(datetime_format)
head = '<h1>{}</h1>\n\n{}{}<p>{} - <a href={}>Link</a></p>'.format(article.title, image, summary, date, article.link) head = '<h1>{}</h1>\n\n{}{}<p>{} - <a href={}>Link</a></p>'.format(article.title, image, summary, date, article.link)