replace relative links with absolute ones using beautifulsoup

This commit is contained in:
Denis Lehmann 2020-04-18 14:33:53 +02:00
parent d4f7bb6ac5
commit ce79c4f400
2 changed files with 29 additions and 0 deletions

View file

@@ -1,3 +1,4 @@
beautifulsoup4
feedparser
html2text
opml

View file

@@ -10,9 +10,11 @@ import subprocess
import sys
import time
import toml
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
from readability import Document
from time import mktime
from urllib.parse import urljoin, urlsplit, urlunsplit, SplitResult
'''
Output functions
@@ -115,6 +117,32 @@ def get_article_body(article, scrape):
doc = Document(response.text)
body = doc.summary()
# Replace relative site links with absolute ones, using beautifulsoup
splitted_url = urlsplit(article.link)
soup = BeautifulSoup(body, features = 'lxml')
for img in soup.find_all('img', src = True):
src = img.get('src')
splitted_src = urlsplit(src)
constructed_src = [splitted_src.scheme, splitted_src.netloc, splitted_src.path, splitted_src.query, splitted_src.fragment]
if constructed_src[0] == '':
constructed_src[0] = splitted_url.scheme
if constructed_src[1] == '':
constructed_src[1] = splitted_url.netloc
new_src = urlunsplit(constructed_src)
body = body.replace('"{}"'.format(src), '"{}"'.format(new_src), 1)
for a in soup.find_all('a', href = True):
href = a.get('href')
splitted_href = urlsplit(href)
constructed_href = [splitted_href.scheme, splitted_href.netloc, splitted_href.path, splitted_href.query, splitted_href.fragment]
if constructed_href[0] == '':
constructed_href[0] = splitted_url.scheme
if constructed_href[1] == '':
constructed_href[1] = splitted_url.netloc
new_href = urlunsplit(constructed_href)
body = body.replace('"{}"'.format(href), '"{}"'.format(new_href), 1)
# Else construct from article content
else: