Replace relative links with absolute ones using BeautifulSoup
This commit is contained in:
parent
d4f7bb6ac5
commit
ce79c4f400
2 changed files with 29 additions and 0 deletions
|
|
@ -1,3 +1,4 @@
|
||||||
|
beautifulsoup4
|
||||||
feedparser
|
feedparser
|
||||||
html2text
|
html2text
|
||||||
opml
|
opml
|
||||||
|
|
|
||||||
28
spiderss.py
28
spiderss.py
|
|
@ -10,9 +10,11 @@ import subprocess
|
||||||
import sys
|
import sys
|
||||||
import time
|
import time
|
||||||
import toml
|
import toml
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
from readability import Document
|
from readability import Document
|
||||||
from time import mktime
|
from time import mktime
|
||||||
|
from urllib.parse import urljoin, urlsplit, urlunsplit, SplitResult
|
||||||
|
|
||||||
'''
|
'''
|
||||||
Output functions
|
Output functions
|
||||||
|
|
@ -115,6 +117,32 @@ def get_article_body(article, scrape):
|
||||||
doc = Document(response.text)
|
doc = Document(response.text)
|
||||||
body = doc.summary()
|
body = doc.summary()
|
||||||
|
|
||||||
|
# Replace relative site links with absolute ones, using beautifulsoup
|
||||||
|
splitted_url = urlsplit(article.link)
|
||||||
|
soup = BeautifulSoup(body, features = 'lxml')
|
||||||
|
for img in soup.find_all('img', src = True):
|
||||||
|
src = img.get('src')
|
||||||
|
splitted_src = urlsplit(src)
|
||||||
|
constructed_src = [splitted_src.scheme, splitted_src.netloc, splitted_src.path, splitted_src.query, splitted_src.fragment]
|
||||||
|
if constructed_src[0] == '':
|
||||||
|
constructed_src[0] = splitted_url.scheme
|
||||||
|
if constructed_src[1] == '':
|
||||||
|
constructed_src[1] = splitted_url.netloc
|
||||||
|
new_src = urlunsplit(constructed_src)
|
||||||
|
body = body.replace('"{}"'.format(src), '"{}"'.format(new_src), 1)
|
||||||
|
|
||||||
|
for a in soup.find_all('a', href = True):
|
||||||
|
href = a.get('href')
|
||||||
|
splitted_href = urlsplit(href)
|
||||||
|
constructed_href = [splitted_href.scheme, splitted_href.netloc, splitted_href.path, splitted_href.query, splitted_href.fragment]
|
||||||
|
if constructed_href[0] == '':
|
||||||
|
constructed_href[0] = splitted_url.scheme
|
||||||
|
if constructed_href[1] == '':
|
||||||
|
constructed_href[1] = splitted_url.netloc
|
||||||
|
new_href = urlunsplit(constructed_href)
|
||||||
|
body = body.replace('"{}"'.format(href), '"{}"'.format(new_href), 1)
|
||||||
|
|
||||||
|
|
||||||
# Else construct from article content
|
# Else construct from article content
|
||||||
else:
|
else:
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue