How to edit an html document with Python
In short, I'm trying to edit an html page. The problem is that when I colour every letter of the document, the data inside the tags gets edited as well. I read somewhere that with BeautifulSoup you can edit the tags themselves and that this will be reflected in the original html document.
Right now I'm trying this variant: img['src'] = needed_url + img['src']
But it does nothing.
I also tried going through replace.
So far all I get is garbage.
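For reference, this is roughly how I understood the BeautifulSoup approach is supposed to work: you change the attribute on the tag object inside the soup, and then write the re-serialised soup back out. I'm not sure this is right; the output file name edited.html and the relative-link check are just placeholders of mine:

import requests
from bs4 import BeautifulSoup

url = 'https://ru.wikipedia.org/wiki/%D0%92%D0%B0%D0%BD_%D0%93%D0%BE%D0%B3,_%D0%92%D0%B8%D0%BD%D1%81%D0%B5%D0%BD%D1%82'
needed_url = 'https://' + url.split("/")[2]

r = requests.get(url, timeout=3)
soup = BeautifulSoup(r.text, 'html.parser')

# the edit happens on the tag object inside the soup, not on the raw text
for img in soup.find_all('img'):
    src = img.get('src', '')
    if src.startswith('//'):        # protocol-relative link
        img['src'] = 'https:' + src
    elif src.startswith('/'):       # site-relative link -> absolute
        img['src'] = needed_url + src

# the change only lives inside `soup`; it has to be serialised back out
with open('edited.html', 'w', encoding='utf-8') as f:
    f.write(str(soup))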
The code, for the curious:
import glob
import html

import requests
import browser_cookie3
from bs4 import BeautifulSoup
from pywebcopy import save_webpage


class WebRework(object):
    def __init__(self, url):
        global words, FORMAT
        global trash_symbs, allowedshorts, restricted, trashset, common
        global LYN, local_libraries_pool, OPERATIONS
        # request headers that mimic a normal browser
        ENCODIN = "utf-8, deflate"
        ACCEPT = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
        DNT = "1"
        CON = "close"
        INSEC = "1"
        LANG = "ru-RU,ru;q=0.5"
        USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0"
        url = 'https://ru.wikipedia.org/wiki/%D0%92%D0%B0%D0%BD_%D0%93%D0%BE%D0%B3,_%D0%92%D0%B8%D0%BD%D1%81%D0%B5%D0%BD%D1%82'
        link = url
        headers = {"user-agent": USER_AGENT, "Accept-Language": LANG, "Accept-Encoding": ENCODIN, "DNT": DNT, "Connection": CON, "Accept": ACCEPT, "Upgrade-Insecure-Requests": INSEC}
        cookies = browser_cookie3.chrome(domain_name=link.split("/")[2])
        needed_url = 'https://' + link.split("/")[2]
        # download the live page and inject a <base> so relative links resolve
        r2 = requests.get(link, headers=headers, verify=True, cookies=cookies, timeout=3)
        r3 = str(r2.text).replace("<!DOCTYPE html>", f'<!DOCTYPE html><head><base href="{needed_url}"><base target="_blank" href="{needed_url}"></head>')
        soup_original = BeautifulSoup(r3, 'html.parser')
        # r2.text=r2.text.replace("<!DOCTYPE html>",f'<!DOCTYPE html><head><base href="{needed_url}"><base target="_blank" href="{needed_url}"></head>')
        all_links = soup_original.find_all("a")
        all_img = soup_original.find_all("img")
        head_orig = soup_original.find("head")
        html_document = r2.text
        # keep the original serialised <a> and <img> tags to restore them later
        ultimate_links = []
        for link2 in all_links:
            ultimate_links += [str(link2)]
        ultimate_img = []
        for img in all_img:
            ultimate_img += [str(img)]
        urltest = url
        testurl = link.split("/")[2]
        needed_url = 'https://' + link.split("/")[2]
        download_folder = f'/{link.split("/")[2]}/'
        kwargs = {'bypass_robots': True, 'project_name': 'testing', 'open_in_browser': False}
        save_webpage(urltest, download_folder, **kwargs)
        # the page is saved to disk
        adress = r"C:\{0}\**\**\**\*.html".format(testurl)
        # adress+=testurl
        # adress+=r"\**\**\**\*.html"
        arr = glob.glob(adress)  # look for the saved html files
        print(arr)  # debug check of the glob output
        index_page = [page for page in arr if 'index' in page][0]  # pick the index page
        downloaded_page = ""
        with open(index_page, "r", encoding="utf-8") as f:
            try:
                downloaded_page = f.read()
            except UnicodeDecodeError:
                with open(index_page, "r", encoding="ISO-8859-1") as f:
                    try:
                        downloaded_page = f.read()
                    except UnicodeDecodeError:
                        pass
        # if downloaded_page=="":
        #     downloaded_page=r.text()
        downloaded_page = downloaded_page.encode('utf-8')
        downloaded_page = html.unescape(downloaded_page.decode('utf-8'))
        downloaded_page = downloaded_page.encode('utf-8')
        downloaded_page = downloaded_page.decode('utf-8')
        downloaded_page = downloaded_page.replace(r"file:///C:", needed_url)
        with open('reserve.html', "w", encoding="utf-8") as f:
            f.write(downloaded_page)
        print("213 checker")
        soup_downloaded = BeautifulSoup(downloaded_page, 'html.parser')
        cloud_test = soup_downloaded.get_text()
        all_p_downloaded_page = soup_downloaded.find_all("p")
        temp_str = ""
        # for word in cloud_test2:
        #     temp_str += " " + word
        # collect the words from every <p>, stripping junk characters
        word_collection = []
        text_var = []
        for p in all_p_downloaded_page:
            for_change = p.get_text()
            text1 = str(for_change)
            text2 = text1  # duplicate
            text3 = text1
            for trash in trash_symbs:
                text3 = text3.replace(trash, "")
            text3 = text3.replace(u'\n', " ")
            text3 = text3.replace(u"_", " ")
            text3 = text3.replace(u"\xa0", " ")
            text3 = text3.replace(u"\\xa0", " ")
            text3 = text3.replace(u"\\xa", " ")
            text3 = text3.replace(u"\u2009", " ")
            word_collection += text3.split(" ")
            text_var += text3.split(" ")
        text_var = [x for x in text_var if len(x) > 2]
        # build the "formatted" version of every word
        format_dict = dict()
        indeyer = len(text_var)
        indexer = 0
        for words in text_var:
            if words in FORMAT:
                try:
                    if FORMAT[words].replace:
                        words = FORMAT[words].replace  # the word is swapped for another one
                except KeyError:
                    pass
                except IndexError:
                    pass
            if len(words) < 1:
                continue
            tempword = ""
            for letter in words:
                try:
                    tempword += FORMAT[str(letter)].get()  # substitute the per-letter formatting
                except KeyError:
                    tempword += letter
                except IndexError:
                    tempword += letter
            format_dict[words] = tempword
            # print(format_dict[words])
            # print(words)
            indexer += 1
            print(f"{indexer}/{indeyer}")
        # apply the formatting to the raw html text via plain replace
        for word in text_var:
            try:
                downloaded_page = downloaded_page.replace(word, format_dict[word])
            except KeyError:
                continue
            except IndexError:
                continue
        # try to put the original <a> and <img> markup back via plain replace
        washed = BeautifulSoup(downloaded_page, 'html.parser')
        washed_links = washed.find_all('a')
        washed_img = washed.find_all('img')
        indexer = 0
        for link in washed_links:
            try:
                downloaded_page = downloaded_page.replace(str(link), ultimate_links[indexer])
            except KeyError:
                pass
            except IndexError:
                pass
            indexer += 1
        indexer = 0
        for img in washed_img:
            try:
                downloaded_page = downloaded_page.replace(str(img), str(ultimate_img[indexer]))
            except KeyError:
                pass
            except IndexError:
                pass
            indexer += 1
        with open('alchemist.html', "w", encoding="utf-8") as f:
            f.write(downloaded_page.replace('<href="', '<href="' + needed_url))
        # print(1+'1')
        # progress counter
        # downloaded_page=downloaded_page.decode('utf-8')
        needed_links = BeautifulSoup(downloaded_page, 'html.parser')
        links_for_restore = needed_links.find_all('a')
        imgs_for_restore = needed_links.find_all("img")
        indexer = 0
        for link in links_for_restore:
            string_to_replace = str(all_links[indexer])
            string_to_replace = string_to_replace.replace("file:///C:", needed_url)
            link.string = string_to_replace
            # link.string=str(str(all_links[indexer]).replace("file:///C:",needed_url))
        indexer = 0
        for img in imgs_for_restore:
            try:
                img.string = str(all_img[indexer])
            except KeyError:
                continue
            except IndexError:
                continue
        with open('hefestus.html', "w", encoding="utf-8") as f:
            f.write(downloaded_page.replace('<href="', '<href="' + needed_url))
        soup_apollo = BeautifulSoup(downloaded_page, 'html.parser')
        apollo_img = soup_apollo.find_all("img")
        apollo_links = soup_apollo.find_all("a")
        apollo_texts = soup_apollo.get_text()
        for trash in trash_symbs:
            apollo_texts = apollo_texts.replace(trash, "")
        apollo_texts = apollo_texts.replace('\n', " ")
        apollo_texts = apollo_texts.replace(u"_", " ")
        apollo_texts = apollo_texts.replace(u"\xa0", " ")
        apollo_texts = apollo_texts.replace(u"\\xa0", " ")
        apollo_texts = apollo_texts.replace(u"\\xa", " ")
        apollo_texts = apollo_texts.replace(u"\u2009", " ")
        # text_var=set(apollo_texts.split(" "))
        # restore the img attributes from the original page
        soup = BeautifulSoup(downloaded_page, 'html.parser')
        indexer = 0
        for img in soup.find_all('img'):
            reserve = str(img)
            to_replace = ultimate_img[indexer]
            testing = str(downloaded_page)
            img = to_replace
            downloaded_page = downloaded_page.replace(reserve, ultimate_img[indexer])
            try:
                img['src'] = img['src'].replace(img['src'], all_img[indexer]['src'])
            except KeyError:
                pass
            try:
                img['srcset'] = img['srcset'].replace(img['srcset'], all_img[indexer]['srcset'])
            except KeyError:
                pass
            try:
                img['title'] = img['title'].replace(img['title'], all_img[indexer]['title'])
            except KeyError:
                pass
            if testing == str(downloaded_page):
                print("did not change <><><><><")
            else:
                print("changed!!!!!!!!!!!")
            indexer += 1
            # img.string=needed_url+img.string
        testing = soup_apollo
        testing_all = testing.find_all()
        text_all = ""
        for test in testing_all:
            text_all += str(test)
        # df = pd.read_html(downloaded_page)[0]
        # print(df.to_csv(index=False))
        with open('apollo.html', "w", encoding="utf-8") as f:
            f.write(downloaded_page.replace('file:///C:', needed_url))
        print(1 + "1")
As you can see from the code, I have managed to confuse myself pretty badly.
The problem is that, for some reason, a link that was just extracted with soup.find_all("a") (the one with the broken formatting) cannot be found back in the downloaded_page document, even though it was pulled out of that very document, so I can't substitute it via replace.
Why that is, I don't know.
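My only guess so far (I'm not sure about this) is that BeautifulSoup re-serialises tags slightly differently from how they sit in the file, e.g. spacing and attribute quoting get normalised, so str(tag) never literally occurs in downloaded_page and replace() finds nothing. A tiny made-up check seems to show exactly that:

from bs4 import BeautifulSoup

page = '<p><a  href="/wiki/Foo" title=Bar>link</a>&nbsp;text</p>'
soup = BeautifulSoup(page, 'html.parser')
a = soup.find('a')

print(str(a))           # <a href="/wiki/Foo" title="Bar">link</a>
print(str(a) in page)   # False: the extra space and the unquoted attribute were normalised,
                        # so page.replace(str(a), ...) replaces nothing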