How to edit an HTML document with Python

So, I'm trying to edit an HTML page. The problem is that when I recolor all the letters in the document, the data inside the tags gets edited as well. I read somewhere that BeautifulSoup can edit tags in a way that is reflected in the original HTML document.
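If I understand that idea correctly, it means: parse the page, change the tag objects, and then write the re-serialized soup back out. A minimal sketch of what I think is meant (toy markup, not my real page):

from bs4 import BeautifulSoup

html_text = '<p>Hello <img src="/pic.png"> world</p>'   # toy markup
soup = BeautifulSoup(html_text, 'html.parser')

img = soup.find('img')
img['src'] = 'https://example.com' + img['src']          # edit the attribute on the tag object

# the change only lives in the soup, so the "edited document" is str(soup), not the old string
with open('edited.html', 'w', encoding='utf-8') as f:
    f.write(str(soup))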

Right now I'm trying this variant: img['src']=needed_url+img[src]

But that variant doesn't do anything.
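As far as I can tell, two things matter for that line to have any visible effect: the second key has to be quoted (img['src'], not img[src], which would raise a NameError unless src is defined somewhere), and the modified soup has to be written out afterwards, because editing the tag does not touch the original string. Something like this, as a sketch (needed_url and the file names are placeholders from my code; urljoin is just to avoid double-prefixing absolute URLs):

from urllib.parse import urljoin
from bs4 import BeautifulSoup

needed_url = 'https://ru.wikipedia.org'                  # assumed base, as in my code

with open('reserve.html', encoding='utf-8') as f:
    soup = BeautifulSoup(f.read(), 'html.parser')

for img in soup.find_all('img'):
    if img.has_attr('src'):
        img['src'] = urljoin(needed_url, img['src'])     # only relative URLs get the prefix

with open('fixed.html', 'w', encoding='utf-8') as f:
    f.write(str(soup))                                   # the edits exist only in the soup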

I also tried doing it with replace().

So far what I get is nonsense.
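I suspect part of the problem is that BeautifulSoup re-serializes tags in its own normalized form (double quotes, a trailing "/>" on void tags like img, re-escaped entities), so str(tag) is often not byte-for-byte what is actually sitting in the file, and replace() silently replaces nothing. A tiny check (toy markup, just to illustrate):

from bs4 import BeautifulSoup

raw = "<img src='/pic.png'>"             # single quotes, no trailing slash, as in many pages
tag = BeautifulSoup(raw, 'html.parser').img

print(str(tag))                           # typically <img src="/pic.png"/> — not what was in the file
print(str(tag) in raw)                    # False, so raw.replace(str(tag), ...) does nothing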

The code, for the curious:

import glob
import html

import requests
import browser_cookie3
from bs4 import BeautifulSoup
from pywebcopy import save_webpage   # save_webpage presumably comes from pywebcopy


class WebRework(object):

    def __init__(self,url):
        global words, FORMAT
        global trash_symbs, allowedshorts, restricted, trashset, common
        global LYN, local_libraries_pool,OPERATIONS
        ENCODIN= "utf-8, deflate"
        ACCEPT = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
        DNT = "1"
        CON = "close"
        INSEC ="1"
        LANG =  "ru-RU,ru;q=0.5"
        USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0"
        url='https://ru.wikipedia.org/wiki/%D0%92%D0%B0%D0%BD_%D0%93%D0%BE%D0%B3,_%D0%92%D0%B8%D0%BD%D1%81%D0%B5%D0%BD%D1%82'
        link=url
        headers = {"user-agent" : USER_AGENT, "Accept-Language": LANG,  "Accept-Encoding": ENCODIN, "DNT": DNT, "Connection": CON, "Accept":ACCEPT, "Upgrade-Insecure-Requests":INSEC}
        cookies = browser_cookie3.chrome(domain_name=link.split("/")[2])
        needed_url='https://'+link.split("/")[2]
        r2 = requests.get(link,headers=headers, verify=True, cookies=cookies, timeout=3)
        r3=str(r2.text).replace("<!DOCTYPE html>",f'<!DOCTYPE html><head><base href="{needed_url}"><base target="_blank" href="{needed_url}"></head>')
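        # note: this slips an extra <head><base ...></head> between the doctype and <html>,
        # on top of the page's real <head>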
        soup_original = BeautifulSoup(r3,'html.parser')
        #r2.text=r2.text.replace("<!DOCTYPE html>",f'<!DOCTYPE html><head><base href="{needed_url}"><base target="_blank" href="{needed_url}"></head>')
        all_links = soup_original.find_all("a")
        all_img = soup_original.find_all("img")
        head_orig=soup_original.find("head")
        html_document = r2.text
        ultimate_links=[]
        for link2 in all_links:
            ultimate_links+=[str(link2)]
        ultimate_img=[]
        for img in all_img:
            ultimate_img+=[str(img)]
        
        
        
        urltest = url
        testurl=link.split("/")[2]
        needed_url='https://'+link.split("/")[2]
        download_folder = f'/{link.split("/")[2]}/'
        kwargs = {'bypass_robots': True, 'project_name': 'testing', 'open_in_browser':False}
        save_webpage(urltest, download_folder, **kwargs)
        # the page is saved to disk
        adress=r"C:\{0}\**\**\**\*.html".format(testurl)
        #adress+=testurl
        #adress+=r"\**\**\**\*.html"
        arr=glob.glob(adress) # look for the saved index page
        print(arr) # debug check of what was found
        index_page = [page for page in arr if 'index' in page][0] # pick the index page
        downloaded_page=""
        with open(index_page, "r", encoding="utf-8") as f: 
            try:
                downloaded_page=f.read()
            except UnicodeDecodeError:
                with open(index_page, "r", encoding="ISO-8859-1") as f:
                    try:
                        downloaded_page=f.read()
                    except UnicodeDecodeError:
                        pass

        
        #if downloaded_page=="":
        #    downloaded_page=r.text()
        downloaded_page=downloaded_page.encode('utf-8')     
        downloaded_page=html.unescape(downloaded_page.decode('utf-8'))   
         
        downloaded_page=downloaded_page.encode('utf-8')
        downloaded_page=downloaded_page.decode('utf-8')
        downloaded_page=downloaded_page.replace(r"file:///C:",needed_url) 
        with open('reserve.html', "w", encoding="utf-8") as f:
            f.write(downloaded_page)
            print("213 checker")      

        soup_downloaded=BeautifulSoup(downloaded_page,'html.parser') 
        cloud_test=soup_downloaded.get_text()


        all_p_downloaded_page=soup_downloaded.find_all("p")
        temp_str=""
        #for word in cloud_test2:
        #   temp_str+=" "+word
        word_collection=[]
        text_var=[]
        for p in all_p_downloaded_page:
            for_change = p.get_text()
            text1=str(for_change)
            text2=text1 # duplicate
            text3=text1
            for trash in trash_symbs:
                text3=text3.replace(trash,"")
            text3=text3.replace(u'\n'," ")
            text3=text3.replace(u"_"," ")
            text3=text3.replace(u"\xa0"," ")
            text3=text3.replace(u"\\xa0"," ") 
            text3=text3.replace(u"\\xa"," ")
            text3=text3.replace(u"\u2009"," ")
            word_collection+=text3.split(" ")

            text_var+=text3.split(" ")
        

        text_var = [x for x in text_var if len(x)>2]

        format_dict=dict()
        indeyer=len(text_var)
        
        indexer=0
        for words in text_var:
            if words in FORMAT:
                try: 
                    if FORMAT[words].replace:
                        words=FORMAT[words].replace # the word is replaced with another one
                except KeyError:
                    pass
                except IndexError:
                    pass
            if len(words)<1:
                continue
            tempword=""
            for letter in words:
            
                try:
                    tempword+=FORMAT[str(letter)].get() # substitute the formatting for this letter
                except KeyError:
                    tempword+=letter
                except IndexError:
                    tempword+=letter
            
            format_dict[words]=tempword

        
            #print(format_dict[words])
            #print(words)            
            
            indexer+=1
            print(f"{indexer}/{indeyer}")
      

        for word in text_var:
            try:
                downloaded_page=downloaded_page.replace(word,format_dict[word])
            except KeyError:
                continue
            except IndexError:
                continue  
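        # note: this plain-text replace also rewrites matches inside tag attributes
        # (href, src, class, ...), which is what mangles the links and images in the first place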

        washed=BeautifulSoup(downloaded_page,'html.parser')

        washed_links=washed.find_all('a')
        washed_img=washed.find_all('img')

        indexer=0
        for link in washed_links:
            try:
                downloaded_page=downloaded_page.replace(str(link),ultimate_links[indexer])
            except KeyError:
                pass
            except IndexError:
                pass
            indexer+=1

        indexer=0
        for img in washed_img:
            try:
                downloaded_page=downloaded_page.replace(str(img),str(ultimate_img[indexer]))
            except KeyError:
                pass
            except IndexError:
                pass
            indexer+=1
        with open('alchemist.html', "w", encoding="utf-8") as f:
            f.write(downloaded_page.replace('<href="','<href="'+needed_url))

        # print(1+'1')
        # progress counter
        #downloaded_page=downloaded_page.decode('utf-8')
        
        needed_links=BeautifulSoup(downloaded_page,'html.parser')
        links_for_restore=needed_links.find_all('a')
        imgs_for_restore=needed_links.find_all("img")
        indexer=0
        for link in links_for_restore:
            string_to_replace=str(all_links[indexer])
            string_to_replace=string_to_replace.replace("file:///C:",needed_url)
            # note: assigning markup to .string stores it as escaped text, not as a nested tag
            link.string=string_to_replace
            #link.string=str(str(all_links[indexer]).replace("file:///C:",needed_url))
            indexer+=1  # this was missing, so every link got all_links[0]
        indexer=0
        for img in imgs_for_restore:
            try:
                img.string=str(all_img[indexer])
            except (KeyError, IndexError):
                pass
            indexer+=1  # advance even if the lookup failed
        with open('hefestus.html', "w", encoding="utf-8") as f:
            f.write(downloaded_page.replace('<href="','<href="'+needed_url))
        
        soup_apollo=BeautifulSoup(downloaded_page,'html.parser')
        apollo_img=soup_apollo.find_all("img")
        apollo_links=soup_apollo.find_all("a")
        apollo_texts=soup_apollo.get_text()
        for trash in trash_symbs:
            apollo_texts=apollo_texts.replace(trash,"")
        apollo_texts=apollo_texts.replace('\n'," ")
        apollo_texts=apollo_texts.replace(u"_"," ")
        apollo_texts=apollo_texts.replace(u"\xa0"," ")
        apollo_texts=apollo_texts.replace(u"\\xa0"," ") 
        apollo_texts=apollo_texts.replace(u"\\xa"," ")
        apollo_texts=apollo_texts.replace(u"\u2009"," ")
        #text_var=set(apollo_texts.split(" "))
        soup=BeautifulSoup(downloaded_page,'html.parser')
        indexer=0
        for img in soup.find_all('img'):
            reserve=str(img)
            testing=str(downloaded_page)
            # string-level replacement: only works if str(img) occurs verbatim in downloaded_page
            downloaded_page=downloaded_page.replace(reserve,ultimate_img[indexer])
            # tree-level replacement: copy the attributes back from the original tag
            # (the old img=to_replace rebinding turned img into a str and broke these assignments)
            try:
                img['src']=all_img[indexer]['src']
            except KeyError:
                pass
            try:
                img['srcset']=all_img[indexer]['srcset']
            except KeyError:
                pass
            try:
                img['title']=all_img[indexer]['title']
            except KeyError:
                pass
            if testing==str(downloaded_page):
                print("did not change <><><><><")
            else:
                print("changed !!!!!!!!!!!")
            indexer+=1
            
            #img.string=needed_url+img.string
        testing=soup_apollo
        testing_all=testing.find_all()
        text_all=""
        for test in testing_all:
            text_all+=str(test)


        
        #df = pd.read_html(downloaded_page)[0]
        #print(df.to_csv(index=False))
        with open('apollo.html', "w", encoding="utf-8") as f:
            f.write(downloaded_page.replace('file:///C:',needed_url))

        # print(1+"1")  # would raise TypeError (int + str)

As you can see from the code, I have already gotten myself thoroughly confused.

The problem is that, for some reason, a tag freshly extracted with soup.find_all("a") (the one with the wrong formatting) cannot be found in the downloaded_page document, even though it was pulled out of that very document, so I cannot swap it out with replace().

Why that happens, I do not know.
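My current guess is that this is the same re-serialization issue as above: str(tag) is BeautifulSoup's normalized rendering, not the exact bytes in downloaded_page, and the page has also been run through html.unescape() and word-level replace() by that point. If that is right, the whole thing should probably be done on one soup and written out once at the end, roughly like this (a sketch; reformat_text is a placeholder for my FORMAT-based recoloring):

from bs4 import BeautifulSoup

def reformat_text(text):
    # placeholder for my FORMAT-based recoloring of words/letters
    return text

with open('reserve.html', encoding='utf-8') as f:
    soup = BeautifulSoup(f.read(), 'html.parser')

# edit the text nodes in place instead of replacing substrings of the raw file
for p in soup.find_all('p'):
    for node in p.find_all(string=True):
        # note: if the formatting inserts markup (e.g. <span> wrappers),
        # it would have to be parsed into tags first, or it gets escaped here
        node.replace_with(reformat_text(str(node)))

# links and images keep their original attributes, because their tags were never touched
with open('apollo.html', 'w', encoding='utf-8') as f:
    f.write(str(soup))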

