Как сделать сохранение спарсенных данных в Excel-файл на Python?
написал небольшой парсер статей и комментариев к ним, но при сохранении в excel таблицу сохраняется только первая статья (с json таких проблем не было). Вот сам код:
from bs4 import BeautifulSoup as bs
import requests
import json
import pandas as pd

# Collect article names and URLs from the index page.
url = 'https://lainelir2.pythonanywhere.com/'
req = requests.get(url)
soup = bs(req.text, 'lxml')

items = {}
for link in soup.find_all('div', class_='container-link'):
    articles_url = 'https://lainelir2.pythonanywhere.com' + link.a['href']
    articles_name = link.text.replace('\n', '')
    items[articles_name] = articles_url

with open('articles_name.json', 'w', encoding='utf-8') as file:
    json.dump(items, file, indent=4, ensure_ascii=False)

with open('articles_name.json', encoding='utf-8') as f:
    data = json.load(f)

# BUG FIX: the original rebuilt `articles_json` as a fresh one-element list
# on every iteration and wrote the Excel file inside the loop, so each pass
# overwrote the previous one and only a single article ended up in the file.
# Accumulate all rows here and write the workbook ONCE after the loop.
articles_json = []
# NOTE: after replacing '\r' and '\n' individually the longer patterns are
# already gone, so this list reduces to stripping CR/LF characters.
rep = ['\r', '\n', '\r\n\n\n\r\n', '\r\n']

for name, url in data.items():
    req = requests.get(url)
    soup = bs(req.text, 'lxml')

    articles_text = soup.find('div', class_='text-view').text
    for token in rep:  # don't shadow the `items` dict with the loop variable
        articles_text = articles_text.replace(token, '')

    comment_list = []
    for reply in soup.find_all('a', class_='text-reply'):
        comment_text = reply.text
        for token in rep:
            comment_text = comment_text.replace(token, '')
        comment_list.append(comment_text)

    articles_json.append({
        'name': name,
        'url': url,
        'text': articles_text,
        'comment': comment_list,
    })

# Write all collected rows in one go. Using ExcelWriter as a context manager
# replaces writer.save(), which was removed in pandas 2.0.
df = pd.DataFrame(articles_json)
with pd.ExcelWriter('text.xlsx', engine='xlsxwriter') as writer:
    df.to_excel(writer, sheet_name='welcom', index=True)
Ответы (1 шт):
Автор решения: Сергей Ш
→ Ссылка
from bs4 import BeautifulSoup as bs
import requests
import json
import pandas as pd

# Rows for the resulting DataFrame: one dict per article.
articles_json = []
# Mapping of article name -> article URL, dumped to JSON at the end.
items = {}

url = 'https://lainelir2.pythonanywhere.com/'
req = requests.get(url)
soup = bs(req.text, 'lxml')

# find_all() returns a materialized list, so re-binding `soup` inside the
# loop does not disturb the iteration over the index page's links.
for i in soup.find_all('div', class_='container-link'):
    url = 'https://lainelir2.pythonanywhere.com' + i.a['href']
    name = i.text.replace('\n', '')
    items[name] = url
    print(name)

    # Fetch the article page and pull its text and comments.
    req = requests.get(url)
    soup = bs(req.text, 'lxml')
    articles_text = ' '.join(soup.find('div', class_='text-view').stripped_strings)
    comment = [x.text.strip() for x in soup.find_all('a', class_='text-reply')]

    articles_json.append({
        'name': name,
        'url': url,
        'text': articles_text,
        'comment': comment,
    })

df = pd.DataFrame(articles_json)
# BUG FIX: writer.save() was deprecated in pandas 1.5 and removed in 2.0;
# the context manager saves and closes the workbook automatically.
with pd.ExcelWriter('text.xlsx', engine='xlsxwriter') as writer:
    df.to_excel(writer, sheet_name='welcom', index=True)

with open('articles_name.json', 'w', encoding='utf-8') as file:
    json.dump(items, file, indent=4, ensure_ascii=False)