Как сделать сохранение спарсенных данных в Excel-файл на Python?
написал небольшой парсер статей и комментариев к ним, но при сохранении в excel таблицу сохраняется только первая статья (с json таких проблем не было). Вот сам код:
from bs4 import BeautifulSoup as bs
import requests
import json
import pandas as pd

# Collect article names and URLs from the index page.
url = 'https://lainelir2.pythonanywhere.com/'
req = requests.get(url)
soup = bs(req.text, 'lxml')

items = {}
for link in soup.find_all('div', class_='container-link'):
    articles_url = 'https://lainelir2.pythonanywhere.com' + link.a['href']
    articles_name = link.text.replace('\n', '')
    items[articles_name] = articles_url

with open('articles_name.json', 'w', encoding='utf-8') as file:
    json.dump(items, file, indent=4, ensure_ascii=False)

with open('articles_name.json', encoding='utf-8') as f:
    data = json.load(f)

# BUG FIX: the original rebuilt `articles_json` as a fresh one-element list
# on every iteration and wrote the Excel file inside the loop, so each pass
# overwrote the previous one and only a single article ended up in the file.
# Accumulate all rows here and write the workbook ONCE after the loop.
articles_json = []
# NOTE: after replacing '\r' and '\n' individually the longer patterns are
# already gone, so this list reduces to stripping CR/LF characters.
rep = ['\r', '\n', '\r\n\n\n\r\n', '\r\n']

for name, url in data.items():
    req = requests.get(url)
    soup = bs(req.text, 'lxml')

    articles_text = soup.find('div', class_='text-view').text
    for token in rep:  # don't shadow the `items` dict with the loop variable
        articles_text = articles_text.replace(token, '')

    comment_list = []
    for reply in soup.find_all('a', class_='text-reply'):
        comment_text = reply.text
        for token in rep:
            comment_text = comment_text.replace(token, '')
        comment_list.append(comment_text)

    articles_json.append({
        'name': name,
        'url': url,
        'text': articles_text,
        'comment': comment_list,
    })

# Write all collected rows in one go. Using ExcelWriter as a context manager
# replaces writer.save(), which was removed in pandas 2.0.
df = pd.DataFrame(articles_json)
with pd.ExcelWriter('text.xlsx', engine='xlsxwriter') as writer:
    df.to_excel(writer, sheet_name='welcom', index=True)
Ответы (1 шт):
Автор решения: Сергей Ш
→ Ссылка
from bs4 import BeautifulSoup as bs
import requests
import json
import pandas as pd

# Rows for the resulting DataFrame: one dict per article.
articles_json = []
# Mapping of article name -> article URL, dumped to JSON at the end.
items = {}

url = 'https://lainelir2.pythonanywhere.com/'
req = requests.get(url)
soup = bs(req.text, 'lxml')

# find_all() returns a materialized list, so re-binding `soup` inside the
# loop does not disturb the iteration over the index page's links.
for i in soup.find_all('div', class_='container-link'):
    url = 'https://lainelir2.pythonanywhere.com' + i.a['href']
    name = i.text.replace('\n', '')
    items[name] = url
    print(name)

    # Fetch the article page and pull its text and comments.
    req = requests.get(url)
    soup = bs(req.text, 'lxml')
    articles_text = ' '.join(soup.find('div', class_='text-view').stripped_strings)
    comment = [x.text.strip() for x in soup.find_all('a', class_='text-reply')]

    articles_json.append({
        'name': name,
        'url': url,
        'text': articles_text,
        'comment': comment,
    })

df = pd.DataFrame(articles_json)
# BUG FIX: writer.save() was deprecated in pandas 1.5 and removed in 2.0;
# the context manager saves and closes the workbook automatically.
with pd.ExcelWriter('text.xlsx', engine='xlsxwriter') as writer:
    df.to_excel(writer, sheet_name='welcom', index=True)

with open('articles_name.json', 'w', encoding='utf-8') as file:
    json.dump(items, file, indent=4, ensure_ascii=False)