import requests
from bs4 import BeautifulSoup
import string
# Prompt for a nature.com listing page and collect absolute URLs for every
# article card found on it. Leaves `stat` (HTTP status) and `urls` (list of
# article links) for the download loop below.
url = input("> ")
r = requests.get(url, headers={'Accept-Language': 'en-US,en;q=0.5'})
stat = r.status_code
s = BeautifulSoup(r.content, "lxml")

# Each article teaser sits in a card-body div; its first <a> points at the article.
f = s.find_all('div', 'c-card__body u-display-flex u-flex-direction-column')
href = []
for card in f:  # BUG FIX: original iterated the whole soup (`for i in s`), crashing on non-Tag children and ignoring the cards entirely
    link = card.find('a')
    if link is not None and link.get('href') is not None:
        href.append(link.get('href'))

types = s.find_all('span', 'c-meta__type')  # article-type labels (e.g. "News"); collected but not filtered on here
urls = [f'https://www.nature.com{h}' for h in href]
if not urls:
    print("This site does not contain articles with the type of news")
# Download each collected article, extract its body paragraphs, and save them
# to a file named after the title (punctuation stripped, spaces -> underscores).
saved_articles = []
for article_url in urls:  # renamed: original reused `url`, shadowing the listing URL above
    r2 = requests.get(article_url, headers={'Accept-Language': 'en-US,en;q=0.5'})
    s = BeautifulSoup(r2.content, "lxml")
    body = s.find('div', 'c-article-body u-clearfix')
    if body is None:  # BUG FIX: original crashed with AttributeError when the body div is absent
        continue
    # Only paragraphs with an empty class attribute are plain article text.
    paragraphs = body.find_all('p', {'class': ''})
    title = s.title.text
    # Build a filesystem-friendly name: drop ASCII punctuation, then map
    # spaces to underscores and delete em dashes in a single translate pass.
    titlewp = title.translate(str.maketrans('', '', string.punctuation))
    article_name = titlewp.translate(str.maketrans(" ", "_", "—"))
    while "__" in article_name:  # collapse runs of underscores left by removed characters
        article_name = article_name.replace("__", "_")
    saved_articles.append(f'{article_name}.txt')
    # BUG FIX: context manager guarantees the file is closed even on error;
    # "wb" suffices since the unused "+" read mode added nothing.
    with open(f'{article_name}.txt', "wb") as file:
        for p in paragraphs:
            file.write(bytes(p.text, 'utf8'))
print(f'Saved articles: {saved_articles}')
# Report the outcome of the initial listing-page request.
if stat != 200:
    print("The URL returned ", stat, "!")
else:
    print("Content saved")