Scraping, bypassing Cloudflare

I need help writing a scraper for WebMD. The goal is to scrape the list of doctors and all the information about each doctor (over 100k pages). The scraper is set up and ready to run, but I've hit Cloudflare protection with a captcha, and it has me stuck.

I've tried proxies, user-agent settings, even session pooling, but I run into the captcha every time. Any ideas on how to get past this protection and solve the captcha automatically? I'd be grateful for any advice or ready-made solutions.
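
Roughly the kind of setup I tried (a simplified sketch; the proxy address and user-agent strings below are placeholders, not my real values):

import random
import requests

# Placeholder values for illustration only
PROXIES = [
    {"http": "http://user:pass@proxy1.example:8080", "https": "http://user:pass@proxy1.example:8080"},
]
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
]

def fetch(url):
    # Pooled session with a rotated user agent and proxy; Cloudflare still serves the challenge page
    session = requests.Session()
    session.headers["User-Agent"] = random.choice(USER_AGENTS)
    return session.get(url, proxies=random.choice(PROXIES), timeout=30)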

P.S. The question isn't about bypassing protection for its own sake, but about automating data collection.
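
One direction I'm looking at is swapping plain Selenium for undetected-chromedriver, which patches the driver to hide common automation fingerprints. This is only a sketch; I have not verified that it actually gets past WebMD's Cloudflare challenge:

# Assumes `pip install undetected-chromedriver`; untested against WebMD
import undetected_chromedriver as uc

driver = uc.Chrome()  # patched ChromeDriver with common automation markers removed
driver.get("https://doctor.webmd.com/providers/new-york/islip")
print(driver.title)
driver.quit()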

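# Stage 1: collect provider profile URLs from the city listing pages into per-city SQLite databases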
import threading
from sqlalchemy import create_engine, Column, Integer, String
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from selenium import webdriver
from selenium.webdriver.chrome.service import Service  # Import the Service class
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import requests
from bs4 import BeautifulSoup
import time

driver_path = 'chromedriver.exe' 
service = Service(driver_path)

driver = webdriver.Chrome(service=service)

def extract_urls_from_page(soup, target_zipcodes):
    ul_element = soup.find("ul", class_="resultslist-content")
    urls = []

    if ul_element:
        for list_item in ul_element.find_all("li"):
            a_tag = list_item.find("a")
            if a_tag:
                url = a_tag.get("href")

                response = requests.get(url)
                if response.status_code == 200:
                    inner_soup = BeautifulSoup(response.content, 'html.parser')
                    zips_on_page = inner_soup.findAll('span', {'class': "location-zipcode loc-coi-loczip"})
                    for zip_span in zips_on_page:
                        if zip_span.text.strip() in target_zipcodes:
                            urls.append(url)
                            break
    return urls

def extract_and_save_urls(base_url, db_url):
    # Database setup for each thread
    Base = declarative_base()

    class SitesCrawled(Base):
        __tablename__ = 'sites_crawled'
        id = Column(Integer, primary_key=True)
        site = Column(String(1024))

    engine = create_engine(db_url)
    Base.metadata.create_all(engine)
    Session = sessionmaker(bind=engine)
    session = Session()

    driver = webdriver.Chrome()
    driver.maximize_window()
    driver.get(base_url)

    target_zipcodes = [
        '00501', '06390', '11701', '11702', '11703', '11704', '11705', '11706',
        '11707', '11708', '11713', '11715', '11716', '11717', '11718', '11719',
        '11720', '11721', '11722', '11724', '11725', '11726', '11727', '11729',
        '11730', '11731', '11733', '11734', '11738', '11739', '11740', '11741',
        '11742', '11743', '11745', '11746', '11747', '11749', '11750', '11751',
        '11752', '11754', '11755', '11757', '11760', '11763', '11764', '11766',
        '11767', '11768', '11769', '11770', '11772', '11775', '11776', '11777',
        '11778', '11779', '11780', '11782', '11784', '11786', '11787', '11788',
        '11789', '11790', '11792', '11794', '11795', '11796', '11798', '11901',
        '11930', '11931', '11932', '11933', '11934', '11935', '11937', '11939',
        '11940', '11941', '11942', '11944', '11946', '11947', '11948', '11949',
        '11950', '11951', '11952', '11953', '11954', '11955', '11956', '11957',
        '11958', '11959', '11960', '11961', '11962', '11963', '11964', '11965',
        '11967', '11968', '11969', '11970', '11971', '11972', '11973', '11975',
        '11976', '11977', '11978', '11980'
    ]

    try:
        while True:
            time.sleep(3)
            # Close ads (if any)

            soup = BeautifulSoup(driver.page_source, 'html.parser')
            current_page_urls = extract_urls_from_page(soup, target_zipcodes)

            for url in current_page_urls:
                existing_entry = session.query(SitesCrawled).filter_by(site=url).first()
                if not existing_entry:
                    entry = SitesCrawled(site=url)
                    session.add(entry)
                    session.commit()

            try:
                next_button = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, ".btn-next[title='Next Page']"))
                )
                driver.execute_script("arguments[0].click();", next_button)
            except (NoSuchElementException, TimeoutException):
                break
    finally:
        driver.quit()
        session.close()

# Base URLs and Database URLs
base_urls = [
    "https://doctor.webmd.com/providers/new-york/islip",
    "https://doctor.webmd.com/providers/new-york/southold",
    "https://doctor.webmd.com/providers/new-york/babylon",
    "https://doctor.webmd.com/providers/new-york/brookhaven",
    "https://doctor.webmd.com/providers/new-york/huntington",
    "https://doctor.webmd.com/providers/new-york/smithtown",
    "https://doctor.webmd.com/providers/new-york/shelter-island",
    "https://doctor.webmd.com/providers/new-york/east-hampton",
    "https://doctor.webmd.com/providers/new-york/southampton",
    "https://doctor.webmd.com/providers/new-york/riverhead"
]

db_urls = [
    "sqlite:///sites_crawled1.db3",
    "sqlite:///sites_crawled2.db3",
    "sqlite:///sites_crawled3.db3",
    "sqlite:///sites_crawled4.db3",
    "sqlite:///sites_crawled5.db3",
    "sqlite:///sites_crawled6.db3",
    "sqlite:///sites_crawled7.db3",
    "sqlite:///sites_crawled8.db3",
    "sqlite:///sites_crawled9.db3",
    "sqlite:///sites_crawled10.db3",
    
]

# Create and start threads
threads = []
for i in range(len(base_urls)):
    thread = threading.Thread(target=extract_and_save_urls, args=(base_urls[i], db_urls[i]))
    threads.append(thread)
    thread.start()

# Wait for all threads to finish
for thread in threads:
    thread.join()

print("All URLs processed and saved to the respective databases.")

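# Stage 2: visit each saved provider URL and extract the doctor's details into the scraped_doctors_info tables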
import requests
from bs4 import BeautifulSoup
import re
import sqlalchemy as sa
from sqlalchemy.orm import sessionmaker
from sqlalchemy import create_engine, Column, Integer, String, DateTime, Text
from sqlalchemy.ext.declarative import declarative_base
import json
from selenium import webdriver
from selenium.webdriver.chrome.service import Service  # Import the Service class
from selenium.webdriver.common.by import By
from multiprocessing import Pool

# Street-type suffixes used to split the street line from any internal part (suite, unit, etc.)
street_suffixes = [
    ' Boulevard ', ' BOULEVARD ', ' Broadway ', ' BROADWAY ', ' Building ', ' BUILDING ',
    ' Parkway ', ' PARKWAY ', ' Highway ', ' HIGHWAY ', ' Avenue ', ' AVENUE ',
    ' Street ', ' STREET ', ' Drive ', ' DRIVE ', ' Circle ', ' CIRCLE ',
    ' Place ', ' PLACE ', ' ROAD ', ' Road ', ' Hwy ', ' Lane ', ' LANE ',
    ' WAY ', ' Way ', ' Dr ', ' DR ', ' Ave, ', ' Ave ', ' AVE ', ' St ', ' ST ',
    ' Rd ', ' RD ', ' Hw ', ' Blvd ', ' BLVD ', ' Ln ', ' 25a ', ' 25A ',
    ' Pkwy ', ' PKWY ', ' Grn ', ' Ste ', ' STE ', ' Ctr ', ' HWY ', ' Plz ', ' PLZ ',
]

driver_path = 'chromedriver.exe' 
service = Service(driver_path)

driver = webdriver.Chrome(service=service)

def extract_address_parts_webmd(address):
    parts_pattern = r'\(([^)]*)\)'
    parts = re.findall(parts_pattern, address)
    if len(parts) < 4:
        return None, None, None, None, None

    street = parts[0].strip()
    city = parts[1].strip().rstrip(',')
    state = parts[2].strip().rstrip(',') 
    zipcode = parts[3].strip()

    street_internal = None
    for suffix in street_suffixes:
        if suffix in street:
            split_street = street.split(suffix, 1)
            if len(split_street) > 1:
                street = split_street[0] + suffix
                street_internal = split_street[1].strip().rstrip('.')
    return street, street_internal, city, state, zipcode

def parse_single_address(address_str):
    address_dict = {}
    phone_number = None
    
    street, street_internal, city, state, zipcode = extract_address_parts_webmd(address_str)

    unit_markers = (' Ste ', ' STE ', ' UNIT ', ' Suite ', ' APT ', ' Bldg ')
    if street and any(marker in street for marker in unit_markers):
        substring = next(marker for marker in unit_markers if marker in street)
        parts = street.split(substring, 1)
        street = parts[0].strip()
        street_internal = substring + parts[1]
     
    address_dict['Street'] = street
    address_dict['Street_Internal'] = street_internal
    address_dict['City'] = city
    address_dict['State'] = state
    address_dict['ZipCode'] = zipcode
    return address_dict, phone_number

def extract_name_parts(full_name):
    title, first_name, middle_name, last_name, prefix, suffix = '', '', '', '', '', ''
    titles = ["Dr", "Md"]
    prefixes = ['Jr', 'Sr', 'II', 'III', 'IV', 'V']
    suffixes = ['MD', 'PhD', 'DO', 'NPC', 'NP', 'DPM', 'DC', 'FACS','DDS', 'OD', 'PsyD', 'RNP', 'DMD']
    words = full_name.split()

    if words and words[0].startswith(tuple(titles)):
        title = words.pop(0).rstrip(".")

    if words and words[-1] in suffixes:
        suffix = words.pop()
        while words and words[-1].rstrip(',') in suffixes:
            suffix = words.pop().rstrip(',') + ', ' + suffix
    elif words and ',' in words[-1]:
        last_word = words.pop().split(',')
        if last_word[-1] in suffixes:
            suffix = last_word[-1]
            words.append(last_word[0])

    if words:
        first_name = words.pop(0)

    if words:
        last_name = words.pop().rstrip(',')

    if last_name in prefixes:
        prefix = last_name
        last_name = words.pop().rstrip(',')

    if words:
        middle_name = " ".join(words).rstrip(".")

    return title, first_name, middle_name, last_name, prefix, suffix

def scrape_doctor_info(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    doctor_name_title = soup.find('h1', {'class':'provider-full-name'})
    if doctor_name_title:
        title, first_name, middle_name, last_name, prefix, suffix = extract_name_parts(doctor_name_title.text)
    else:
        title, first_name, middle_name, last_name, prefix, suffix = 'None', 'None', 'None', 'None', 'None', 'None'

    doctor_specialty = soup.find('div', {'class':'prov-specialties-wrap'})
    doctor_education = soup.findAll('div', {"class":"education-wrapper webmd-row"})

    street_elements = soup.findAll('div', {'class': "location-address loc-coi-locad webmd-row"})
    streets = [text.get_text().strip() for text in street_elements]

    city_elements = soup.findAll('span', {'class': "location-city loc-coi-loccty"})
    cities = [text.get_text().strip() for text in city_elements]

    state_elements = soup.findAll('span', {'class': "location-state loc-coi-locsta"})
    states = [text.get_text().strip() for text in state_elements]

    zipcode_elements = soup.findAll('span', {'class': "location-zipcode loc-coi-loczip"})
    zipcodes = [text.get_text().strip() for text in zipcode_elements]

    phone_elements = soup.findAll('div', {'class': "location-phone webmd-row"})
    phones = [text.get_text().strip() for text in phone_elements]

    phones += [None] * (len(streets) - len(phones))

    address_list = [
        f"({street})({city})({state})({zipcode}){' ' + phone if phone else ''}"
        for street, city, state, zipcode, phone in zip(streets, cities, states, zipcodes, phones)
    ]
    
    address_str = ' || '.join(address_list)
    if address_str.count("||") > 3:
        # More than four locations listed: open the page in a browser, close ad
        # overlays, click "Show more" so every address is in the DOM, then re-extract.
        driver = webdriver.Chrome()
        driver.maximize_window()
        driver.get(url)

        for ad_xpath in (
            "//*[@id=\"adserved-ppg\"]/div/div/div[1]/div[2]",
            "//*[@id=\"adserved-ppg\"]/div/div/div/div/div[1]",
            "//*[@id=\"bottombannerad\"]/button",
        ):
            try:
                driver.find_element(By.XPATH, ad_xpath).click()
            except Exception:
                pass

        try:
            show_more_button = driver.find_element(By.XPATH, "//*[@id=\"office-info\"]/div/div[2]/div[2]/button")
            show_more_button.click()
        except Exception:
            pass

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        driver.close()

        # Rebuild the address list from the fully expanded page
        streets = [el.get_text().strip() for el in soup.findAll('div', {'class': "location-address loc-coi-locad webmd-row"})]
        cities = [el.get_text().strip() for el in soup.findAll('span', {'class': "location-city loc-coi-loccty"})]
        states = [el.get_text().strip() for el in soup.findAll('span', {'class': "location-state loc-coi-locsta"})]
        zipcodes = [el.get_text().strip() for el in soup.findAll('span', {'class': "location-zipcode loc-coi-loczip"})]
        phones = [el.get_text().strip() for el in soup.findAll('div', {'class': "location-phone webmd-row"})]
        phones += [None] * (len(streets) - len(phones))

        address_list = [
            f"({street})({city})({state})({zipcode}){' ' + phone if phone else ''}"
            for street, city, state, zipcode, phone in zip(streets, cities, states, zipcodes, phones)
        ]
        address_str = ' || '.join(address_list)

    university = [text.get_text() for text in doctor_education]
    university_str = ' || '.join(university)
    

    street_internal = None
    street = '' if 'street' not in locals() else street

    doctor_data = {
        "Website": "WebMD",
        "Name": doctor_name_title.text if doctor_name_title else 'None',
        "Title": title,
        "FirstName": first_name,
        "MiddleName": middle_name,
        "LastName": last_name,
        "Prefix": prefix,
        "Suffix": suffix,
        "Specialty": ' | '.join(doctor_specialty.text.split('  ')) if doctor_specialty else 'None',
        "University": university_str,
        "Address": address_str,
        "Street_Internal": street_internal,
        "PhoneNumber": 'none'
    }
    if '||' not in doctor_data['Address']:
        parsed_address, phone_number = parse_single_address(doctor_data['Address'])
        doctor_data['Address'] = json.dumps(parsed_address)
        doctor_data['PhoneNumber'] = phone_number

    return doctor_data

def extract_url_key(url):
    match = re.search(r'([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})', url)
    return match.group(0) if match else None

DATABASE_URLS = [
    "sqlite:///sites_crawled1.db3",
    "sqlite:///sites_crawled2.db3",
    "sqlite:///sites_crawled3.db3",
    "sqlite:///sites_crawled4.db3",
    "sqlite:///sites_crawled5.db3",
    "sqlite:///sites_crawled6.db3",
    "sqlite:///sites_crawled7.db3",
    "sqlite:///sites_crawled8.db3",
    "sqlite:///sites_crawled9.db3",
    "sqlite:///sites_crawled10.db3"
]


Base = declarative_base()
class SitesCrawled(Base):
    __tablename__ = 'sites_crawled'
    id = Column(Integer, primary_key=True)
    site = Column(String(1024))

class DoctorScrapedInfo(Base):
    __tablename__ = 'scraped_doctors_info'
    id = Column(Integer, primary_key=True)
    site_crawled_id = Column(Integer, nullable=False)
    url_key = Column(String(1024))
    site = Column(String(1024), nullable=False)
    website = Column(String(1024))
    name = Column(String(1024))
    title = Column(String(1024))
    first_name = Column(String(1024))
    middle_name = Column(String(1024))
    last_name = Column(String(1024))
    personal_suffix = Column(String(1024))  
    clinical_suffix = Column(String(1024))
    specialty = Column(String(1024))
    university = Column(String(1024))
    street = Column(String(1024))     
    street_internal = Column(String(1024))
    city = Column(String(1024))       
    state = Column(String(1024))       
    zip_code = Column(String(1024))    
    phone_number = Column(String(1024))
    composite_key = Column(String(1024))
    

    @property
    def generate_composite_key(self):
        return f"{self.first_name}_{self.last_name}_{self.street}_{self.zip_code}"

class MultipleAddresses(Base):
    __tablename__ = 'multiple_addresses'
    id = Column(Integer, primary_key=True)
    doctor_scraped_info_id = Column(Integer, nullable=False)
    street = Column(String(1024))
    street_internal = Column(String(1024))
    city = Column(String(1024))
    state = Column(String(1024))
    zip_code = Column(String(1024))
    phone_number = Column(String(1024))
    composite_key = Column(String(1024))



# Define your process_database function
def process_database(db_url):
  
    # SQLAlchemy setup
    engine = sa.create_engine(db_url)
    Base.metadata.create_all(engine)
    Session = sessionmaker(bind=engine)

    # Create a new session for each database
    session = Session()


    urls = session.query(SitesCrawled.id, SitesCrawled.site).all()

    for url in urls:
        try:
            doctor_info = scrape_doctor_info(url[1])
            if doctor_info:
                title, first_name, middle_name, last_name, prefix, suffix = extract_name_parts(doctor_info['Name'])
                street = "multiple addresses" if '||' in doctor_info['Address'] else ''
                street_internal = None
                city = None
                state = None
                zip_code = None

                if not street:
                    address_dict = json.loads(doctor_info['Address'])
                    street = address_dict.get('Street')
                    street_internal = address_dict.get('Street_Internal')
                    city = address_dict.get('City')
                    state = address_dict.get('State')
                    zip_code = address_dict.get('ZipCode')

                if street:
                    doctor = DoctorScrapedInfo(
                        site_crawled_id=url[0], site=url[1], website=doctor_info['Website'], 
                        name=doctor_info['Name'], title=title, first_name=first_name, 
                        middle_name=middle_name, last_name=last_name, personal_suffix=prefix, 
                        clinical_suffix=suffix, specialty=doctor_info['Specialty'], 
                        university=doctor_info['University'], street=street, 
                        street_internal=street_internal, city=city, 
                        state=state, zip_code=zip_code, phone_number=doctor_info['PhoneNumber']
                    )

                    # Setting composite_key and url_key after creating the doctor instance
                    doctor.composite_key = doctor.generate_composite_key
                    doctor.url_key = extract_url_key(url[1])

                    session.add(doctor)
                    session.commit()
                    session.flush()


                if '||' in doctor_info['Address']:
                    addresses = doctor_info['Address'].split('||')
                    for address in addresses:
                        address = address.strip()
                        if address:
                            phone_number = None
                            tel_index = address.find('Tel:')
                            if tel_index != -1:
                                phone_number = address[tel_index+len('Tel:'):].strip()
                                address = address[:tel_index].strip()

                            street, street_internal, city, state, zipcode = extract_address_parts_webmd(address)

                            multiple_address = MultipleAddresses(
                                doctor_scraped_info_id=doctor.id,
                                street=street, 
                                street_internal=street_internal, 
                                city=city, 
                                state=state, 
                                zip_code=zipcode, 
                                phone_number=phone_number
                            )

                            # Generate the Composite Key using data from DoctorScrapedInfo
                            composite_key = f"{doctor.first_name}_{doctor.last_name}_{street}_{zipcode}"
                            multiple_address.composite_key = composite_key

                            session.add(multiple_address)

                    session.commit()

        except Exception as e:
            print(f"Error processing URL {url[1]}: {str(e)}\n")

    session.close()

# Ensure the multiprocessing code only runs when the script is executed directly
if __name__ == '__main__':
    # Create a pool of worker processes
    with Pool(len(DATABASE_URLS)) as pool:
        # Map the process_database function to each database URL
        pool.map(process_database, DATABASE_URLS)

Answers (0):