Combined crawler and extraction

import requests
import time
import random
from bs4 import BeautifulSoup

Global variables

URL_list = []              # queue of directory (topic) pages to crawl
URL_parent_Category = {}   # URL -> category label of that page (used as parent for links found on it)
categoryLevel = {}         # category -> depth in the topic hierarchy
history = {}               # URL -> HTTP status code (or "Error"); used to skip duplicates
final_URLs = {}            # end-node URL -> (category, parent_category, level)
parsed = 0                 # number of URLs processed so far
n_URLs = 1                 # total number of URLs discovered (starts with the seed)
max_URLs = 5000            # hard limit on the number of pages to crawl

Base URLs

URL_base1 = "https://mathworld.wolfram.com/topics/" # for directory pages (root)
URL_base2 = "https://mathworld.wolfram.com/" # for final pages

Seed URL and Category

seed_URL = "https://mathworld.wolfram.com/topics/ProbabilityandStatistics.html"
seed_category = "Probability and Statistics"
categoryLevel[seed_category] = 1 # Start category level

Proxy setup (optional; fill in credentials below and enable the commented-out proxies argument in get_request_with_retries if needed)

proxies = {
    'http': 'http://username:password@proxy_url:proxy_port',
    'https': 'https://username:password@proxy_url:proxy_port',
}

Validate function to filter unwanted links

def validate(string):
    Ignore = ['about/', 'classroom/', 'contact/', 'whatsnew/', 'letters/']
    return len(string) <= 60 and string not in Ignore and 'topics' not in string
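
As a quick sanity check, here is how this rule behaves on a few representative hrefs (illustrative inputs, not part of the original listing):

# Illustrative checks (assumed example inputs)
print(validate("/AbsoluteDeviation.html"))   # True  - short end-node link
print(validate("about/"))                    # False - in the Ignore list
print(validate("/topics/Statistics.html"))   # False - contains 'topics'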

Request with retries and custom headers

def get_request_with_retries(url, retries=3, timeout=5):
    headers = {
        'User-Agent': random.choice([
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15'
        ])
    }

    for i in range(retries):
        try:
            # Uncomment the `proxies` argument if using a proxy
            # resp = requests.get(url, timeout=timeout, proxies=proxies, headers=headers)
            resp = requests.get(url, timeout=timeout, headers=headers)
            return resp
        except requests.exceptions.RequestException as e:
            print(f"Attempt {i+1} failed for URL: {url}. Error: {e}")
            time.sleep(2 + random.uniform(0, 1.5))  # Randomized back-off before retrying
    return None

Update lists of URLs and categories

def update_lists(new_URL, new_category, parent_category, file):
    URL_parent_Category[new_URL] = new_category
    categoryLevel[new_category] = 1 + categoryLevel[parent_category]
    level = str(categoryLevel[new_category])
    file.write(f"{level}\t{new_category}\t{parent_category}\n")
    file.flush()

Crawling phase (Step 1)

def crawl(seed_URL, seed_category, file1, file2):
    global parsed, n_URLs
    URL_list.append(seed_URL)
    URL_parent_Category[seed_URL] = seed_category
    categoryLevel[seed_category] = 1

    while parsed < min(max_URLs, n_URLs):
        URL = URL_list[parsed]
        parent_category = URL_parent_Category[URL]
        level = categoryLevel[parent_category]
        time.sleep(2 + random.uniform(0, 1.5))  # Slow down crawling
        parsed += 1

        if URL in history:
            print(f"Duplicate: {URL}")
            file1.write(f"{URL}\tDuplicate\t{parent_category}\t{level}\n")
        else:
            print(f"Parsing: {parsed}/{n_URLs}: {URL}")
            resp = get_request_with_retries(URL)
            if resp:
                history[URL] = resp.status_code
            else:
                history[URL] = "Error"

            if not resp or resp.status_code != 200:
                reason = resp.reason if resp else "Timeout"
                print(f"Failed: {URL} - {reason}")
                file1.write(f"{URL}\tError:{resp.status_code if resp else 'Timeout'}\t{reason}\t{parent_category}\t{level}\n")
            else:
                file1.write(f"{URL}\tParsed\t{parent_category}\t{level}\n")
                page = resp.text.replace('\n', ' ')
                soup = BeautifulSoup(page, 'html.parser')

                # Scrape intermediate directory pages (Type-1)
                for link in soup.find_all('a', href=True):
                    href = link['href']
                    if 'topics/' in href:
                        new_URL = URL_base1 + href.split("/topics/")[1]
                        new_category = link.text.strip()
                        URL_list.append(new_URL)
                        update_lists(new_URL, new_category, parent_category, file2)
                        file1.write(f"{new_URL}\tQueued\t{new_category}\t{level+1}\n")
                        n_URLs += 1

                # Scrape final (end-node) pages (Type-2)
                for link in soup.find_all('a', href=True):
                    href = link['href']
                    if validate(href):
                        new_URL = URL_base2 + href.split("/")[1]
                        new_category = link.text.strip()
                        final_URLs[new_URL] = (new_category, parent_category, level+1)
                        update_lists(new_URL, new_category, parent_category, file2)
                        file1.write(f"{new_URL}\tEndNode\t{new_category}\t{level+1}\n")

    print(f"Crawling completed. Parsed {parsed} URLs out of {n_URLs}.")
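
Note that extract_content() below reads its work list from list_final_URLs.txt, but the listing does not show the step that writes this file from the final_URLs dictionary. A minimal sketch of such a helper, assuming the tab-separated count/URL/category layout that extract_content() parses (the name save_final_URLs is illustrative):

def save_final_URLs(filename="list_final_URLs.txt"):
    # Hypothetical helper, not in the original listing: persist final_URLs as
    # count <TAB> URL <TAB> category (plus parent and level), the layout
    # that extract_content() expects to split on.
    with open(filename, "w", encoding="utf-8") as file:
        count = 0
        for URL, (category, parent_category, level) in final_URLs.items():
            count += 1
            file.write(f"{count}\t{URL}\t{category}\t{parent_category}\t{level}\n")

If used, it would be called after crawl() and before extract_content() in the main block.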

Content extraction phase (Step 2)

def extract_content(begin, end):
    with open("list_final_URLs.txt", "r", encoding="utf-8") as file_input:
        Lines = file_input.readlines()

    with open(f"crawl_final_{begin}_{end}.txt", "w", encoding="utf-8") as file_output:
        for line in Lines:
            count, URL, category = line.split("\t")[:3]
            if int(count) >= begin and int(count) <= end:
                print(f"Page {count}: {URL}")
                resp = get_request_with_retries(URL)
                if resp and resp.status_code == 200:
                    page = resp.text.replace('\n', ' ')
                    file_output.write(f"{URL}\t{category}\t~{page}\n")
                else:
                    print(f"Error fetching {URL}: {resp.status_code if resp else 'Timeout'}")

    print(f"Content extraction from {begin} to {end} completed.")
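
Because each call writes its own crawl_final_{begin}_{end}.txt file, a larger crawl can be fetched in batches. A sketch, assuming the total number of final URLs is known (batch_size and total are placeholder values, not from the original listing):

# Illustrative batched extraction (assumes list_final_URLs.txt exists)
batch_size = 500   # placeholder batch size
total = 2000       # placeholder: actual number of final URLs
for start in range(1, total + 1, batch_size):
    extract_content(begin=start, end=min(start + batch_size - 1, total))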

Main execution

if __name__ == "__main__":
    # Open files for logging
    with open("crawl_log.txt", "w", encoding="utf-8") as file1, open("crawl_categories.txt", "w", encoding="utf-8") as file2:
        crawl(seed_URL="https://mathworld.wolfram.com/topics/ProbabilityandStatistics.html",
              seed_category="Probability and Statistics",
              file1=file1, file2=file2)

    # Extract content from final URLs (modify begin and end as needed)
    extract_content(begin=1, end=500)

    # Completion message
    print("All tasks completed successfully.")
