Combined crawler and extraction

import requests
import time
import random
from bs4 import BeautifulSoup

Global variables

URL_list = []              # queue of directory (topic) pages to crawl
URL_parent_Category = {}   # URL -> category label of that page (used as parent for links found on it)
categoryLevel = {}         # category -> depth in the topic hierarchy
history = {}               # URL -> HTTP status code (or "Error"); used to skip duplicates
final_URLs = {}            # end-node URL -> (category, parent_category, level)
parsed = 0                 # number of URLs processed so far
n_URLs = 1                 # total number of URLs discovered (starts with the seed)
max_URLs = 5000            # hard limit on the number of pages to crawl

Base URLs

URL_base1 = "https://mathworld.wolfram.com/topics/" # for directory pages (root)
URL_base2 = "https://mathworld.wolfram.com/" # for final pages

Seed URL and Category

seed_URL = "https://mathworld.wolfram.com/topics/ProbabilityandStatistics.html"
seed_category = "Probability and Statistics"
categoryLevel[seed_category] = 1 # Start category level

Proxy setup (optional; fill in credentials below and enable the commented-out proxies argument in get_request_with_retries if needed)

proxies = {
    'http': 'http://username:password@proxy_url:proxy_port',
    'https': 'https://username:password@proxy_url:proxy_port',
}

Validate function to filter unwanted links

def validate(string):
    Ignore = ['about/', 'classroom/', 'contact/', 'whatsnew/', 'letters/']
    return len(string) <= 60 and string not in Ignore and 'topics' not in string
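
As a quick sanity check, here is how this rule behaves on a few representative hrefs (illustrative inputs, not part of the original listing):

# Illustrative checks (assumed example inputs)
print(validate("/AbsoluteDeviation.html"))   # True  - short end-node link
print(validate("about/"))                    # False - in the Ignore list
print(validate("/topics/Statistics.html"))   # False - contains 'topics'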

Request with retries and custom headers

def get_request_with_retries(url, retries=3, timeout=5):
    headers = {
        'User-Agent': random.choice([
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15'
        ])
    }

    for i in range(retries):
        try:
            # Uncomment the `proxies` argument if using a proxy
            # resp = requests.get(url, timeout=timeout, proxies=proxies, headers=headers)
            resp = requests.get(url, timeout=timeout, headers=headers)
            return resp
        except requests.exceptions.RequestException as e:
            print(f"Attempt {i+1} failed for URL: {url}. Error: {e}")
            time.sleep(2 + random.uniform(0, 1.5))  # Randomized back-off before retrying
    return None

Update lists of URLs and categories

def update_lists(new_URL, new_category, parent_category, file):
    URL_parent_Category[new_URL] = new_category
    categoryLevel[new_category] = 1 + categoryLevel[parent_category]
    level = str(categoryLevel[new_category])
    file.write(f"{level}\t{new_category}\t{parent_category}\n")
    file.flush()

Crawling phase (Step 1)

def crawl(seed_URL, seed_category, file1, file2):
    global parsed, n_URLs
    URL_list.append(seed_URL)
    URL_parent_Category[seed_URL] = seed_category
    categoryLevel[seed_category] = 1

    while parsed < min(max_URLs, n_URLs):
        URL = URL_list[parsed]
        parent_category = URL_parent_Category[URL]
        level = categoryLevel[parent_category]
        time.sleep(2 + random.uniform(0, 1.5))  # Slow down crawling
        parsed += 1

        if URL in history:
            print(f"Duplicate: {URL}")
            file1.write(f"{URL}\tDuplicate\t{parent_category}\t{level}\n")
        else:
            print(f"Parsing: {parsed}/{n_URLs}: {URL}")
            resp = get_request_with_retries(URL)
            if resp:
                history[URL] = resp.status_code
            else:
                history[URL] = "Error"

            if not resp or resp.status_code != 200:
                reason = resp.reason if resp else "Timeout"
                print(f"Failed: {URL} - {reason}")
                file1.write(f"{URL}\tError:{resp.status_code if resp else 'Timeout'}\t{reason}\t{parent_category}\t{level}\n")
            else:
                file1.write(f"{URL}\tParsed\t{parent_category}\t{level}\n")
                page = resp.text.replace('\n', ' ')
                soup = BeautifulSoup(page, 'html.parser')

                # Scrape intermediate directory pages (Type-1)
                for link in soup.find_all('a', href=True):
                    href = link['href']
                    if 'topics/' in href:
                        new_URL = URL_base1 + href.split("/topics/")[1]
                        new_category = link.text.strip()
                        URL_list.append(new_URL)
                        update_lists(new_URL, new_category, parent_category, file2)
                        file1.write(f"{new_URL}\tQueued\t{new_category}\t{level+1}\n")
                        n_URLs += 1

                # Scrape final (end-node) pages (Type-2)
                for link in soup.find_all('a', href=True):
                    href = link['href']
                    if validate(href):
                        new_URL = URL_base2 + href.split("/")[1]
                        new_category = link.text.strip()
                        final_URLs[new_URL] = (new_category, parent_category, level+1)
                        update_lists(new_URL, new_category, parent_category, file2)
                        file1.write(f"{new_URL}\tEndNode\t{new_category}\t{level+1}\n")

    print(f"Crawling completed. Parsed {parsed} URLs out of {n_URLs}.")
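
Note that extract_content() below reads its work list from list_final_URLs.txt, but the listing does not show the step that writes this file from the final_URLs dictionary. A minimal sketch of such a helper, assuming the tab-separated count/URL/category layout that extract_content() parses (the name save_final_URLs is illustrative):

def save_final_URLs(filename="list_final_URLs.txt"):
    # Hypothetical helper, not in the original listing: persist final_URLs as
    # count <TAB> URL <TAB> category (plus parent and level), the layout
    # that extract_content() expects to split on.
    with open(filename, "w", encoding="utf-8") as file:
        count = 0
        for URL, (category, parent_category, level) in final_URLs.items():
            count += 1
            file.write(f"{count}\t{URL}\t{category}\t{parent_category}\t{level}\n")

If used, it would be called after crawl() and before extract_content() in the main block.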

Content extraction phase (Step 2)

def extract_content(begin, end):
    with open("list_final_URLs.txt", "r", encoding="utf-8") as file_input:
        Lines = file_input.readlines()

    with open(f"crawl_final_{begin}_{end}.txt", "w", encoding="utf-8") as file_output:
        for line in Lines:
            count, URL, category = line.split("\t")[:3]
            if int(count) >= begin and int(count) <= end:
                print(f"Page {count}: {URL}")
                resp = get_request_with_retries(URL)
                if resp and resp.status_code == 200:
                    page = resp.text.replace('\n', ' ')
                    file_output.write(f"{URL}\t{category}\t~{page}\n")
                else:
                    print(f"Error fetching {URL}: {resp.status_code if resp else 'Timeout'}")

    print(f"Content extraction from {begin} to {end} completed.")
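
Because each call writes its own crawl_final_{begin}_{end}.txt file, a larger crawl can be fetched in batches. A sketch, assuming the total number of final URLs is known (batch_size and total are placeholder values, not from the original listing):

# Illustrative batched extraction (assumes list_final_URLs.txt exists)
batch_size = 500   # placeholder batch size
total = 2000       # placeholder: actual number of final URLs
for start in range(1, total + 1, batch_size):
    extract_content(begin=start, end=min(start + batch_size - 1, total))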

Main execution

if __name__ == "__main__":
    # Open files for logging
    with open("crawl_log.txt", "w", encoding="utf-8") as file1, open("crawl_categories.txt", "w", encoding="utf-8") as file2:
        crawl(seed_URL="https://mathworld.wolfram.com/topics/ProbabilityandStatistics.html",
              seed_category="Probability and Statistics",
              file1=file1, file2=file2)

    # Extract content from final URLs (modify begin and end as needed)
    extract_content(begin=1, end=500)

    # Completion message
    print("All tasks completed successfully.")
