import requests
import time
import random
from bs4 import BeautifulSoup
# Global variables
URL_list = []
URL_parent_Category = {}
categoryLevel = {}
history = {}
final_URLs = {}
parsed = 0
n_URLs = 1
max_URLs = 5000
# Base URLs
URL_base1 = "https://mathworld.wolfram.com/topics/"  # Directory pages
URL_base2 = "https://mathworld.wolfram.com/"         # Final pages
# Seed URL and category
seed_URL = "https://mathworld.wolfram.com/topics/ProbabilityandStatistics.html"
seed_category = "Probability and Statistics"
categoryLevel[seed_category] = 1
# Validate function to filter unwanted links
def validate(string):
    ignore_list = ['about/', 'classroom/', 'contact/', 'whatsnew/', 'letters/']
    return len(string) <= 60 and all(i not in string for i in ignore_list) and 'topics' not in string
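# Illustrative behaviour of validate() (assuming the short relative hrefs MathWorld emits):
#   validate("/BayesTheorem.html")    -> True   (kept as a candidate end node)
#   validate("/about/contact.html")   -> False  (matches the ignore list)
#   validate("/topics/Algebra.html")  -> False  (directory pages are handled separately)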
# Request with retries and a random user-agent
def get_request(url, retries=3, timeout=5):
    headers = {'User-Agent': random.choice([
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/58.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 Safari/605.1.15'
    ])}
    for attempt in range(retries):
        try:
            response = requests.get(url, timeout=timeout, headers=headers)
            return response
        except requests.RequestException:
            time.sleep(2 + random.uniform(0, 1.5))  # Randomized sleep before retrying
    return None  # All retries failed
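# Minimal usage sketch (illustrative; the URL is an example page, not part of the crawl):
# a request that exhausts its retries returns None, so callers must check before
# reading .status_code or .text.
#
#   resp = get_request("https://mathworld.wolfram.com/BayesTheorem.html")
#   if resp is not None and resp.status_code == 200:
#       print(len(resp.text))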
# Update URL and category lists
def update_lists(new_URL, new_category, parent_category, file):
    URL_parent_Category[new_URL] = new_category
    categoryLevel[new_category] = categoryLevel[parent_category] + 1
    level = str(categoryLevel[new_category])
    file.write(f"{level}\t{new_category}\t{parent_category}\n")  # Tab-separated: level, category, parent
    file.flush()
# Crawling phase: breadth-first crawl, with URL_list used as a FIFO queue
# indexed by the global `parsed` counter
def crawl(seed_URL, seed_category, file1, file2):
    global parsed, n_URLs
    URL_list.append(seed_URL)
    URL_parent_Category[seed_URL] = seed_category
    while parsed < min(max_URLs, n_URLs):
        URL = URL_list[parsed]
        parent_category = URL_parent_Category[URL]
        level = categoryLevel[parent_category]
        time.sleep(2 + random.uniform(0, 1.5))
        parsed += 1
        if URL in history:
            file1.write(f"{URL}\tDuplicate\t{parent_category}\t{level}\n")
            continue
        resp = get_request(URL)
        history[URL] = resp.status_code if resp else "Error"
        if not resp or resp.status_code != 200:
            file1.write(f"{URL}\tError:{resp.status_code if resp else 'Timeout'}\t{parent_category}\t{level}\n")
            continue
        file1.write(f"{URL}\tParsed\t{parent_category}\t{level}\n")
        soup = BeautifulSoup(resp.text, 'html.parser')
        for link in soup.find_all('a', href=True):
            href = link['href']
            new_category = link.text.strip()
            if 'topics/' in href:
                # Directory page: queue it for further crawling
                new_URL = URL_base1 + href.split("/topics/")[1]
                URL_list.append(new_URL)
                update_lists(new_URL, new_category, parent_category, file2)
                file1.write(f"{new_URL}\tQueued\t{new_category}\t{level+1}\n")
                n_URLs += 1
            elif validate(href):
                # Final page: record it as an end node
                new_URL = URL_base2 + href.split("/")[1]
                final_URLs[new_URL] = (new_category, parent_category, level+1)
                update_lists(new_URL, new_category, parent_category, file2)
                file1.write(f"{new_URL}\tEndNode\t{new_category}\t{level+1}\n")
    print(f"Crawling completed. Parsed {parsed} URLs out of {n_URLs}.")
# Content extraction phase
def extract_content(begin, end):
    with open("list_final_URLs.txt", "r", encoding="utf-8") as file_input, \
         open(f"crawl_final_{begin}_{end}.txt", "w", encoding="utf-8") as file_output:
        for line in file_input:
            count, URL, category = line.split("\t")[:3]
            if begin <= int(count) <= end:
                resp = get_request(URL)
                if resp and resp.status_code == 200:
                    page = resp.text.replace('\n', ' ')
                    file_output.write(f"{URL}\t{category}\t~{page}\n")
                else:
                    print(f"Error fetching {URL}: {resp.status_code if resp else 'Timeout'}")
    print(f"Content extraction from {begin} to {end} completed.")
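# Batching sketch (the batch size of 500 is an assumption, mirroring the call in
# __main__): since extract_content() works on a numeric range, a long scrape can be
# split into resumable chunks, e.g.
#
#   for start in range(1, max_URLs + 1, 500):
#       extract_content(begin=start, end=start + 499)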
# Main execution
if __name__ == "__main__":
    with open("crawl_log.txt", "w", encoding="utf-8") as file1, \
         open("crawl_categories.txt", "w", encoding="utf-8") as file2:
        crawl(seed_URL, seed_category, file1, file2)
    # Persist the end nodes gathered during the crawl; extract_content() expects
    # a numbered, tab-separated list in list_final_URLs.txt
    with open("list_final_URLs.txt", "w", encoding="utf-8") as file3:
        for count, (URL, (category, parent_category, level)) in enumerate(final_URLs.items(), start=1):
            file3.write(f"{count}\t{URL}\t{category}\n")
    extract_content(begin=1, end=500)
    print("All tasks completed successfully.")
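# Files produced by a full run: crawl_log.txt (per-URL crawl status),
# crawl_categories.txt (level/category/parent triples), list_final_URLs.txt
# (numbered end-node URLs), and crawl_final_1_500.txt (extracted page text
# for the first batch).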