Improved CRAWLER 241112

import requests
import time
import random
from bs4 import BeautifulSoup

# Global variables
URL_list = []
URL_parent_Category = {}
categoryLevel = {}
history = {}
final_URLs = {}
parsed = 0
n_URLs = 1
max_URLs = 5000

# Base URLs
URL_base1 = "https://mathworld.wolfram.com/topics/"  # Directory pages
URL_base2 = "https://mathworld.wolfram.com/"         # Final pages

# Seed URL and category
seed_URL = "https://mathworld.wolfram.com/topics/ProbabilityandStatistics.html"
seed_category = "Probability and Statistics"
categoryLevel[seed_category] = 1

# Validate function to filter unwanted links
def validate(string):
    ignore_list = ['about/', 'classroom/', 'contact/', 'whatsnew/', 'letters/']
    return (len(string) <= 60
            and all(i not in string for i in ignore_list)
            and 'topics' not in string)

# Request with retries and random user-agent
def get_request(url, retries=3, timeout=5):
    headers = {'User-Agent': random.choice([
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/58.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 Safari/605.1.15'
    ])}
    for attempt in range(retries):
        try:
            response = requests.get(url, timeout=timeout, headers=headers)
            return response
        except requests.RequestException:
            time.sleep(2 + random.uniform(0, 1.5))  # Randomized sleep between retries
    return None

# Update URL and category lists
def update_lists(new_URL, new_category, parent_category, file):
    URL_parent_Category[new_URL] = new_category
    categoryLevel[new_category] = categoryLevel[parent_category] + 1
    level = str(categoryLevel[new_category])
    file.write(f"{level}\t{new_category}\t{parent_category}\n")
    file.flush()

# Crawling phase
def crawl(seed_URL, seed_category, file1, file2):
    global parsed, n_URLs
    URL_list.append(seed_URL)
    URL_parent_Category[seed_URL] = seed_category
    while parsed < min(max_URLs, n_URLs):
        URL = URL_list[parsed]
        parent_category = URL_parent_Category[URL]
        level = categoryLevel[parent_category]
        time.sleep(2 + random.uniform(0, 1.5))  # Slow down crawling
        parsed += 1
        if URL in history:
            file1.write(f"{URL}\tDuplicate\t{parent_category}\t{level}\n")
            continue
        resp = get_request(URL)
        history[URL] = resp.status_code if resp else "Error"
        if not resp or resp.status_code != 200:
            file1.write(f"{URL}\tError:{resp.status_code if resp else 'Timeout'}\t{parent_category}\t{level}\n")
            continue
        file1.write(f"{URL}\tParsed\t{parent_category}\t{level}\n")
        soup = BeautifulSoup(resp.text, 'html.parser')
        for link in soup.find_all('a', href=True):
            href = link['href']
            new_category = link.text.strip()
            if 'topics/' in href:
                # Intermediate directory page: queue it for crawling
                new_URL = URL_base1 + href.split("/topics/")[1]
                URL_list.append(new_URL)
                update_lists(new_URL, new_category, parent_category, file2)
                file1.write(f"{new_URL}\tQueued\t{new_category}\t{level+1}\n")
                n_URLs += 1
            elif validate(href):
                # Final page: record it as an end node
                new_URL = URL_base2 + href.split("/")[1]
                final_URLs[new_URL] = (new_category, parent_category, level+1)
                update_lists(new_URL, new_category, parent_category, file2)
                file1.write(f"{new_URL}\tEndNode\t{new_category}\t{level+1}\n")
    print(f"Crawling completed. Parsed {parsed} URLs out of {n_URLs}.")

# Content extraction phase
def extract_content(begin, end):
    with open("list_final_URLs.txt", "r", encoding="utf-8") as file_input, \
         open(f"crawl_final_{begin}_{end}.txt", "w", encoding="utf-8") as file_output:
        for line in file_input:
            count, URL, category = line.split("\t")[:3]
            if begin <= int(count) <= end:
                resp = get_request(URL)
                if resp and resp.status_code == 200:
                    page = resp.text.replace('\n', ' ')
                    file_output.write(f"{URL}\t{category}\t~{page}\n")
                else:
                    print(f"Error fetching {URL}: {resp.status_code if resp else 'Timeout'}")
    print(f"Content extraction from {begin} to {end} completed.")

# Main execution
if __name__ == "__main__":
    with open("crawl_log.txt", "w", encoding="utf-8") as file1, \
         open("crawl_categories.txt", "w", encoding="utf-8") as file2:
        crawl(seed_URL, seed_category, file1, file2)
    extract_content(begin=1, end=500)
    print("All tasks completed successfully.")
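One gap worth flagging: extract_content() reads list_final_URLs.txt, but crawl() only collects the final pages in the in-memory final_URLs dictionary and never writes that file. A minimal bridging sketch is shown below; it assumes a tab-separated layout whose first three columns (count, URL, category) match what extract_content() splits out, and the helper name write_final_URLs is hypothetical, not part of the original script.

# Hypothetical helper: dump final_URLs to the file extract_content() expects.
# Assumed columns: running count, URL, category (plus parent and level for reference).
def write_final_URLs(path="list_final_URLs.txt"):
    with open(path, "w", encoding="utf-8") as out:
        for count, (URL, (category, parent, level)) in enumerate(final_URLs.items(), start=1):
            out.write(f"{count}\t{URL}\t{category}\t{parent}\t{level}\n")

Calling write_final_URLs() after crawl() and before extract_content() would close that loop.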

Mahalo

SIGNATURE:
Clifford "RAY" Hackett www.rayis.me RESUME: www.rayis.me/resume

I founded www.adapt.org in 1980; it now has over 50 million members.
$500 of material = the world's fastest hydrofoil sailboat. http://sunrun.biz

Combined crawler and extraction

import requests
import time
import random
from bs4 import BeautifulSoup

# Global variables

URL_list = []
URL_parent_Category = {}
categoryLevel = {}
history = {}
final_URLs = {}
parsed = 0
n_URLs = 1
max_URLs = 5000

# Base URLs

URL_base1 = "https://mathworld.wolfram.com/topics/" # for directory pages (root)
URL_base2 = "https://mathworld.wolfram.com/" # for final pages

# Seed URL and Category

seed_URL = "https://mathworld.wolfram.com/topics/ProbabilityandStatistics.html"
seed_category = "Probability and Statistics"
categoryLevel[seed_category] = 1 # Start category level

# Proxy setup (optional, uncomment and modify if needed)
# proxies = {
#     'http': 'http://username:password@proxy_url:proxy_port',
#     'https': 'https://username:password@proxy_url:proxy_port',
# }

# Validate function to filter unwanted links
def validate(string):
    Ignore = ['about/', 'classroom/', 'contact/', 'whatsnew/', 'letters/']
    return (len(string) <= 60
            and all(i not in string for i in Ignore)
            and 'topics' not in string)
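A few illustrative calls (not in the original post) may help show the intent of the filter; the href values below are made up for demonstration:

print(validate("/BayesTheorem.html"))       # True: short link to a final page
print(validate("/about/contactform.html"))  # False: contains 'about/'
print(validate("/topics/Statistics.html"))  # False: directory page, handled separately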

# Request with retries and custom headers
def get_request_with_retries(url, retries=3, timeout=5):
    headers = {
        'User-Agent': random.choice([
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15'
        ])
    }

    for i in range(retries):
        try:
            # Uncomment the `proxies` argument if using a proxy
            # resp = requests.get(url, timeout=timeout, proxies=proxies, headers=headers)
            resp = requests.get(url, timeout=timeout, headers=headers)
            return resp
        except requests.exceptions.RequestException as e:
            print(f"Attempt {i+1} failed for URL: {url}. Error: {e}")
            time.sleep(2 + random.uniform(0, 1.5))  # Randomized sleep between retries
    return None

# Update lists of URLs and categories
def update_lists(new_URL, new_category, parent_category, file):
    URL_parent_Category[new_URL] = new_category
    categoryLevel[new_category] = 1 + categoryLevel[parent_category]
    level = str(categoryLevel[new_category])
    file.write(f"{level}\t{new_category}\t{parent_category}\n")
    file.flush()

# Crawling phase (Step 1)
def crawl(seed_URL, seed_category, file1, file2):
    global parsed, n_URLs
    URL_list.append(seed_URL)
    URL_parent_Category[seed_URL] = seed_category
    categoryLevel[seed_category] = 1

    while parsed < min(max_URLs, n_URLs):
        URL = URL_list[parsed]
        parent_category = URL_parent_Category[URL]
        level = categoryLevel[parent_category]
        time.sleep(2 + random.uniform(0, 1.5))  # Slow down crawling
        parsed += 1
        if URL in history:
            print(f"Duplicate: {URL}")
            file1.write(f"{URL}\tDuplicate\t{parent_category}\t{level}\n")
        else:
            print(f"Parsing: {parsed}/{n_URLs}: {URL}")
            resp = get_request_with_retries(URL)
            if resp:
                history[URL] = resp.status_code
            else:
                history[URL] = "Error"
            if not resp or resp.status_code != 200:
                reason = resp.reason if resp else "Timeout"
                print(f"Failed: {URL} - {reason}")
                file1.write(f"{URL}\tError:{resp.status_code if resp else 'Timeout'}\t{reason}\t{parent_category}\t{level}\n")
            else:
                file1.write(f"{URL}\tParsed\t{parent_category}\t{level}\n")
                page = resp.text.replace('\n', ' ')
                soup = BeautifulSoup(page, 'html.parser')
                # Scrape intermediate directories (Type-1)
                for link in soup.find_all('a', href=True):
                    href = link['href']
                    if 'topics/' in href:
                        new_URL = URL_base1 + href.split("/topics/")[1]
                        new_category = link.text.strip()
                        URL_list.append(new_URL)
                        update_lists(new_URL, new_category, parent_category, file2)
                        file1.write(f"{new_URL}\tQueued\t{new_category}\t{level+1}\n")
                        n_URLs += 1
                # Scrape final pages (Type-2)
                for link in soup.find_all('a', href=True):
                    href = link['href']
                    if validate(href):
                        new_URL = URL_base2 + href.split("/")[1]
                        new_category = link.text.strip()
                        final_URLs[new_URL] = (new_category, parent_category, level+1)
                        update_lists(new_URL, new_category, parent_category, file2)
                        file1.write(f"{new_URL}\tEndNode\t{new_category}\t{level+1}\n")

    print(f"Crawling completed. Parsed {parsed} URLs out of {n_URLs}.")

# Content extraction phase (Step 2)
def extract_content(begin, end):
    with open("list_final_URLs.txt", "r", encoding="utf-8") as file_input:
        Lines = file_input.readlines()

    with open(f"crawl_final_{begin}_{end}.txt", "w", encoding="utf-8") as file_output:
        for line in Lines:
            count, URL, category = line.split("\t")[:3]
            if begin <= int(count) <= end:
                print(f"Page {count}: {URL}")
                resp = get_request_with_retries(URL)
                if resp and resp.status_code == 200:
                    page = resp.text.replace('\n', ' ')
                    file_output.write(f"{URL}\t{category}\t~{page}\n")
                else:
                    print(f"Error fetching {URL}: {resp.status_code if resp else 'Timeout'}")

    print(f"Content extraction from {begin} to {end} completed.")

# Main execution
if __name__ == "__main__":
    # Open files for logging
    with open("crawl_log.txt", "w", encoding="utf-8") as file1, \
         open("crawl_categories.txt", "w", encoding="utf-8") as file2:
        crawl(seed_URL="https://mathworld.wolfram.com/topics/ProbabilityandStatistics.html",
              seed_category="Probability and Statistics",
              file1=file1, file2=file2)

    # Extract content from final URLs (modify begin and end as needed)
    extract_content(begin=1, end=500)

    # Completion message
    print("All tasks completed successfully.")
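As a quick post-run sanity check, the crawl_categories.txt log written by update_lists() (one tab-separated line of level, category, and parent per discovered link) can be summarized by depth. This snippet is illustrative and not part of the original script:

from collections import Counter

# Count how many category entries were logged at each depth level
# (crawl_categories.txt columns: level, category, parent)
with open("crawl_categories.txt", encoding="utf-8") as f:
    levels = Counter(line.split("\t")[0] for line in f if line.strip())

for level in sorted(levels, key=int):
    print(f"Level {level}: {levels[level]} entries")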



Seasteading fixes the world's 8 worst problems:

  1. Poor (enrich): fishing and algae farms
  2. Air (clean): no pesticides
  3. Water (clean): fewer chemicals
  4. Nature (balance): less pollution
  5. Sick (heal): less pollution
  6. Hungry (feed): seafood farms
  7. Operate (power): sustainable
  8. Peace (end war): no land disputes


Philippines marriage laws

Philippine laws relating to marital status follow Filipinos wherever they may go. Thus, as a rule, a married Filipino remains married even if a divorce is obtained abroad, because divorce is generally not recognized in the Philippines. Luckily, there is a limited exception for the recognition of a foreign divorce decree, which would allow a divorced Filipino to remarry.

Under the second paragraph of Article 26, Family Code of the Philippines, if a validly celebrated marriage between a Filipino and a foreigner is dissolved by a foreign divorce decree capacitating the foreign spouse to remarry, the Filipino spouse can also remarry. In other words, for the divorce to be recognized in the Philippines, the following conditions must exist: (1) the marriage was between a Filipino and a foreigner; (2) the marriage was dissolved by a foreign divorce decree; and (3) the divorce was obtained by the non-Filipino spouse.


Letter to EEOC about United Airlines

EEOC HQ: 131 M Street NE, Washington, DC 20507, 202-663-4900

Clifford "RAY" Hackett 440 Kapiolani, Hilo,HI,96720

3659745 6717872345

Events: Blocked from United Airlines employment

I registered in person along with others, who were hired. Comments made by United personnel indicated I would not be hired for the following reasons:

Why: race (told haoles are not welcome); color (white, told I look like an evil ghost); religion (told Christianity is evil); sex (told men are bad); age (told I am too old); disability (told blind/deaf people are problems); genetics (told my ancestor bombed their country).

Injury suffered: financial losses and physical injuries caused by this situation. A United Airlines employee shoved me as I stepped on a slippery spot and I fell. I am the only job fair attendee not hired with the $25,000 bonus.



Passport, denial letter

United States Department of State

Honolulu Passport Agency

4432 Mercure Circle

PO Box 1076

Sterling, Virginia 20166-1076

February 13, 2023

Clifford Ray Hackett

General Delivery G.M.F.

Barrigada, GU 96913

RE: 122500184

Dear Mr. Hackett:

Thank you for your recent passport application. Unfortunately, we are unable to finish processing your application because you owe a balance on an existing repatriation loan advanced to you by the U.S. government to pay for your return and/or the return of your immediate family members to this country or a country of safe haven.

To arrange for the repayment of your loan and to determine your exact account balance and payment options, you must contact the U.S. Department of the Treasury at:

U.S. Department of the Treasury Bureau of the Fiscal Service

P.O. Box 830794

Birmingham, AL 35283

Telephone #: 1-888-826-3127

If payment arrangements are not made within ninety (90) days from the date of this letter, your passport application will be denied, and your citizenship evidence will be returned to you.

Section 51.60(a)(1) of Title 22 of the Code of Federal Regulations reads as follows:

51.60 – Denial and restriction of passports

(a) The Department may not issue a passport, except a passport for direct return to the United States, in any case in which the Department determines or is informed by a competent authority that:

(1) The applicant is in default on a loan received from the United States under 22 U.S.C. 2671(b)(2)(B) for the repatriation of the applicant, and, where applicable, the applicant's spouse, minor child(ren), and/or other immediate family members, from a foreign country (see 22 U.S.C. 2671(d)).

In addition, you are ineligible to receive passport services because the Department of Health and Human Services (HHS) certified that you owe child support.

Section 51.60(a)(2) of Title 22 of the Code of Federal Regulations reads as follows:

51.60 – Denial and restriction of passports

(a) The Department may not issue a passport, except a passport for direct return to the United States, in any case in which the Department determines or is informed by a competent authority that:

(2) The applicant has been certified by the Secretary of Health and Human Services as notified by a state agency under 42 U.S.C. 652(k) to be in arrears of child support in an amount determined by the statute.

Neither this passport agency nor the Department of State has information concerning your child support obligation.

A list of state child support enforcement agencies and their contact information can be found on-line at http://www.acf.hhs.gov/programs/css/resource/state-and-tribal-child-support-agency-contacts

You must contact and make appropriate arrangements with


Solar NowNow