import requests
from bs4 import BeautifulSoup
import json
import pandas as pd

class WebCrawler:
    """Simple crawler that fetches pages, parses HTML, calls JSON APIs, and stores results."""

    def __init__(self, base_url):
        self.base_url = base_url

    def fetch_page(self, url):
        try:
            response = requests.get(url)
            response.raise_for_status()  # Raise HTTPError for bad responses
            return response.text
        except requests.exceptions.RequestException as e:
            print(f"Error fetching {url}: {e}")
            return None
    def parse_html(self, html_content, tag, attributes=None):
        # Default to None to avoid a shared mutable default argument
        soup = BeautifulSoup(html_content, 'html.parser')
        return soup.find_all(tag, attrs=attributes or {})
    def scrape(self, page_path='/'):
        url = f"{self.base_url}{page_path}"
        html_content = self.fetch_page(url)
        if not html_content:
            return []
        # Example: extract the href of every link on the page
        links = self.parse_html(html_content, 'a', {'href': True})
        return [link['href'] for link in links if 'href' in link.attrs]
    def fetch_api(self, endpoint):
        try:
            response = requests.get(f"{self.base_url}{endpoint}")
            response.raise_for_status()
            return response.json()
        except requests.exceptions.RequestException as e:
            print(f"Error fetching API {endpoint}: {e}")
            return {}
    def store_data_csv(self, data, filename):
        df = pd.DataFrame(data)
        df.to_csv(filename, index=False)
        print(f"Data saved to {filename}")

    def store_data_json(self, data, filename):
        with open(filename, 'w') as f:
            json.dump(data, f, indent=4)
        print(f"Data saved to {filename}")


if __name__ == "__main__":
    # Example usage
    crawler = WebCrawler("https://example.com")

    # Scrape links from a webpage
    links = crawler.scrape("/")
    print("Scraped Links:", links)

    # Fetch data from an API endpoint
    api_data = crawler.fetch_api("/api/data")
    print("API Data:", api_data)

    # Store scraped links in a CSV file
    crawler.store_data_csv([{"link": link} for link in links], "links.csv")

    # Store API data in a JSON file
    crawler.store_data_json(api_data, "api_data.json")