Fixed voting bug
99
scrape/scrape.py
Normal file
@@ -0,0 +1,99 @@
import requests
from bs4 import BeautifulSoup
import os
import logging
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

# Set up logging
logging.basicConfig(filename='scrape_bashforever.log', level=logging.DEBUG,
                    format='%(asctime)s - %(levelname)s - %(message)s')

# File path for the quotes
file_path = 'bashforever_quotes.txt'

# Create the file if it doesn't exist
if not os.path.exists(file_path):
    with open(file_path, 'w') as file:
        file.write('')

# Function to scrape a single quote page from bashforever.com
def scrape_quote(quote_id):
    url = f"https://bashforever.com/?{quote_id}"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    logging.info(f"Fetching Quote #{quote_id} from {url}...")

    try:
        response = requests.get(url, headers=headers)
    except Exception as e:
        logging.error(f"Request failed for Quote #{quote_id}: {e}")
        return False

    # Check if the request was successful
    if response.status_code == 200:
        logging.debug(f"Successfully fetched content for Quote #{quote_id}.")
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract the quote text, vote count, and quote year
        try:
            quote_element = soup.find('div', class_='quotes')
            if quote_element:
                quote_id_tag = quote_element.find('a')
                vote_pos_tag = quote_element.find('span', class_='pos')  # Positive votes
                vote_neg_tag = quote_element.find('span', class_='neg')  # Negative votes

                # Determine the vote count based on which tag is available
                if vote_neg_tag:
                    vote_count = vote_neg_tag.text.strip()
                elif vote_pos_tag:
                    vote_count = vote_pos_tag.text.strip()
                else:
                    vote_count = "Unknown Votes"

                quote_text_tag = quote_element.find('div').next_sibling
                quote_year_tag = soup.find('div', class_='quoteYear')

                # Handle missing elements gracefully
                quote_id_text = quote_id_tag.text.strip() if quote_id_tag else "Unknown ID"
                quote_text = quote_text_tag.strip() if quote_text_tag else "No text available"
                quote_year = quote_year_tag.text.strip() if quote_year_tag else "Unknown Year"

                logging.info(f"Quote #{quote_id} - Text: {quote_text}, Votes: {vote_count}, Year: {quote_year}")

                # Write to the file
                with open(file_path, 'a') as file:
                    file.write(f"Quote #{quote_id}: {quote_text} (Votes: {vote_count}) (Year: {quote_year})\n")
                return True
            else:
                logging.warning(f"Quote #{quote_id} - Failed to find 'quotes' element.")
                return False
        except Exception as e:
            logging.warning(f"Quote #{quote_id} - Failed to extract content: {e}")
            return False
    elif response.status_code == 403:
        logging.warning(f"Quote #{quote_id} - Access forbidden (403).")
        return False
    else:
        logging.warning(f"Quote #{quote_id} - Page not found (status code: {response.status_code}).")
        return False

# Function to scrape a range of quotes concurrently
def scrape_all_quotes_concurrently(start_id, end_id, max_workers=10):
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(scrape_quote, quote_id): quote_id for quote_id in range(start_id, end_id + 1)}
        for future in as_completed(futures):
            quote_id = futures[future]
            try:
                success = future.result()
                if success:
                    logging.info(f"Successfully scraped Quote #{quote_id}.")
                else:
                    logging.warning(f"Failed to scrape Quote #{quote_id}.")
            except Exception as e:
                logging.error(f"Error scraping Quote #{quote_id}: {e}")

# Scrape quotes from 0 to 966506 using concurrent threads
scrape_all_quotes_concurrently(0, 966506, max_workers=20)
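
Review note: scrape_quote appends to bashforever_quotes.txt from up to 20 worker threads at once, and requests.get is called without a timeout, so concurrent appends are not guaranteed to stay intact on every platform and a stalled connection can tie up a worker indefinitely. A minimal sketch of one way to harden this, assuming a module-level write_lock and helper names introduced here (they are not in the diff):

    import threading
    import requests

    write_lock = threading.Lock()  # one writer at a time across worker threads

    def append_line(path, line):
        # Serializing the append keeps quotes from different threads from interleaving mid-line.
        with write_lock:
            with open(path, 'a') as file:
                file.write(line)

    def fetch(url, headers):
        # A timeout turns a stalled connection into an exception scrape_quote already handles.
        return requests.get(url, headers=headers, timeout=10)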
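
Also worth flagging: scrape_all_quotes_concurrently(0, 966506, max_workers=20) runs at import time, so the module cannot be imported to exercise the vote extraction on a single quote. A sketch of a main guard, plus a hypothetical one-quote smoke test (the id 4242 is arbitrary):

    if __name__ == "__main__":
        # Only kick off the full crawl when the script is executed directly.
        scrape_all_quotes_concurrently(0, 966506, max_workers=20)

    # elsewhere, e.g. in a REPL:
    #   from scrape import scrape_quote
    #   scrape_quote(4242)  # then check the Votes field in bashforever_quotes.txt or scrape_bashforever.log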