Fixed voting bug
99
scrape/scrape.py
Normal file
@@ -0,0 +1,99 @@
import requests
from bs4 import BeautifulSoup
import os
import logging
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

# Set up logging
logging.basicConfig(filename='scrape_bashforever.log', level=logging.DEBUG,
                    format='%(asctime)s - %(levelname)s - %(message)s')

# File path for the quotes
file_path = 'bashforever_quotes.txt'

# Create the file if it doesn't exist
if not os.path.exists(file_path):
    with open(file_path, 'w') as file:
        file.write('')

# Function to scrape a single quote page from bashforever.com
def scrape_quote(quote_id):
    url = f"https://bashforever.com/?{quote_id}"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    logging.info(f"Fetching Quote #{quote_id} from {url}...")

    try:
        response = requests.get(url, headers=headers)
    except Exception as e:
        logging.error(f"Request failed for Quote #{quote_id}: {e}")
        return False

    # Check if the request was successful
    if response.status_code == 200:
        logging.debug(f"Successfully fetched content for Quote #{quote_id}.")
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract the quote text, vote count, and quote year
        try:
            quote_element = soup.find('div', class_='quotes')
            if quote_element:
                quote_id_tag = quote_element.find('a')
                vote_pos_tag = quote_element.find('span', class_='pos')  # Positive votes
                vote_neg_tag = quote_element.find('span', class_='neg')  # Negative votes

                # Determine the vote count based on which tag is available
                if vote_neg_tag:
                    vote_count = vote_neg_tag.text.strip()
                elif vote_pos_tag:
                    vote_count = vote_pos_tag.text.strip()
                else:
                    vote_count = "Unknown Votes"

                quote_text_tag = quote_element.find('div').next_sibling
                quote_year_tag = soup.find('div', class_='quoteYear')

                # Handle missing elements gracefully
                quote_id_text = quote_id_tag.text.strip() if quote_id_tag else "Unknown ID"
                quote_text = quote_text_tag.strip() if quote_text_tag else "No text available"
                quote_year = quote_year_tag.text.strip() if quote_year_tag else "Unknown Year"

                logging.info(f"Quote #{quote_id} - Text: {quote_text}, Votes: {vote_count}, Year: {quote_year}")

                # Write to the file
                with open(file_path, 'a') as file:
                    file.write(f"Quote #{quote_id}: {quote_text} (Votes: {vote_count}) (Year: {quote_year})\n")
                return True
            else:
                logging.warning(f"Quote #{quote_id} - Failed to find 'quotes' element.")
                return False
        except Exception as e:
            logging.warning(f"Quote #{quote_id} - Failed to extract content: {e}")
            return False
    elif response.status_code == 403:
        logging.warning(f"Quote #{quote_id} - Access forbidden (403).")
        return False
    else:
        logging.warning(f"Quote #{quote_id} - Page not found (status code: {response.status_code}).")
        return False

# Function to scrape a range of quotes concurrently
def scrape_all_quotes_concurrently(start_id, end_id, max_workers=10):
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(scrape_quote, quote_id): quote_id for quote_id in range(start_id, end_id + 1)}
        for future in as_completed(futures):
            quote_id = futures[future]
            try:
                success = future.result()
                if success:
                    logging.info(f"Successfully scraped Quote #{quote_id}.")
                else:
                    logging.warning(f"Failed to scrape Quote #{quote_id}.")
            except Exception as e:
                logging.error(f"Error scraping Quote #{quote_id}: {e}")

# Scrape quotes from 0 to 966506 using concurrent threads
scrape_all_quotes_concurrently(0, 966506, max_workers=20)
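
Review note: scrape_quote appends to bashforever_quotes.txt from up to 20 worker threads at once, and requests.get is called without a timeout, so concurrent appends are not guaranteed to stay intact on every platform and a stalled connection can tie up a worker indefinitely. A minimal sketch of one way to harden this, assuming a module-level write_lock and helper names introduced here (they are not in the diff):

    import threading
    import requests

    write_lock = threading.Lock()  # one writer at a time across worker threads

    def append_line(path, line):
        # Serializing the append keeps quotes from different threads from interleaving mid-line.
        with write_lock:
            with open(path, 'a') as file:
                file.write(line)

    def fetch(url, headers):
        # A timeout turns a stalled connection into an exception scrape_quote already handles.
        return requests.get(url, headers=headers, timeout=10)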
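
Also worth flagging: scrape_all_quotes_concurrently(0, 966506, max_workers=20) runs at import time, so the module cannot be imported to exercise the vote extraction on a single quote. A sketch of a main guard, plus a hypothetical one-quote smoke test (the id 4242 is arbitrary):

    if __name__ == "__main__":
        # Only kick off the full crawl when the script is executed directly.
        scrape_all_quotes_concurrently(0, 966506, max_workers=20)

    # elsewhere, e.g. in a REPL:
    #   from scrape import scrape_quote
    #   scrape_quote(4242)  # then check the Votes field in bashforever_quotes.txt or scrape_bashforever.log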