diff --git a/Kleinanzeigen-Preisabfrage-main/app.py b/Kleinanzeigen-Preisabfrage-main/app.py
index 1517e4a..4418f95 100644
--- a/Kleinanzeigen-Preisabfrage-main/app.py
+++ b/Kleinanzeigen-Preisabfrage-main/app.py
@@ -1,78 +1,121 @@
-from flask import Flask, request, render_template, jsonify, send_file, session
-import requests
-from bs4 import BeautifulSoup
-import pandas as pd
-from concurrent.futures import ThreadPoolExecutor
-import os
-import json
-import time
-from flask_session import Session
-import random
-from threading import Thread, Event
+from flask import Flask, request, render_template, jsonify, send_file, session
+import requests
+from bs4 import BeautifulSoup
+import pandas as pd
+from concurrent.futures import ThreadPoolExecutor
+import os
+import time
+from flask_session import Session
+import random
+from threading import Thread, Event, Lock
+import secrets
+from werkzeug.utils import secure_filename
 
-app = Flask(__name__)
-app.config['SECRET_KEY'] = 'supersecretkey'
-app.config['SESSION_TYPE'] = 'filesystem'
-Session(app)
+app = Flask(__name__)
+# Falls back to a random per-process key, so sessions reset on every restart.
+app.config['SECRET_KEY'] = os.environ.get('SECRET_KEY', secrets.token_hex(32))
+app.config['SESSION_TYPE'] = 'filesystem'
+Session(app)
+
+runningproxies = []
+runningproxies_lock = Lock()
+update_event = Event()
+updater_thread = None
+
+# Server-side search progress, keyed by session cookie and shared across
+# request threads. Flask-Session only persists `session` when a response is
+# sent, so /progress polls could never see intra-request session updates.
+progress_data = {}
+progress_lock = Lock()
+
+_proxy_cache = {"proxies": [], "expires_at": 0}
+_proxy_cache_lock = Lock()
+PROXY_CACHE_TTL = 600
 
-progress_data = {}
-runningproxies = []
-update_event = Event()
+def load_proxies():
+    current_time = time.time()
+    with _proxy_cache_lock:
+        if _proxy_cache["proxies"] and current_time < _proxy_cache["expires_at"]:
+            return list(_proxy_cache["proxies"])
+
+    urls = {
+        "http": "https://raw.githubusercontent.com/TheSpeedX/SOCKS-List/master/http.txt",
+        "socks4": "https://raw.githubusercontent.com/TheSpeedX/SOCKS-List/master/socks4.txt",
+        "socks5": "https://raw.githubusercontent.com/TheSpeedX/SOCKS-List/master/socks5.txt"
+    }
+    proxies = []
+    for proxy_type, url in urls.items():
+        try:
+            response = requests.get(url, timeout=5)
+            response.raise_for_status()
+        except requests.RequestException:
+            continue
+        proxies.extend(
+            [f"{proxy_type}://{line.strip()}" for line in response.text.splitlines() if line.strip()]
+        )
+
+    with _proxy_cache_lock:
+        _proxy_cache["proxies"] = proxies
+        _proxy_cache["expires_at"] = current_time + PROXY_CACHE_TTL
+
+    return proxies
 
-def load_proxies():
-    urls = {
-        "http": "https://raw.githubusercontent.com/TheSpeedX/SOCKS-List/master/http.txt",
-        "socks4": "https://raw.githubusercontent.com/TheSpeedX/SOCKS-List/master/socks4.txt",
-        "socks5": "https://raw.githubusercontent.com/TheSpeedX/SOCKS-List/master/socks5.txt"
-    }
-    proxies = []
-    for proxy_type, url in urls.items():
-        response = requests.get(url)
-        if response.status_code == 200:
-            proxies.extend([f"{proxy_type}://{line.strip()}" for line in response.text.splitlines() if line.strip()])
-    # print(f"Loaded proxies: {proxies}")  # Debugging-Ausgabe
-    return proxies
 
+def check_proxy(proxy):
+    # requests selects a proxy by the *target* URL's scheme, so both keys must
+    # point at the proxy URL; the proxy's own scheme (http/socks4/socks5) is
+    # carried inside that URL. SOCKS entries need requests[socks] (PySocks).
+    try:
+        response = requests.get(
+            "https://www.google.com",
+            proxies={"http": proxy, "https": proxy},
+            timeout=5,
+        )
+        return response.status_code == 200
+    except requests.RequestException:
+        return False
 
-def check_proxy(proxy):
-    try:
-        proxy_type = proxy.split("://")[0]
-        response = requests.get("https://www.google.com", proxies={proxy_type: proxy}, timeout=1)
-        return response.status_code == 200
-    except:
-        return False
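+# Proxy pool upkeep: a background thread re-validates the scraped proxy list,
+# keeps at most 20 working entries, and swaps them in atomically under the
+# lock. It refreshes every 5 minutes and exits as soon as update_event is set.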
+def update_running_proxies(proxies):
+    global runningproxies
+    while not update_event.is_set():
+        new_proxies = []
+        for proxy in proxies:
+            # Stop early on shutdown or once enough proxies are verified.
+            if update_event.is_set() or len(new_proxies) >= 20:
+                break
+            if check_proxy(proxy):
+                new_proxies.append(proxy)
+        with runningproxies_lock:
+            runningproxies = new_proxies
+        # Wait 5 minutes between refreshes; wait() returns True when the
+        # event is set, i.e. when a shutdown was requested.
+        if update_event.wait(300):
+            break
+
+
+def start_proxy_updater(proxies):
+    global updater_thread
+    stop_proxy_updater()
+    update_event.clear()
+    updater_thread = Thread(target=update_running_proxies, args=(proxies,))
+    updater_thread.daemon = True
+    updater_thread.start()
+
+
+def stop_proxy_updater():
+    global updater_thread
+    update_event.set()
+    if updater_thread and updater_thread.is_alive():
+        updater_thread.join(timeout=1)
+    updater_thread = None
 
-def update_running_proxies(proxies):
-    global runningproxies
-    while not update_event.is_set():
-        new_proxies = []
-        for proxy in proxies:
-            if len(new_proxies) >= 20:
-                break
-            if check_proxy(proxy):
-                new_proxies.append(proxy)
-        runningproxies = new_proxies
-        # print(f"Updated running proxies: {runningproxies}")  # Debugging-Ausgabe
-        time.sleep(300)  # Warte 5 Minuten
 
+def get_random_proxy(timeout=10):
+    deadline = time.time() + timeout
+    while time.time() < deadline:
+        with runningproxies_lock:
+            available = list(runningproxies)
+        if available:
+            return random.choice(available)
+        # Re-check once per second; bail out early on shutdown.
+        if update_event.wait(1):
+            break
+    raise RuntimeError("No running proxies available")
 
-def start_proxy_updater(proxies):
-    updater_thread = Thread(target=update_running_proxies, args=(proxies,))
-    updater_thread.daemon = True
-    updater_thread.start()
-
-def get_random_proxy():
-    while True:
-        if runningproxies:
-            proxy = random.choice(runningproxies)
-            print(f"Selected proxy: {proxy}")  # Debugging-Ausgabe
-            return proxy
-        else:
-            print("No running proxies available.")  # Debugging-Ausgabe
-            time.sleep(1)  # Warte kurz, bevor erneut versucht wird
-
-def get_total_pages(query):
-    url = f"https://www.kleinanzeigen.de/s-suchanfrage.html?keywords={query}&page=1"
-    response = requests.get(url)
-    soup = BeautifulSoup(response.text, 'html.parser')
+def get_total_pages(query):
+    # Let requests URL-encode the keywords instead of interpolating them raw.
+    url = "https://www.kleinanzeigen.de/s-suchanfrage.html"
+    try:
+        response = requests.get(url, params={"keywords": query, "page": 1}, timeout=10)
+        response.raise_for_status()
+    except requests.RequestException:
+        return 1
+
+    soup = BeautifulSoup(response.text, 'html.parser')
     pagination = soup.find('div', class_='pagination')
     if pagination:
         pages = pagination.find_all('a')
@@ -81,21 +124,37 @@ def get_total_pages(query):
         total_pages = 1
     return total_pages
 
-def fetch_page(query, page, unwanted_words, minprice, maxprice, unwanted_prices, proxies):
-    while True:
-        proxy = get_random_proxy()
-        try:
-            proxy_type = proxy.split("://")[0]
-            url = f"https://www.kleinanzeigen.de/s-suchanfrage.html?keywords={query}&page={page}"
-            response = requests.get(url, proxies={proxy_type: proxy})
-            response.raise_for_status()
-            print(f"Successfully fetched page {page} using proxy {proxy}.")  # Debugging-Ausgabe
-            break
-        except requests.exceptions.RequestException as e:
-            print(f"Failed to fetch page {page} using proxy {proxy}. Error: {e}")  # Debugging-Ausgabe
-            continue
-
-    soup = BeautifulSoup(response.text, 'html.parser')
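+# fetch_page retries up to five times, rotating through verified proxies while
+# they are available and falling back to a direct request once the pool runs
+# dry. A page that still fails returns [] so one bad page cannot sink the
+# whole search.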
Error: {e}") # Debugging-Ausgabe - continue - - soup = BeautifulSoup(response.text, 'html.parser') +def fetch_page(query, page, unwanted_words, minprice, maxprice, unwanted_prices, use_proxies): + url = f"https://www.kleinanzeigen.de/s-suchanfrage.html?keywords={query}&page={page}" + attempts = 0 + max_attempts = 5 + last_error = None + + while attempts < max_attempts: + proxies = None + if use_proxies: + try: + proxy = get_random_proxy() + proxy_type = proxy.split("://")[0] + proxies = {proxy_type: proxy} + except RuntimeError as exc: + use_proxies = False + last_error = exc + continue + + try: + response = requests.get(url, proxies=proxies, timeout=10) + response.raise_for_status() + break + except requests.RequestException as exc: + last_error = exc + attempts += 1 + time.sleep(1) + else: + print(f"Failed to fetch page {page}: {last_error}") + return [] + + soup = BeautifulSoup(response.text, 'html.parser') items = [] for item in soup.find_all('li', class_='ad-listitem'): @@ -123,13 +182,31 @@ def search_item(query, unwanted_words, minprice, maxprice, maxpages): items = [] total_pages = get_total_pages(query) pages_to_search = min(total_pages, maxpages) - proxies = load_proxies() - start_proxy_updater(proxies) - - with ThreadPoolExecutor(max_workers=10) as executor: - futures = [executor.submit(fetch_page, query, page, unwanted_words, minprice, maxprice, ["Zu verschenken"], proxies) for page in range(1, pages_to_search + 1)] - for future in futures: - items.extend(future.result()) + proxies = load_proxies() + use_proxies = bool(proxies) + if use_proxies: + start_proxy_updater(proxies) + + try: + with ThreadPoolExecutor(max_workers=10) as executor: + futures = [ + executor.submit( + fetch_page, + query, + page, + unwanted_words, + minprice, + maxprice, + ["Zu verschenken"], + use_proxies, + ) + for page in range(1, pages_to_search + 1) + ] + for future in futures: + items.extend(future.result()) + finally: + if use_proxies: + stop_proxy_updater() # Ergebnisse nach Preis sortieren items.sort(key=lambda x: x[3]) @@ -148,42 +225,50 @@ def search_item(query, unwanted_words, minprice, maxprice, maxpages): def index(): return render_template('index.html') -@app.route('/search', methods=['POST']) -def search(): - queries = request.form.get('query').split('\n') - unwanted_words = request.form.get('unwanted_words').split('\n') - minprice = float(request.form.get('minprice') or 0) - maxprice = float(request.form.get('maxprice') or float('inf')) - maxpages = int(request.form.get('maxpages') or 0) - filename = request.form.get('filename') or 'kleinanzeigen_results' - - session_id = request.cookies.get('session') - if session_id not in progress_data: - progress_data[session_id] = {'current_item': 0, 'total_items': 0} - - all_items = [] - progress_data[session_id]['total_items'] = len(queries) - - for i, query in enumerate(queries): - query = query.strip() - if query: - items = search_item(query, unwanted_words, minprice, maxprice, maxpages) - all_items.extend(items) - progress_data[session_id]['current_item'] = i + 1 - time.sleep(0.1) # Füge eine kurze Verzögerung hinzu, um die Fortschrittsanzeige zu aktualisieren - - df = pd.DataFrame(all_items, columns=['Artikel', 'Preis', 'Link', 'PriceValue']) - df = df.drop(columns=['PriceValue']) - filepath = f'uploads/{filename}.xlsx' - df.to_excel(filepath, index=False) - - session['filepath'] = filepath - return jsonify(all_items) - -@app.route('/progress') -def progress(): - session_id = request.cookies.get('session') - return 
+@app.route('/search', methods=['POST'])
+def search():
+    queries = request.form.get('query', '').split('\n')
+    # Drop empty filter lines; an empty string is a substring of every title.
+    unwanted_words = [w for w in request.form.get('unwanted_words', '').split('\n') if w.strip()]
+    minprice = float(request.form.get('minprice') or 0)
+    maxprice = float(request.form.get('maxprice') or float('inf'))
+    maxpages = int(request.form.get('maxpages') or 0)
+    filename = request.form.get('filename') or 'kleinanzeigen_results'
+    filename = secure_filename(filename)
+    if not filename:
+        filename = 'kleinanzeigen_results'
+    if not filename.lower().endswith('.xlsx'):
+        filename = f"{filename}.xlsx"
+
+    session_id = request.cookies.get('session')
+    all_items = []
+    total_items = len([q for q in queries if q.strip()])
+    with progress_lock:
+        progress_data[session_id] = {'current_item': 0, 'total_items': total_items}
+
+    processed_queries = 0
+    for query in queries:
+        query = query.strip()
+        if query:
+            items = search_item(query, unwanted_words, minprice, maxprice, maxpages)
+            all_items.extend(items)
+            processed_queries += 1
+            with progress_lock:
+                progress_data[session_id]['current_item'] = processed_queries
+            time.sleep(0.1)  # Brief pause so the progress display can update
+
+    df = pd.DataFrame(all_items, columns=['Artikel', 'Preis', 'Link', 'PriceValue'])
+    df = df.drop(columns=['PriceValue'])
+    os.makedirs('uploads', exist_ok=True)
+    filepath = os.path.join('uploads', filename)
+    df.to_excel(filepath, index=False)
+
+    session['filepath'] = filepath
+    session.modified = True
+    return jsonify(all_items)
+
+@app.route('/progress')
+def progress():
+    session_id = request.cookies.get('session')
+    with progress_lock:
+        return jsonify(progress_data.get(session_id, {'current_item': 0, 'total_items': 0}))
 
@@ -192,7 +277,7 @@ def download():
     filepath = session.get('filepath')
     if filepath and os.path.exists(filepath):
         return send_file(filepath, as_attachment=True)
     return "File not found", 404
 
-if __name__ == '__main__':
-    if not os.path.exists('uploads'):
-        os.makedirs('uploads')
-    app.run(debug=True)
+if __name__ == '__main__':
+    if not os.path.exists('uploads'):
+        os.makedirs('uploads')
+    app.run()
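+# Note: app.run() starts Flask's development server only; a production
+# deployment would typically sit behind a WSGI server such as gunicorn.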