Harden proxy handling and session security

2025-11-23 20:09:29 +01:00
parent 2cf0836dd1
commit 429a003453

@@ -1,78 +1,121 @@
 from flask import Flask, request, render_template, jsonify, send_file, session
 import requests
 from bs4 import BeautifulSoup
 import pandas as pd
 from concurrent.futures import ThreadPoolExecutor
 import os
-import json
 import time
 from flask_session import Session
 import random
-from threading import Thread, Event
+from threading import Thread, Event, Lock
+import secrets
+from werkzeug.utils import secure_filename
 
 app = Flask(__name__)
-app.config['SECRET_KEY'] = 'supersecretkey'
+app.config['SECRET_KEY'] = os.environ.get('SECRET_KEY', secrets.token_hex(32))
 app.config['SESSION_TYPE'] = 'filesystem'
 Session(app)
 
-progress_data = {}
 runningproxies = []
+runningproxies_lock = Lock()
 update_event = Event()
+updater_thread = None
+_proxy_cache = {"proxies": [], "expires_at": 0}
+_proxy_cache_lock = Lock()
+PROXY_CACHE_TTL = 600
 
 def load_proxies():
+    current_time = time.time()
+    with _proxy_cache_lock:
+        if _proxy_cache["proxies"] and current_time < _proxy_cache["expires_at"]:
+            return list(_proxy_cache["proxies"])
     urls = {
         "http": "https://raw.githubusercontent.com/TheSpeedX/SOCKS-List/master/http.txt",
         "socks4": "https://raw.githubusercontent.com/TheSpeedX/SOCKS-List/master/socks4.txt",
         "socks5": "https://raw.githubusercontent.com/TheSpeedX/SOCKS-List/master/socks5.txt"
     }
     proxies = []
     for proxy_type, url in urls.items():
-        response = requests.get(url)
-        if response.status_code == 200:
-            proxies.extend([f"{proxy_type}://{line.strip()}" for line in response.text.splitlines() if line.strip()])
-    # print(f"Loaded proxies: {proxies}") # Debugging-Ausgabe
+        try:
+            response = requests.get(url, timeout=5)
+            response.raise_for_status()
+        except requests.RequestException:
+            continue
+        proxies.extend(
+            [f"{proxy_type}://{line.strip()}" for line in response.text.splitlines() if line.strip()]
+        )
+    with _proxy_cache_lock:
+        _proxy_cache["proxies"] = proxies
+        _proxy_cache["expires_at"] = current_time + PROXY_CACHE_TTL
     return proxies
 
 def check_proxy(proxy):
     try:
         proxy_type = proxy.split("://")[0]
-        response = requests.get("https://www.google.com", proxies={proxy_type: proxy}, timeout=1)
+        response = requests.get(
+            "https://www.google.com",
+            proxies={proxy_type: proxy},
+            timeout=5,
+        )
         return response.status_code == 200
-    except:
+    except requests.RequestException:
         return False
 
 def update_running_proxies(proxies):
     global runningproxies
     while not update_event.is_set():
         new_proxies = []
         for proxy in proxies:
             if len(new_proxies) >= 20:
                 break
             if check_proxy(proxy):
                 new_proxies.append(proxy)
-        runningproxies = new_proxies
-        # print(f"Updated running proxies: {runningproxies}") # Debugging-Ausgabe
-        time.sleep(300) # Warte 5 Minuten
+        with runningproxies_lock:
+            runningproxies = new_proxies
+        if update_event.wait(300):
+            break
 
 def start_proxy_updater(proxies):
+    global updater_thread
+    stop_proxy_updater()
+    update_event.clear()
     updater_thread = Thread(target=update_running_proxies, args=(proxies,))
     updater_thread.daemon = True
     updater_thread.start()
 
-def get_random_proxy():
-    while True:
-        if runningproxies:
-            proxy = random.choice(runningproxies)
-            print(f"Selected proxy: {proxy}") # Debugging-Ausgabe
-            return proxy
-        else:
-            print("No running proxies available.") # Debugging-Ausgabe
-            time.sleep(1) # Warte kurz, bevor erneut versucht wird
+def stop_proxy_updater():
+    global updater_thread
+    update_event.set()
+    if updater_thread and updater_thread.is_alive():
+        updater_thread.join(timeout=1)
+    updater_thread = None
+
+def get_random_proxy(timeout=10):
+    deadline = time.time() + timeout
+    while time.time() < deadline:
+        with runningproxies_lock:
+            available = list(runningproxies)
+        if available:
+            return random.choice(available)
+        if update_event.wait(1):
+            break
+    raise RuntimeError("No running proxies available")
 
 def get_total_pages(query):
     url = f"https://www.kleinanzeigen.de/s-suchanfrage.html?keywords={query}&page=1"
-    response = requests.get(url)
+    try:
+        response = requests.get(url, timeout=10)
+        response.raise_for_status()
+    except requests.RequestException:
+        return 1
+
     soup = BeautifulSoup(response.text, 'html.parser')
     pagination = soup.find('div', class_='pagination')
     if pagination:
         pages = pagination.find_all('a')
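
Note on the proxies mapping used in check_proxy and fetch_page: requests selects a proxy from the proxies dict by the target URL's scheme ('http' or 'https'), not by the proxy's own protocol, so an entry keyed "socks5" is not applied to an https:// request, and SOCKS URLs additionally require the requests[socks] extra to be installed. A minimal sketch of a scheme-keyed mapping, with a placeholder proxy address and an illustrative helper that is not part of this commit:

import requests

def build_proxies(proxy_url):
    # requests picks proxies by the *target* scheme, so route both
    # http:// and https:// requests through the same proxy URL.
    return {"http": proxy_url, "https": proxy_url}

response = requests.get(
    "https://www.google.com",
    proxies=build_proxies("socks5://127.0.0.1:1080"),  # placeholder address
    timeout=5,
)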
@@ -81,21 +124,37 @@ def get_total_pages(query):
         total_pages = 1
     return total_pages
 
-def fetch_page(query, page, unwanted_words, minprice, maxprice, unwanted_prices, proxies):
-    while True:
-        proxy = get_random_proxy()
-        try:
-            proxy_type = proxy.split("://")[0]
-            url = f"https://www.kleinanzeigen.de/s-suchanfrage.html?keywords={query}&page={page}"
-            response = requests.get(url, proxies={proxy_type: proxy})
-            response.raise_for_status()
-            print(f"Successfully fetched page {page} using proxy {proxy}.") # Debugging-Ausgabe
-            break
-        except requests.exceptions.RequestException as e:
-            print(f"Failed to fetch page {page} using proxy {proxy}. Error: {e}") # Debugging-Ausgabe
-            continue
+def fetch_page(query, page, unwanted_words, minprice, maxprice, unwanted_prices, use_proxies):
+    url = f"https://www.kleinanzeigen.de/s-suchanfrage.html?keywords={query}&page={page}"
+    attempts = 0
+    max_attempts = 5
+    last_error = None
+
+    while attempts < max_attempts:
+        proxies = None
+        if use_proxies:
+            try:
+                proxy = get_random_proxy()
+                proxy_type = proxy.split("://")[0]
+                proxies = {proxy_type: proxy}
+            except RuntimeError as exc:
+                use_proxies = False
+                last_error = exc
+                continue
+
+        try:
+            response = requests.get(url, proxies=proxies, timeout=10)
+            response.raise_for_status()
+            break
+        except requests.RequestException as exc:
+            last_error = exc
+            attempts += 1
+            time.sleep(1)
+    else:
+        print(f"Failed to fetch page {page}: {last_error}")
+        return []
 
     soup = BeautifulSoup(response.text, 'html.parser')
 
     items = []
     for item in soup.find_all('li', class_='ad-listitem'):
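
The retry loop in the new fetch_page relies on Python's while/else: the else block runs only when the condition attempts < max_attempts becomes false without a break, i.e. after every attempt failed, while a successful requests.get breaks out and skips the failure branch. A stripped-down illustration (try_once is a hypothetical stand-in, not part of this commit):

attempts = 0
while attempts < 3:
    if try_once():  # hypothetical call returning True on success
        break
    attempts += 1
else:
    print("all attempts failed")  # reached only when the loop ended without a break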
@@ -123,13 +182,31 @@ def search_item(query, unwanted_words, minprice, maxprice, maxpages):
     items = []
     total_pages = get_total_pages(query)
     pages_to_search = min(total_pages, maxpages)
     proxies = load_proxies()
-    start_proxy_updater(proxies)
-
-    with ThreadPoolExecutor(max_workers=10) as executor:
-        futures = [executor.submit(fetch_page, query, page, unwanted_words, minprice, maxprice, ["Zu verschenken"], proxies) for page in range(1, pages_to_search + 1)]
-        for future in futures:
-            items.extend(future.result())
+    use_proxies = bool(proxies)
+    if use_proxies:
+        start_proxy_updater(proxies)
+
+    try:
+        with ThreadPoolExecutor(max_workers=10) as executor:
+            futures = [
+                executor.submit(
+                    fetch_page,
+                    query,
+                    page,
+                    unwanted_words,
+                    minprice,
+                    maxprice,
+                    ["Zu verschenken"],
+                    use_proxies,
+                )
+                for page in range(1, pages_to_search + 1)
+            ]
+            for future in futures:
+                items.extend(future.result())
+    finally:
+        if use_proxies:
+            stop_proxy_updater()
 
     # Ergebnisse nach Preis sortieren
     items.sort(key=lambda x: x[3])
@@ -148,42 +225,50 @@ def search_item(query, unwanted_words, minprice, maxprice, maxpages):
 def index():
     return render_template('index.html')
 
 @app.route('/search', methods=['POST'])
 def search():
     queries = request.form.get('query').split('\n')
     unwanted_words = request.form.get('unwanted_words').split('\n')
     minprice = float(request.form.get('minprice') or 0)
     maxprice = float(request.form.get('maxprice') or float('inf'))
     maxpages = int(request.form.get('maxpages') or 0)
     filename = request.form.get('filename') or 'kleinanzeigen_results'
+    filename = secure_filename(filename)
+    if not filename:
+        filename = 'kleinanzeigen_results'
+    if not filename.lower().endswith('.xlsx'):
+        filename = f"{filename}.xlsx"
 
-    session_id = request.cookies.get('session')
-    if session_id not in progress_data:
-        progress_data[session_id] = {'current_item': 0, 'total_items': 0}
-
+    session_progress = session.setdefault('progress', {'current_item': 0, 'total_items': 0})
     all_items = []
-    progress_data[session_id]['total_items'] = len(queries)
-    for i, query in enumerate(queries):
+    session_progress['total_items'] = len([q for q in queries if q.strip()])
+
+    processed_queries = 0
+    for query in queries:
         query = query.strip()
         if query:
             items = search_item(query, unwanted_words, minprice, maxprice, maxpages)
             all_items.extend(items)
-            progress_data[session_id]['current_item'] = i + 1
+            processed_queries += 1
+            session_progress['current_item'] = processed_queries
+            session['progress'] = session_progress
+            session.modified = True
             time.sleep(0.1) # Füge eine kurze Verzögerung hinzu, um die Fortschrittsanzeige zu aktualisieren
 
     df = pd.DataFrame(all_items, columns=['Artikel', 'Preis', 'Link', 'PriceValue'])
     df = df.drop(columns=['PriceValue'])
-    filepath = f'uploads/{filename}.xlsx'
+    os.makedirs('uploads', exist_ok=True)
+    filepath = os.path.join('uploads', filename)
     df.to_excel(filepath, index=False)
 
     session['filepath'] = filepath
+    session.modified = True
     return jsonify(all_items)
 
 @app.route('/progress')
 def progress():
-    session_id = request.cookies.get('session')
-    return jsonify(progress_data.get(session_id, {'current_item': 0, 'total_items': 0}))
+    return jsonify(session.get('progress', {'current_item': 0, 'total_items': 0}))
 
 @app.route('/download')
 def download():
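
One caveat about moving the progress counters into the Flask session: with server-side sessions, the values written inside the /search handler are generally persisted only when that request's response is finalized, so a client polling /progress during a long search will usually keep seeing the initial counters. If live progress is needed, a lock-guarded process-level store keyed by a per-session token is one option; a minimal sketch under that assumption (names are illustrative, not part of this commit):

from threading import Lock

_progress = {}  # token -> {'current_item': int, 'total_items': int}
_progress_lock = Lock()

def set_progress(token, current, total):
    # Written from the /search handler; readable from /progress while the
    # search is still running in another request of the same process.
    with _progress_lock:
        _progress[token] = {'current_item': current, 'total_items': total}

def get_progress(token):
    with _progress_lock:
        return _progress.get(token, {'current_item': 0, 'total_items': 0})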
@@ -192,7 +277,7 @@ def download():
         return send_file(filepath, as_attachment=True)
     return "File not found", 404
 
 if __name__ == '__main__':
     if not os.path.exists('uploads'):
         os.makedirs('uploads')
-    app.run(debug=True)
+    app.run()
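
Because SECRET_KEY now falls back to a fresh secrets.token_hex(32) whenever the environment variable is unset, each restart (and each separately started worker process) signs session cookies with a different key, which invalidates existing sessions. Setting a fixed value in the environment avoids that; one way to generate such a value (shell line shown as a comment for illustration):

# export SECRET_KEY="$(python -c 'import secrets; print(secrets.token_hex(32))')"
import secrets
print(secrets.token_hex(32))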