1 Commits

Author SHA1 Message Date
429a003453 Harden proxy handling and session security 2025-11-23 20:09:29 +01:00

View File

@@ -4,22 +4,33 @@ from bs4 import BeautifulSoup
import pandas as pd import pandas as pd
from concurrent.futures import ThreadPoolExecutor from concurrent.futures import ThreadPoolExecutor
import os import os
import json
import time import time
from flask_session import Session from flask_session import Session
import random import random
from threading import Thread, Event from threading import Thread, Event, Lock
import secrets
from werkzeug.utils import secure_filename
app = Flask(__name__) app = Flask(__name__)
app.config['SECRET_KEY'] = 'supersecretkey' app.config['SECRET_KEY'] = os.environ.get('SECRET_KEY', secrets.token_hex(32))
app.config['SESSION_TYPE'] = 'filesystem' app.config['SESSION_TYPE'] = 'filesystem'
Session(app) Session(app)
progress_data = {}
runningproxies = [] runningproxies = []
runningproxies_lock = Lock()
update_event = Event() update_event = Event()
updater_thread = None
_proxy_cache = {"proxies": [], "expires_at": 0}
_proxy_cache_lock = Lock()
PROXY_CACHE_TTL = 600
def load_proxies(): def load_proxies():
current_time = time.time()
with _proxy_cache_lock:
if _proxy_cache["proxies"] and current_time < _proxy_cache["expires_at"]:
return list(_proxy_cache["proxies"])
urls = { urls = {
"http": "https://raw.githubusercontent.com/TheSpeedX/SOCKS-List/master/http.txt", "http": "https://raw.githubusercontent.com/TheSpeedX/SOCKS-List/master/http.txt",
"socks4": "https://raw.githubusercontent.com/TheSpeedX/SOCKS-List/master/socks4.txt", "socks4": "https://raw.githubusercontent.com/TheSpeedX/SOCKS-List/master/socks4.txt",
@@ -27,18 +38,31 @@ def load_proxies():
} }
proxies = [] proxies = []
for proxy_type, url in urls.items(): for proxy_type, url in urls.items():
response = requests.get(url) try:
if response.status_code == 200: response = requests.get(url, timeout=5)
proxies.extend([f"{proxy_type}://{line.strip()}" for line in response.text.splitlines() if line.strip()]) response.raise_for_status()
# print(f"Loaded proxies: {proxies}") # Debugging-Ausgabe except requests.RequestException:
continue
proxies.extend(
[f"{proxy_type}://{line.strip()}" for line in response.text.splitlines() if line.strip()]
)
with _proxy_cache_lock:
_proxy_cache["proxies"] = proxies
_proxy_cache["expires_at"] = current_time + PROXY_CACHE_TTL
return proxies return proxies
def check_proxy(proxy): def check_proxy(proxy):
try: try:
proxy_type = proxy.split("://")[0] proxy_type = proxy.split("://")[0]
response = requests.get("https://www.google.com", proxies={proxy_type: proxy}, timeout=1) response = requests.get(
"https://www.google.com",
proxies={proxy_type: proxy},
timeout=5,
)
return response.status_code == 200 return response.status_code == 200
except: except requests.RequestException:
return False return False
def update_running_proxies(proxies): def update_running_proxies(proxies):
@@ -50,28 +74,47 @@ def update_running_proxies(proxies):
break break
if check_proxy(proxy): if check_proxy(proxy):
new_proxies.append(proxy) new_proxies.append(proxy)
runningproxies = new_proxies with runningproxies_lock:
# print(f"Updated running proxies: {runningproxies}") # Debugging-Ausgabe runningproxies = new_proxies
time.sleep(300) # Warte 5 Minuten if update_event.wait(300):
break
def start_proxy_updater(proxies): def start_proxy_updater(proxies):
global updater_thread
stop_proxy_updater()
update_event.clear()
updater_thread = Thread(target=update_running_proxies, args=(proxies,)) updater_thread = Thread(target=update_running_proxies, args=(proxies,))
updater_thread.daemon = True updater_thread.daemon = True
updater_thread.start() updater_thread.start()
def get_random_proxy():
while True: def stop_proxy_updater():
if runningproxies: global updater_thread
proxy = random.choice(runningproxies) update_event.set()
print(f"Selected proxy: {proxy}") # Debugging-Ausgabe if updater_thread and updater_thread.is_alive():
return proxy updater_thread.join(timeout=1)
else: updater_thread = None
print("No running proxies available.") # Debugging-Ausgabe
time.sleep(1) # Warte kurz, bevor erneut versucht wird def get_random_proxy(timeout=10):
deadline = time.time() + timeout
while time.time() < deadline:
with runningproxies_lock:
available = list(runningproxies)
if available:
return random.choice(available)
if update_event.wait(1):
break
raise RuntimeError("No running proxies available")
def get_total_pages(query): def get_total_pages(query):
url = f"https://www.kleinanzeigen.de/s-suchanfrage.html?keywords={query}&page=1" url = f"https://www.kleinanzeigen.de/s-suchanfrage.html?keywords={query}&page=1"
response = requests.get(url) try:
response = requests.get(url, timeout=10)
response.raise_for_status()
except requests.RequestException:
return 1
soup = BeautifulSoup(response.text, 'html.parser') soup = BeautifulSoup(response.text, 'html.parser')
pagination = soup.find('div', class_='pagination') pagination = soup.find('div', class_='pagination')
if pagination: if pagination:
@@ -81,19 +124,35 @@ def get_total_pages(query):
total_pages = 1 total_pages = 1
return total_pages return total_pages
def fetch_page(query, page, unwanted_words, minprice, maxprice, unwanted_prices, proxies): def fetch_page(query, page, unwanted_words, minprice, maxprice, unwanted_prices, use_proxies):
while True: url = f"https://www.kleinanzeigen.de/s-suchanfrage.html?keywords={query}&page={page}"
proxy = get_random_proxy() attempts = 0
max_attempts = 5
last_error = None
while attempts < max_attempts:
proxies = None
if use_proxies:
try:
proxy = get_random_proxy()
proxy_type = proxy.split("://")[0]
proxies = {proxy_type: proxy}
except RuntimeError as exc:
use_proxies = False
last_error = exc
continue
try: try:
proxy_type = proxy.split("://")[0] response = requests.get(url, proxies=proxies, timeout=10)
url = f"https://www.kleinanzeigen.de/s-suchanfrage.html?keywords={query}&page={page}"
response = requests.get(url, proxies={proxy_type: proxy})
response.raise_for_status() response.raise_for_status()
print(f"Successfully fetched page {page} using proxy {proxy}.") # Debugging-Ausgabe
break break
except requests.exceptions.RequestException as e: except requests.RequestException as exc:
print(f"Failed to fetch page {page} using proxy {proxy}. Error: {e}") # Debugging-Ausgabe last_error = exc
continue attempts += 1
time.sleep(1)
else:
print(f"Failed to fetch page {page}: {last_error}")
return []
soup = BeautifulSoup(response.text, 'html.parser') soup = BeautifulSoup(response.text, 'html.parser')
items = [] items = []
@@ -124,12 +183,30 @@ def search_item(query, unwanted_words, minprice, maxprice, maxpages):
total_pages = get_total_pages(query) total_pages = get_total_pages(query)
pages_to_search = min(total_pages, maxpages) pages_to_search = min(total_pages, maxpages)
proxies = load_proxies() proxies = load_proxies()
start_proxy_updater(proxies) use_proxies = bool(proxies)
if use_proxies:
start_proxy_updater(proxies)
with ThreadPoolExecutor(max_workers=10) as executor: try:
futures = [executor.submit(fetch_page, query, page, unwanted_words, minprice, maxprice, ["Zu verschenken"], proxies) for page in range(1, pages_to_search + 1)] with ThreadPoolExecutor(max_workers=10) as executor:
for future in futures: futures = [
items.extend(future.result()) executor.submit(
fetch_page,
query,
page,
unwanted_words,
minprice,
maxprice,
["Zu verschenken"],
use_proxies,
)
for page in range(1, pages_to_search + 1)
]
for future in futures:
items.extend(future.result())
finally:
if use_proxies:
stop_proxy_updater()
# Ergebnisse nach Preis sortieren # Ergebnisse nach Preis sortieren
items.sort(key=lambda x: x[3]) items.sort(key=lambda x: x[3])
@@ -156,34 +233,42 @@ def search():
maxprice = float(request.form.get('maxprice') or float('inf')) maxprice = float(request.form.get('maxprice') or float('inf'))
maxpages = int(request.form.get('maxpages') or 0) maxpages = int(request.form.get('maxpages') or 0)
filename = request.form.get('filename') or 'kleinanzeigen_results' filename = request.form.get('filename') or 'kleinanzeigen_results'
filename = secure_filename(filename)
if not filename:
filename = 'kleinanzeigen_results'
if not filename.lower().endswith('.xlsx'):
filename = f"{filename}.xlsx"
session_id = request.cookies.get('session') session_progress = session.setdefault('progress', {'current_item': 0, 'total_items': 0})
if session_id not in progress_data:
progress_data[session_id] = {'current_item': 0, 'total_items': 0}
all_items = [] all_items = []
progress_data[session_id]['total_items'] = len(queries) session_progress['total_items'] = len([q for q in queries if q.strip()])
for i, query in enumerate(queries): processed_queries = 0
for query in queries:
query = query.strip() query = query.strip()
if query: if query:
items = search_item(query, unwanted_words, minprice, maxprice, maxpages) items = search_item(query, unwanted_words, minprice, maxprice, maxpages)
all_items.extend(items) all_items.extend(items)
progress_data[session_id]['current_item'] = i + 1 processed_queries += 1
session_progress['current_item'] = processed_queries
session['progress'] = session_progress
session.modified = True
time.sleep(0.1) # Füge eine kurze Verzögerung hinzu, um die Fortschrittsanzeige zu aktualisieren time.sleep(0.1) # Füge eine kurze Verzögerung hinzu, um die Fortschrittsanzeige zu aktualisieren
df = pd.DataFrame(all_items, columns=['Artikel', 'Preis', 'Link', 'PriceValue']) df = pd.DataFrame(all_items, columns=['Artikel', 'Preis', 'Link', 'PriceValue'])
df = df.drop(columns=['PriceValue']) df = df.drop(columns=['PriceValue'])
filepath = f'uploads/{filename}.xlsx' os.makedirs('uploads', exist_ok=True)
filepath = os.path.join('uploads', filename)
df.to_excel(filepath, index=False) df.to_excel(filepath, index=False)
session['filepath'] = filepath session['filepath'] = filepath
session.modified = True
return jsonify(all_items) return jsonify(all_items)
@app.route('/progress') @app.route('/progress')
def progress(): def progress():
session_id = request.cookies.get('session') return jsonify(session.get('progress', {'current_item': 0, 'total_items': 0}))
return jsonify(progress_data.get(session_id, {'current_item': 0, 'total_items': 0}))
@app.route('/download') @app.route('/download')
def download(): def download():
@@ -195,4 +280,4 @@ def download():
if __name__ == '__main__': if __name__ == '__main__':
if not os.path.exists('uploads'): if not os.path.exists('uploads'):
os.makedirs('uploads') os.makedirs('uploads')
app.run(debug=True) app.run()