Files
Kleinanzeigen-Preisabfrage/Kleinanzeigen-Preisabfrage-main/app.py

284 lines
9.2 KiB
Python

from flask import Flask, request, render_template, jsonify, send_file, session
import requests
from bs4 import BeautifulSoup
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
import os
import time
from flask_session import Session
import random
from threading import Thread, Event, Lock
import secrets
from werkzeug.utils import secure_filename
app = Flask(__name__)
# Prefer a deployment-provided key; otherwise generate a random per-process
# key (sessions won't survive a restart in that case).
app.config['SECRET_KEY'] = os.environ.get('SECRET_KEY', secrets.token_hex(32))
app.config['SESSION_TYPE'] = 'filesystem'
Session(app)
# Proxies that passed a health check; refreshed by a background thread.
runningproxies = []
runningproxies_lock = Lock()
# Set to request shutdown of the updater thread.
update_event = Event()
updater_thread = None
# Cache of the raw downloaded proxy lists, valid for PROXY_CACHE_TTL seconds.
_proxy_cache = {"proxies": [], "expires_at": 0}
_proxy_cache_lock = Lock()
PROXY_CACHE_TTL = 600
def load_proxies():
    """Return the proxy list, refreshing it from the GitHub lists when stale.

    Results are cached module-wide for ``PROXY_CACHE_TTL`` seconds; a copy
    of the cached list is returned so callers cannot mutate the cache.
    """
    now = time.time()
    with _proxy_cache_lock:
        cached = _proxy_cache["proxies"]
        if cached and now < _proxy_cache["expires_at"]:
            return list(cached)
    sources = {
        "http": "https://raw.githubusercontent.com/TheSpeedX/SOCKS-List/master/http.txt",
        "socks4": "https://raw.githubusercontent.com/TheSpeedX/SOCKS-List/master/socks4.txt",
        "socks5": "https://raw.githubusercontent.com/TheSpeedX/SOCKS-List/master/socks5.txt"
    }
    collected = []
    for scheme, source_url in sources.items():
        try:
            resp = requests.get(source_url, timeout=5)
            resp.raise_for_status()
        except requests.RequestException:
            # Best effort: skip any list that cannot be downloaded.
            continue
        for raw_line in resp.text.splitlines():
            entry = raw_line.strip()
            if entry:
                collected.append(f"{scheme}://{entry}")
    with _proxy_cache_lock:
        _proxy_cache["proxies"] = collected
        _proxy_cache["expires_at"] = now + PROXY_CACHE_TTL
    return collected
def check_proxy(proxy):
    """Return True if *proxy* can fetch https://www.google.com within 5s.

    Bug fix: requests keys the ``proxies`` mapping by the *target* URL
    scheme ("http"/"https"), not by the proxy's own scheme.  The old code
    used the proxy scheme (e.g. "socks4") as the key, so the HTTPS probe
    bypassed the proxy entirely and every proxy looked healthy.
    """
    try:
        response = requests.get(
            "https://www.google.com",
            # Route both plain and TLS traffic through the candidate proxy;
            # the "scheme://host:port" prefix tells requests the proxy type.
            proxies={"http": proxy, "https": proxy},
            timeout=5,
        )
        return response.status_code == 200
    except requests.RequestException:
        return False
def update_running_proxies(proxies):
    """Background worker: keep ``runningproxies`` stocked with live proxies.

    Re-validates candidates until 20 pass, publishes the batch under the
    lock, then sleeps up to five minutes.  Setting ``update_event`` ends
    the loop promptly.
    """
    global runningproxies
    while not update_event.is_set():
        healthy = []
        for candidate in proxies:
            if len(healthy) >= 20:
                break
            if check_proxy(candidate):
                healthy.append(candidate)
        with runningproxies_lock:
            runningproxies = healthy
        # Event.wait returns True when set() was called -> shut down early.
        if update_event.wait(300):
            break
def start_proxy_updater(proxies):
    """(Re)start the daemon thread that refreshes the live-proxy pool."""
    global updater_thread
    # Ensure at most one worker runs at any time.
    stop_proxy_updater()
    update_event.clear()
    worker = Thread(target=update_running_proxies, args=(proxies,), daemon=True)
    updater_thread = worker
    worker.start()
def stop_proxy_updater():
    """Signal the updater thread to stop and wait briefly for it to exit."""
    global updater_thread
    update_event.set()
    worker = updater_thread
    if worker is not None and worker.is_alive():
        # Don't block shutdown for long; the thread is a daemon anyway.
        worker.join(timeout=1)
    updater_thread = None
def get_random_proxy(timeout=10):
    """Return a random live proxy, polling up to *timeout* seconds.

    Raises ``RuntimeError`` if no proxy becomes available in time or if a
    shutdown is signalled via ``update_event``.
    """
    deadline = time.time() + timeout
    while time.time() < deadline:
        with runningproxies_lock:
            snapshot = list(runningproxies)
        if snapshot:
            return random.choice(snapshot)
        # Wait a second between polls; returns True on shutdown signal.
        if update_event.wait(1):
            break
    raise RuntimeError("No running proxies available")
def get_total_pages(query):
    """Best-effort count of result pages for *query*; falls back to 1.

    Scrapes the pagination widget on page 1.  Any network failure or an
    unexpected pagination layout yields 1 rather than raising.
    """
    url = f"https://www.kleinanzeigen.de/s-suchanfrage.html?keywords={query}&page=1"
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.RequestException:
        return 1
    soup = BeautifulSoup(response.text, 'html.parser')
    pagination = soup.find('div', class_='pagination')
    if not pagination:
        return 1
    pages = pagination.find_all('a')
    # Bug fix: the old code indexed pages[-2] unconditionally (IndexError
    # with fewer than two anchors) and int()-converted whatever text was
    # there (ValueError on non-numeric labels such as "Next").
    if len(pages) >= 2:
        label = pages[-2].text.strip()
        if label.isdigit():
            return int(label)
    return 1
def fetch_page(query, page, unwanted_words, minprice, maxprice, unwanted_prices, use_proxies):
    """Fetch one Kleinanzeigen result page and return filtered listings.

    Returns rows ``[title, price_text, link, price_value]``, or an empty
    list after ``max_attempts`` failed fetches.  Listings are kept when
    their price is within [minprice, maxprice] (or marked 'vb'), their
    title contains none of *unwanted_words*, and their price text is not
    in *unwanted_prices*.
    """
    url = f"https://www.kleinanzeigen.de/s-suchanfrage.html?keywords={query}&page={page}"
    attempts = 0
    max_attempts = 5
    last_error = None
    while attempts < max_attempts:
        proxies = None
        if use_proxies:
            try:
                proxy = get_random_proxy()
                # Bug fix: requests keys the proxies mapping by the *target*
                # URL scheme, not the proxy's own scheme; the old code used
                # e.g. "socks4" as the key, so the proxy was never applied.
                proxies = {"http": proxy, "https": proxy}
            except RuntimeError as exc:
                # No live proxy available: fall back to direct requests.
                use_proxies = False
                last_error = exc
                continue
        try:
            response = requests.get(url, proxies=proxies, timeout=10)
            response.raise_for_status()
            break
        except requests.RequestException as exc:
            last_error = exc
            attempts += 1
            time.sleep(1)
    else:
        # Loop exhausted without a successful fetch.
        print(f"Failed to fetch page {page}: {last_error}")
        return []
    soup = BeautifulSoup(response.text, 'html.parser')
    items = []
    for item in soup.find_all('li', class_='ad-listitem'):
        title_tag = item.find('a', class_='ellipsis')
        title = title_tag.text.strip().lower() if title_tag else 'n/a'
        price_tag = item.find('p', class_='aditem-main--middle--price-shipping--price')
        price = price_tag.text.strip() if price_tag else 'n/a'
        if price != 'n/a':
            # Bug fix: the euro sign was lost to mojibake in the original
            # ``replace('', '')`` (a no-op); strip '€' so that a price like
            # "120 € VB" cleans to "120" and isdigit() can succeed.
            price = price.replace('€', '').replace('VB', '').replace(',', '').strip()
            price_value = float(price) if price.isdigit() else 0
        else:
            price_value = 0
        link = "https://www.kleinanzeigen.de" + title_tag['href'] if title_tag else 'n/a'
        in_range = minprice <= price_value <= maxprice or 'vb' in price.lower()
        blocked_word = any(word.lower() in title for word in unwanted_words)
        if in_range and not blocked_word and price not in unwanted_prices:
            items.append([title, price, link, price_value])
    return items
def search_item(query, unwanted_words, minprice, maxprice, maxpages):
    """Search up to *maxpages* result pages for *query*.

    Pages are fetched concurrently (through proxies when any are
    available).  Returns rows sorted by numeric price, deduplicated by
    listing link.
    """
    pages_to_search = min(get_total_pages(query), maxpages)
    proxy_pool = load_proxies()
    use_proxies = bool(proxy_pool)
    if use_proxies:
        start_proxy_updater(proxy_pool)
    results = []
    try:
        with ThreadPoolExecutor(max_workers=10) as pool:
            futures = []
            for page in range(1, pages_to_search + 1):
                futures.append(
                    pool.submit(
                        fetch_page,
                        query,
                        page,
                        unwanted_words,
                        minprice,
                        maxprice,
                        ["Zu verschenken"],
                        use_proxies,
                    )
                )
            for fut in futures:
                results.extend(fut.result())
    finally:
        if use_proxies:
            stop_proxy_updater()
    # Cheapest listings first.
    results.sort(key=lambda row: row[3])
    # Deduplicate by link while preserving the sorted order.
    seen_links = set()
    unique = []
    for row in results:
        if row[2] not in seen_links:
            seen_links.add(row[2])
            unique.append(row)
    return unique
@app.route('/')
def index():
    """Serve the search form page."""
    return render_template('index.html')
@app.route('/search', methods=['POST'])
def search():
    """Run a search per query line, export results to Excel, return JSON.

    Form fields: ``query`` (newline-separated queries), ``unwanted_words``
    (newline-separated title filters), ``minprice``/``maxprice``/
    ``maxpages`` (numeric), ``filename`` (Excel export name).  Progress is
    published in the session for the ``/progress`` endpoint.
    """
    # Bug fix: request.form.get() returns None for a missing field, which
    # crashed the .split() calls; default to '' instead.
    queries = (request.form.get('query') or '').split('\n')
    # Bug fix: splitting an empty field yields [''], and '' is a substring
    # of every title, which silently filtered out ALL listings.  Drop
    # blank entries.
    unwanted_words = [w.strip() for w in (request.form.get('unwanted_words') or '').split('\n') if w.strip()]
    minprice = float(request.form.get('minprice') or 0)
    maxprice = float(request.form.get('maxprice') or float('inf'))
    maxpages = int(request.form.get('maxpages') or 0)
    filename = request.form.get('filename') or 'kleinanzeigen_results'
    filename = secure_filename(filename)
    if not filename:
        filename = 'kleinanzeigen_results'
    if not filename.lower().endswith('.xlsx'):
        # Bug fix: the old code discarded the user's name and used the
        # literal "(unknown).xlsx"; append the extension instead.
        filename = f"{filename}.xlsx"
    session_progress = session.setdefault('progress', {'current_item': 0, 'total_items': 0})
    pending = [q.strip() for q in queries if q.strip()]
    session_progress['total_items'] = len(pending)
    session_progress['current_item'] = 0
    session['progress'] = session_progress
    session.modified = True
    all_items = []
    for done, query in enumerate(pending, start=1):
        all_items.extend(search_item(query, unwanted_words, minprice, maxprice, maxpages))
        session_progress['current_item'] = done
        session['progress'] = session_progress
        session.modified = True
        # Brief pause so the /progress endpoint can observe intermediate state.
        time.sleep(0.1)
    df = pd.DataFrame(all_items, columns=['Artikel', 'Preis', 'Link', 'PriceValue'])
    # PriceValue is only a sort key; don't export it.
    df = df.drop(columns=['PriceValue'])
    os.makedirs('uploads', exist_ok=True)
    filepath = os.path.join('uploads', filename)
    df.to_excel(filepath, index=False)
    session['filepath'] = filepath
    session.modified = True
    return jsonify(all_items)
@app.route('/progress')
def progress():
    """Return the current search progress from the session as JSON."""
    return jsonify(session.get('progress', {'current_item': 0, 'total_items': 0}))
@app.route('/download')
def download():
    """Send the most recently exported Excel file, or 404 if unavailable."""
    filepath = session.get('filepath')
    if not filepath or not os.path.exists(filepath):
        return "File not found", 404
    return send_file(filepath, as_attachment=True)
if __name__ == '__main__':
    # Idiom fix: makedirs(exist_ok=True) replaces the racy
    # exists()-then-makedirs check.  Ensure the export directory exists
    # before serving requests.
    os.makedirs('uploads', exist_ok=True)
    app.run()