# (viewer metadata removed: "284 lines, 9.2 KiB, Python" — not part of the source)
import os
import random
import secrets
import time
from concurrent.futures import ThreadPoolExecutor
from threading import Thread, Event, Lock
from urllib.parse import quote_plus

import pandas as pd
import requests
from bs4 import BeautifulSoup
from flask import Flask, request, render_template, jsonify, send_file, session
from flask_session import Session
from werkzeug.utils import secure_filename
|
|
|
|
app = Flask(__name__)
|
|
app.config['SECRET_KEY'] = os.environ.get('SECRET_KEY', secrets.token_hex(32))
|
|
app.config['SESSION_TYPE'] = 'filesystem'
|
|
Session(app)
|
|
|
|
runningproxies = []
|
|
runningproxies_lock = Lock()
|
|
update_event = Event()
|
|
updater_thread = None
|
|
|
|
_proxy_cache = {"proxies": [], "expires_at": 0}
|
|
_proxy_cache_lock = Lock()
|
|
PROXY_CACHE_TTL = 600
|
|
|
|
def load_proxies():
    """Return the combined proxy list, refreshing the module cache as needed.

    Downloads http/socks4/socks5 lists from the TheSpeedX GitHub mirrors and
    caches the result for PROXY_CACHE_TTL seconds; a source that fails to
    download is simply skipped.
    """
    now = time.time()

    # Serve from the cache while it is still fresh.
    with _proxy_cache_lock:
        cached = _proxy_cache["proxies"]
        if cached and now < _proxy_cache["expires_at"]:
            return list(cached)

    sources = {
        "http": "https://raw.githubusercontent.com/TheSpeedX/SOCKS-List/master/http.txt",
        "socks4": "https://raw.githubusercontent.com/TheSpeedX/SOCKS-List/master/socks4.txt",
        "socks5": "https://raw.githubusercontent.com/TheSpeedX/SOCKS-List/master/socks5.txt"
    }

    collected = []
    for scheme, source_url in sources.items():
        try:
            resp = requests.get(source_url, timeout=5)
            resp.raise_for_status()
        except requests.RequestException:
            continue  # best-effort: skip unreachable lists
        for line in resp.text.splitlines():
            entry = line.strip()
            if entry:
                collected.append(f"{scheme}://{entry}")

    with _proxy_cache_lock:
        _proxy_cache["proxies"] = collected
        _proxy_cache["expires_at"] = now + PROXY_CACHE_TTL

    return collected
|
|
|
|
def check_proxy(proxy):
    """Return True when *proxy* (scheme://host:port) can fetch Google in 5s."""
    scheme = proxy.split("://")[0]
    try:
        resp = requests.get(
            "https://www.google.com",
            proxies={scheme: proxy},
            timeout=5,
        )
    except requests.RequestException:
        return False
    return resp.status_code == 200
|
|
|
|
def update_running_proxies(proxies):
    """Background worker that keeps `runningproxies` stocked with live proxies.

    Each cycle validates candidates from *proxies* until 20 working ones are
    found, swaps them into the shared list under the lock, then sleeps up to
    5 minutes.  Setting `update_event` ends the loop promptly.
    """
    global runningproxies
    while not update_event.is_set():
        verified = []
        for candidate in proxies:
            if check_proxy(candidate):
                verified.append(candidate)
                if len(verified) == 20:
                    break
        with runningproxies_lock:
            runningproxies = verified
        # wait() doubles as the sleep; a True return means "stop requested".
        if update_event.wait(300):
            break
|
|
|
|
|
|
def start_proxy_updater(proxies):
    """(Re)start the daemon thread that validates *proxies* in the background."""
    global updater_thread
    stop_proxy_updater()  # tear down any previous worker first
    update_event.clear()
    worker = Thread(target=update_running_proxies, args=(proxies,), daemon=True)
    updater_thread = worker
    worker.start()
|
|
|
|
|
|
def stop_proxy_updater():
    """Ask the updater thread to stop and wait at most 1s for it to exit."""
    global updater_thread
    update_event.set()
    worker = updater_thread
    if worker is not None and worker.is_alive():
        worker.join(timeout=1)
    updater_thread = None
|
|
|
|
def get_random_proxy(timeout=10):
    """Pick a random verified proxy, waiting up to *timeout* seconds.

    Raises RuntimeError when no proxy becomes available in time, or when the
    updater is stopped while we are waiting.
    """
    stop_at = time.time() + timeout
    while time.time() < stop_at:
        with runningproxies_lock:
            candidates = runningproxies[:]
        if candidates:
            return random.choice(candidates)
        # A True return means the updater was stopped — give up early.
        if update_event.wait(1):
            break
    raise RuntimeError("No running proxies available")
|
|
|
|
def get_total_pages(query):
    """Return the number of result pages for *query* on kleinanzeigen.de.

    Falls back to 1 on any network error or unexpected pagination markup.
    """
    # quote_plus: the raw query previously went into the URL unencoded, so
    # spaces, '&', '#' or umlauts produced a wrong request.
    url = f"https://www.kleinanzeigen.de/s-suchanfrage.html?keywords={quote_plus(query)}&page=1"
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.RequestException:
        return 1

    soup = BeautifulSoup(response.text, 'html.parser')
    pagination = soup.find('div', class_='pagination')
    if not pagination:
        return 1
    pages = pagination.find_all('a')
    try:
        # The last link is presumably "next"; the one before it carries the
        # highest page number.  Guard against short or non-numeric link
        # lists, which previously raised IndexError/ValueError and crashed.
        return int(pages[-2].text)
    except (IndexError, ValueError):
        return 1
|
|
|
|
def _parse_price(raw_price):
    """Normalize a raw price string such as '1.234 € VB'.

    Returns (display, value): *display* keeps the historical output format
    ('1.234 €'); *value* is the numeric amount, 0 when unparseable (e.g.
    'Zu verschenken').
    """
    display = raw_price.replace('€', '').replace('VB', '').replace(',', '').strip()
    # German listings appear to use '.' as a thousands separator ('1.234');
    # strip it before the digit check so those prices don't fall back to 0.
    numeric = display.replace('.', '')
    value = float(numeric) if numeric.isdigit() else 0
    return f"{display} €", value


def fetch_page(query, page, unwanted_words, minprice, maxprice, unwanted_prices, use_proxies):
    """Fetch one search-result page and return filtered rows.

    Each row is [title, price_display, link, price_value].  Retries up to 5
    times (optionally through random proxies, falling back to direct
    requests when none is available) and returns [] on total failure.
    """
    # quote_plus: the query previously went into the URL unencoded.
    url = f"https://www.kleinanzeigen.de/s-suchanfrage.html?keywords={quote_plus(query)}&page={page}"
    attempts = 0
    max_attempts = 5
    last_error = None

    while attempts < max_attempts:
        proxies = None
        if use_proxies:
            try:
                proxy = get_random_proxy()
                proxy_type = proxy.split("://")[0]
                proxies = {proxy_type: proxy}
            except RuntimeError as exc:
                # No working proxy: switch to direct requests and retry.
                use_proxies = False
                last_error = exc
                continue

        try:
            response = requests.get(url, proxies=proxies, timeout=10)
            response.raise_for_status()
            break
        except requests.RequestException as exc:
            last_error = exc
            attempts += 1
            time.sleep(1)
    else:
        print(f"Failed to fetch page {page}: {last_error}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    items = []

    for item in soup.find_all('li', class_='ad-listitem'):
        title_tag = item.find('a', class_='ellipsis')
        title = title_tag.text.strip().lower() if title_tag else 'n/a'

        price_tag = item.find('p', class_='aditem-main--middle--price-shipping--price')
        raw_price = price_tag.text.strip() if price_tag else 'n/a'

        if raw_price != 'n/a':
            price, price_value = _parse_price(raw_price)
        else:
            price, price_value = 'n/a', 0

        link = "https://www.kleinanzeigen.de" + title_tag['href'] if title_tag else 'n/a'

        # Compare the RAW price text against unwanted_prices: the old code
        # compared the decorated 'X €' string, so 'Zu verschenken' entries
        # were never filtered out.
        if raw_price in unwanted_prices:
            continue

        # Negotiable ('VB') offers bypass the price bounds.  Check the raw
        # text: the old check ran after replace('VB', '') had already
        # removed the marker and therefore never matched.
        negotiable = 'vb' in raw_price.lower()

        # Skip blank filter words: '' is a substring of every title and
        # would otherwise reject all items.
        if (minprice <= price_value <= maxprice or negotiable) and not any(
            word.lower() in title for word in unwanted_words if word.strip()
        ):
            items.append([title, price, link, price_value])

    return items
|
|
|
|
def search_item(query, unwanted_words, minprice, maxprice, maxpages):
    """Search up to *maxpages* result pages for *query*.

    Fetches pages concurrently (through proxies when any are available),
    then returns the rows sorted by numeric price with duplicate links
    removed (first occurrence wins).
    """
    total_pages = get_total_pages(query)
    page_count = min(total_pages, maxpages)
    proxy_list = load_proxies()
    use_proxies = bool(proxy_list)
    if use_proxies:
        start_proxy_updater(proxy_list)

    results = []
    try:
        with ThreadPoolExecutor(max_workers=10) as pool:
            pending = []
            for page in range(1, page_count + 1):
                pending.append(
                    pool.submit(
                        fetch_page,
                        query,
                        page,
                        unwanted_words,
                        minprice,
                        maxprice,
                        ["Zu verschenken"],
                        use_proxies,
                    )
                )
            for fut in pending:
                results.extend(fut.result())
    finally:
        # Always stop the updater thread, even if a page fetch raised.
        if use_proxies:
            stop_proxy_updater()

    # Sort by the numeric price column.
    results.sort(key=lambda row: row[3])

    # De-duplicate on the link column, keeping the cheapest occurrence.
    seen_links = set()
    deduped = []
    for row in results:
        if row[2] not in seen_links:
            seen_links.add(row[2])
            deduped.append(row)

    return deduped
|
|
|
|
@app.route('/')
def index():
    """Render the search form page."""
    return render_template('index.html')
|
|
|
|
@app.route('/search', methods=['POST'])
def search():
    """Run a search per query line and export the results to an Excel file.

    Form fields: query (newline-separated), unwanted_words, minprice,
    maxprice, maxpages, filename.  Stores progress and the spreadsheet
    path in the session (for /progress and /download) and returns all
    matching rows as JSON.
    """
    # Supply defaults so missing form fields don't raise AttributeError on
    # None.split().
    queries = request.form.get('query', '').split('\n')
    # Drop blank filter words: an empty string is a substring of every
    # title and would filter out all results.
    unwanted_words = [
        word.strip()
        for word in request.form.get('unwanted_words', '').split('\n')
        if word.strip()
    ]
    minprice = float(request.form.get('minprice') or 0)
    maxprice = float(request.form.get('maxprice') or float('inf'))
    maxpages = int(request.form.get('maxpages') or 0)
    filename = request.form.get('filename') or 'kleinanzeigen_results'
    filename = secure_filename(filename)
    if not filename:
        filename = 'kleinanzeigen_results'
    if not filename.lower().endswith('.xlsx'):
        # Append the extension to the user's chosen name (the previous
        # code discarded the name entirely here).
        filename = f"{filename}.xlsx"

    session_progress = session.setdefault('progress', {'current_item': 0, 'total_items': 0})

    all_items = []
    session_progress['total_items'] = len([q for q in queries if q.strip()])

    processed_queries = 0
    for query in queries:
        query = query.strip()
        if query:
            items = search_item(query, unwanted_words, minprice, maxprice, maxpages)
            all_items.extend(items)
            processed_queries += 1
            session_progress['current_item'] = processed_queries
            session['progress'] = session_progress
            session.modified = True
            # Brief pause so the /progress endpoint can observe updates.
            time.sleep(0.1)

    df = pd.DataFrame(all_items, columns=['Artikel', 'Preis', 'Link', 'PriceValue'])
    df = df.drop(columns=['PriceValue'])  # helper column used only for sorting
    os.makedirs('uploads', exist_ok=True)
    filepath = os.path.join('uploads', filename)
    df.to_excel(filepath, index=False)

    session['filepath'] = filepath
    session.modified = True
    return jsonify(all_items)
|
|
|
|
@app.route('/progress')
def progress():
    """Return the current search progress (current_item / total_items) as JSON."""
    return jsonify(session.get('progress', {'current_item': 0, 'total_items': 0}))
|
|
|
|
@app.route('/download')
def download():
    """Send the most recently generated spreadsheet as an attachment."""
    stored_path = session.get('filepath')
    if not stored_path or not os.path.exists(stored_path):
        return "File not found", 404
    return send_file(stored_path, as_attachment=True)
|
|
|
|
if __name__ == '__main__':
    # Ensure the output directory exists before serving requests;
    # exist_ok avoids the racy exists-then-create pattern.
    os.makedirs('uploads', exist_ok=True)
    app.run()
|