# (viewer metadata removed: "284 lines, 9.2 KiB, Python" — not part of the source)
import os
import random
import secrets
import time
from concurrent.futures import ThreadPoolExecutor
from threading import Thread, Event, Lock
from urllib.parse import quote_plus

import pandas as pd
import requests
from bs4 import BeautifulSoup
from flask import Flask, request, render_template, jsonify, send_file, session
from flask_session import Session
from werkzeug.utils import secure_filename
|
|
|
|
app = Flask(__name__)
|
|
app.config['SECRET_KEY'] = os.environ.get('SECRET_KEY', secrets.token_hex(32))
|
|
app.config['SESSION_TYPE'] = 'filesystem'
|
|
Session(app)
|
|
|
|
runningproxies = []
|
|
runningproxies_lock = Lock()
|
|
update_event = Event()
|
|
updater_thread = None
|
|
|
|
_proxy_cache = {"proxies": [], "expires_at": 0}
|
|
_proxy_cache_lock = Lock()
|
|
PROXY_CACHE_TTL = 600
|
|
|
|
def load_proxies():
    """Return the combined proxy list, refreshing the module cache as needed.

    Downloads http/socks4/socks5 lists from the TheSpeedX GitHub mirrors and
    caches the result for PROXY_CACHE_TTL seconds; a source that fails to
    download is simply skipped.
    """
    now = time.time()

    # Serve from the cache while it is still fresh.
    with _proxy_cache_lock:
        cached = _proxy_cache["proxies"]
        if cached and now < _proxy_cache["expires_at"]:
            return list(cached)

    sources = {
        "http": "https://raw.githubusercontent.com/TheSpeedX/SOCKS-List/master/http.txt",
        "socks4": "https://raw.githubusercontent.com/TheSpeedX/SOCKS-List/master/socks4.txt",
        "socks5": "https://raw.githubusercontent.com/TheSpeedX/SOCKS-List/master/socks5.txt"
    }

    collected = []
    for scheme, source_url in sources.items():
        try:
            resp = requests.get(source_url, timeout=5)
            resp.raise_for_status()
        except requests.RequestException:
            continue  # best-effort: skip unreachable lists
        for line in resp.text.splitlines():
            entry = line.strip()
            if entry:
                collected.append(f"{scheme}://{entry}")

    with _proxy_cache_lock:
        _proxy_cache["proxies"] = collected
        _proxy_cache["expires_at"] = now + PROXY_CACHE_TTL

    return collected
|
|
|
|
def check_proxy(proxy):
    """Return True when *proxy* (scheme://host:port) can fetch Google in 5s."""
    scheme = proxy.split("://")[0]
    try:
        resp = requests.get(
            "https://www.google.com",
            proxies={scheme: proxy},
            timeout=5,
        )
    except requests.RequestException:
        return False
    return resp.status_code == 200
|
|
|
|
def update_running_proxies(proxies):
    """Background worker that keeps `runningproxies` stocked with live proxies.

    Each cycle validates candidates from *proxies* until 20 working ones are
    found, swaps them into the shared list under the lock, then sleeps up to
    5 minutes.  Setting `update_event` ends the loop promptly.
    """
    global runningproxies
    while not update_event.is_set():
        verified = []
        for candidate in proxies:
            if check_proxy(candidate):
                verified.append(candidate)
                if len(verified) == 20:
                    break
        with runningproxies_lock:
            runningproxies = verified
        # wait() doubles as the sleep; a True return means "stop requested".
        if update_event.wait(300):
            break
|
|
|
|
|
|
def start_proxy_updater(proxies):
    """(Re)start the daemon thread that validates *proxies* in the background."""
    global updater_thread
    stop_proxy_updater()  # tear down any previous worker first
    update_event.clear()
    worker = Thread(target=update_running_proxies, args=(proxies,), daemon=True)
    updater_thread = worker
    worker.start()
|
|
|
|
|
|
def stop_proxy_updater():
    """Ask the updater thread to stop and wait at most 1s for it to exit."""
    global updater_thread
    update_event.set()
    worker = updater_thread
    if worker is not None and worker.is_alive():
        worker.join(timeout=1)
    updater_thread = None
|
|
|
|
def get_random_proxy(timeout=10):
    """Pick a random verified proxy, waiting up to *timeout* seconds.

    Raises RuntimeError when no proxy becomes available in time, or when the
    updater is stopped while we are waiting.
    """
    stop_at = time.time() + timeout
    while time.time() < stop_at:
        with runningproxies_lock:
            candidates = runningproxies[:]
        if candidates:
            return random.choice(candidates)
        # A True return means the updater was stopped — give up early.
        if update_event.wait(1):
            break
    raise RuntimeError("No running proxies available")
|
|
|
|
def get_total_pages(query):
    """Return the number of result pages for *query* on kleinanzeigen.de.

    Falls back to 1 on any network error or unexpected pagination markup.
    """
    # quote_plus: the raw query previously went into the URL unencoded, so
    # spaces, '&', '#' or umlauts produced a wrong request.
    url = f"https://www.kleinanzeigen.de/s-suchanfrage.html?keywords={quote_plus(query)}&page=1"
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.RequestException:
        return 1

    soup = BeautifulSoup(response.text, 'html.parser')
    pagination = soup.find('div', class_='pagination')
    if not pagination:
        return 1
    pages = pagination.find_all('a')
    try:
        # The last link is presumably "next"; the one before it carries the
        # highest page number.  Guard against short or non-numeric link
        # lists, which previously raised IndexError/ValueError and crashed.
        return int(pages[-2].text)
    except (IndexError, ValueError):
        return 1
|
|
|
|
def _parse_price(raw_price):
    """Normalize a raw price string such as '1.234 € VB'.

    Returns (display, value): *display* keeps the historical output format
    ('1.234 €'); *value* is the numeric amount, 0 when unparseable (e.g.
    'Zu verschenken').
    """
    display = raw_price.replace('€', '').replace('VB', '').replace(',', '').strip()
    # German listings appear to use '.' as a thousands separator ('1.234');
    # strip it before the digit check so those prices don't fall back to 0.
    numeric = display.replace('.', '')
    value = float(numeric) if numeric.isdigit() else 0
    return f"{display} €", value


def fetch_page(query, page, unwanted_words, minprice, maxprice, unwanted_prices, use_proxies):
    """Fetch one search-result page and return filtered rows.

    Each row is [title, price_display, link, price_value].  Retries up to 5
    times (optionally through random proxies, falling back to direct
    requests when none is available) and returns [] on total failure.
    """
    # quote_plus: the query previously went into the URL unencoded.
    url = f"https://www.kleinanzeigen.de/s-suchanfrage.html?keywords={quote_plus(query)}&page={page}"
    attempts = 0
    max_attempts = 5
    last_error = None

    while attempts < max_attempts:
        proxies = None
        if use_proxies:
            try:
                proxy = get_random_proxy()
                proxy_type = proxy.split("://")[0]
                proxies = {proxy_type: proxy}
            except RuntimeError as exc:
                # No working proxy: switch to direct requests and retry.
                use_proxies = False
                last_error = exc
                continue

        try:
            response = requests.get(url, proxies=proxies, timeout=10)
            response.raise_for_status()
            break
        except requests.RequestException as exc:
            last_error = exc
            attempts += 1
            time.sleep(1)
    else:
        print(f"Failed to fetch page {page}: {last_error}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    items = []

    for item in soup.find_all('li', class_='ad-listitem'):
        title_tag = item.find('a', class_='ellipsis')
        title = title_tag.text.strip().lower() if title_tag else 'n/a'

        price_tag = item.find('p', class_='aditem-main--middle--price-shipping--price')
        raw_price = price_tag.text.strip() if price_tag else 'n/a'

        if raw_price != 'n/a':
            price, price_value = _parse_price(raw_price)
        else:
            price, price_value = 'n/a', 0

        link = "https://www.kleinanzeigen.de" + title_tag['href'] if title_tag else 'n/a'

        # Compare the RAW price text against unwanted_prices: the old code
        # compared the decorated 'X €' string, so 'Zu verschenken' entries
        # were never filtered out.
        if raw_price in unwanted_prices:
            continue

        # Negotiable ('VB') offers bypass the price bounds.  Check the raw
        # text: the old check ran after replace('VB', '') had already
        # removed the marker and therefore never matched.
        negotiable = 'vb' in raw_price.lower()

        # Skip blank filter words: '' is a substring of every title and
        # would otherwise reject all items.
        if (minprice <= price_value <= maxprice or negotiable) and not any(
            word.lower() in title for word in unwanted_words if word.strip()
        ):
            items.append([title, price, link, price_value])

    return items
|
|
|
|
def search_item(query, unwanted_words, minprice, maxprice, maxpages):
    """Search up to *maxpages* result pages for *query*.

    Fetches pages concurrently (through proxies when any are available),
    then returns the rows sorted by numeric price with duplicate links
    removed (first occurrence wins).
    """
    total_pages = get_total_pages(query)
    page_count = min(total_pages, maxpages)
    proxy_list = load_proxies()
    use_proxies = bool(proxy_list)
    if use_proxies:
        start_proxy_updater(proxy_list)

    results = []
    try:
        with ThreadPoolExecutor(max_workers=10) as pool:
            pending = []
            for page in range(1, page_count + 1):
                pending.append(
                    pool.submit(
                        fetch_page,
                        query,
                        page,
                        unwanted_words,
                        minprice,
                        maxprice,
                        ["Zu verschenken"],
                        use_proxies,
                    )
                )
            for fut in pending:
                results.extend(fut.result())
    finally:
        # Always stop the updater thread, even if a page fetch raised.
        if use_proxies:
            stop_proxy_updater()

    # Sort by the numeric price column.
    results.sort(key=lambda row: row[3])

    # De-duplicate on the link column, keeping the cheapest occurrence.
    seen_links = set()
    deduped = []
    for row in results:
        if row[2] not in seen_links:
            seen_links.add(row[2])
            deduped.append(row)

    return deduped
|
|
|
|
@app.route('/')
def index():
    """Render the search form page."""
    return render_template('index.html')
|
|
|
|
@app.route('/search', methods=['POST'])
def search():
    """Run a search per query line and export the results to an Excel file.

    Form fields: query (newline-separated), unwanted_words, minprice,
    maxprice, maxpages, filename.  Stores progress and the spreadsheet
    path in the session (for /progress and /download) and returns all
    matching rows as JSON.
    """
    # Supply defaults so missing form fields don't raise AttributeError on
    # None.split().
    queries = request.form.get('query', '').split('\n')
    # Drop blank filter words: an empty string is a substring of every
    # title and would filter out all results.
    unwanted_words = [
        word.strip()
        for word in request.form.get('unwanted_words', '').split('\n')
        if word.strip()
    ]
    minprice = float(request.form.get('minprice') or 0)
    maxprice = float(request.form.get('maxprice') or float('inf'))
    maxpages = int(request.form.get('maxpages') or 0)
    filename = request.form.get('filename') or 'kleinanzeigen_results'
    filename = secure_filename(filename)
    if not filename:
        filename = 'kleinanzeigen_results'
    if not filename.lower().endswith('.xlsx'):
        # Append the extension to the user's chosen name (the previous
        # code discarded the name entirely here).
        filename = f"{filename}.xlsx"

    session_progress = session.setdefault('progress', {'current_item': 0, 'total_items': 0})

    all_items = []
    session_progress['total_items'] = len([q for q in queries if q.strip()])

    processed_queries = 0
    for query in queries:
        query = query.strip()
        if query:
            items = search_item(query, unwanted_words, minprice, maxprice, maxpages)
            all_items.extend(items)
            processed_queries += 1
            session_progress['current_item'] = processed_queries
            session['progress'] = session_progress
            session.modified = True
            # Brief pause so the /progress endpoint can observe updates.
            time.sleep(0.1)

    df = pd.DataFrame(all_items, columns=['Artikel', 'Preis', 'Link', 'PriceValue'])
    df = df.drop(columns=['PriceValue'])  # helper column used only for sorting
    os.makedirs('uploads', exist_ok=True)
    filepath = os.path.join('uploads', filename)
    df.to_excel(filepath, index=False)

    session['filepath'] = filepath
    session.modified = True
    return jsonify(all_items)
|
|
|
|
@app.route('/progress')
def progress():
    """Return the current search progress (current_item / total_items) as JSON."""
    return jsonify(session.get('progress', {'current_item': 0, 'total_items': 0}))
|
|
|
|
@app.route('/download')
def download():
    """Send the most recently generated spreadsheet as an attachment."""
    stored_path = session.get('filepath')
    if not stored_path or not os.path.exists(stored_path):
        return "File not found", 404
    return send_file(stored_path, as_attachment=True)
|
|
|
|
if __name__ == '__main__':
    # Ensure the output directory exists before serving requests;
    # exist_ok avoids the racy exists-then-create pattern.
    os.makedirs('uploads', exist_ok=True)
    app.run()
|