Compare commits
1 Commits
main
...
codex/chec
| Author | SHA1 | Date | |
|---|---|---|---|
| 429a003453 |
@@ -4,22 +4,33 @@ from bs4 import BeautifulSoup
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
from concurrent.futures import ThreadPoolExecutor
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
import os
|
import os
|
||||||
import json
|
|
||||||
import time
|
import time
|
||||||
from flask_session import Session
|
from flask_session import Session
|
||||||
import random
|
import random
|
||||||
from threading import Thread, Event
|
from threading import Thread, Event, Lock
|
||||||
|
import secrets
|
||||||
|
from werkzeug.utils import secure_filename
|
||||||
|
|
||||||
app = Flask(__name__)
|
app = Flask(__name__)
|
||||||
app.config['SECRET_KEY'] = 'supersecretkey'
|
app.config['SECRET_KEY'] = os.environ.get('SECRET_KEY', secrets.token_hex(32))
|
||||||
app.config['SESSION_TYPE'] = 'filesystem'
|
app.config['SESSION_TYPE'] = 'filesystem'
|
||||||
Session(app)
|
Session(app)
|
||||||
|
|
||||||
progress_data = {}
|
|
||||||
runningproxies = []
|
runningproxies = []
|
||||||
|
runningproxies_lock = Lock()
|
||||||
update_event = Event()
|
update_event = Event()
|
||||||
|
updater_thread = None
|
||||||
|
|
||||||
|
_proxy_cache = {"proxies": [], "expires_at": 0}
|
||||||
|
_proxy_cache_lock = Lock()
|
||||||
|
PROXY_CACHE_TTL = 600
|
||||||
|
|
||||||
def load_proxies():
|
def load_proxies():
|
||||||
|
current_time = time.time()
|
||||||
|
with _proxy_cache_lock:
|
||||||
|
if _proxy_cache["proxies"] and current_time < _proxy_cache["expires_at"]:
|
||||||
|
return list(_proxy_cache["proxies"])
|
||||||
|
|
||||||
urls = {
|
urls = {
|
||||||
"http": "https://raw.githubusercontent.com/TheSpeedX/SOCKS-List/master/http.txt",
|
"http": "https://raw.githubusercontent.com/TheSpeedX/SOCKS-List/master/http.txt",
|
||||||
"socks4": "https://raw.githubusercontent.com/TheSpeedX/SOCKS-List/master/socks4.txt",
|
"socks4": "https://raw.githubusercontent.com/TheSpeedX/SOCKS-List/master/socks4.txt",
|
||||||
@@ -27,18 +38,31 @@ def load_proxies():
|
|||||||
}
|
}
|
||||||
proxies = []
|
proxies = []
|
||||||
for proxy_type, url in urls.items():
|
for proxy_type, url in urls.items():
|
||||||
response = requests.get(url)
|
try:
|
||||||
if response.status_code == 200:
|
response = requests.get(url, timeout=5)
|
||||||
proxies.extend([f"{proxy_type}://{line.strip()}" for line in response.text.splitlines() if line.strip()])
|
response.raise_for_status()
|
||||||
# print(f"Loaded proxies: {proxies}") # Debugging-Ausgabe
|
except requests.RequestException:
|
||||||
|
continue
|
||||||
|
proxies.extend(
|
||||||
|
[f"{proxy_type}://{line.strip()}" for line in response.text.splitlines() if line.strip()]
|
||||||
|
)
|
||||||
|
|
||||||
|
with _proxy_cache_lock:
|
||||||
|
_proxy_cache["proxies"] = proxies
|
||||||
|
_proxy_cache["expires_at"] = current_time + PROXY_CACHE_TTL
|
||||||
|
|
||||||
return proxies
|
return proxies
|
||||||
|
|
||||||
def check_proxy(proxy):
|
def check_proxy(proxy):
|
||||||
try:
|
try:
|
||||||
proxy_type = proxy.split("://")[0]
|
proxy_type = proxy.split("://")[0]
|
||||||
response = requests.get("https://www.google.com", proxies={proxy_type: proxy}, timeout=1)
|
response = requests.get(
|
||||||
|
"https://www.google.com",
|
||||||
|
proxies={proxy_type: proxy},
|
||||||
|
timeout=5,
|
||||||
|
)
|
||||||
return response.status_code == 200
|
return response.status_code == 200
|
||||||
except:
|
except requests.RequestException:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def update_running_proxies(proxies):
|
def update_running_proxies(proxies):
|
||||||
@@ -50,28 +74,47 @@ def update_running_proxies(proxies):
|
|||||||
break
|
break
|
||||||
if check_proxy(proxy):
|
if check_proxy(proxy):
|
||||||
new_proxies.append(proxy)
|
new_proxies.append(proxy)
|
||||||
runningproxies = new_proxies
|
with runningproxies_lock:
|
||||||
# print(f"Updated running proxies: {runningproxies}") # Debugging-Ausgabe
|
runningproxies = new_proxies
|
||||||
time.sleep(300) # Warte 5 Minuten
|
if update_event.wait(300):
|
||||||
|
break
|
||||||
|
|
||||||
|
|
||||||
def start_proxy_updater(proxies):
|
def start_proxy_updater(proxies):
|
||||||
|
global updater_thread
|
||||||
|
stop_proxy_updater()
|
||||||
|
update_event.clear()
|
||||||
updater_thread = Thread(target=update_running_proxies, args=(proxies,))
|
updater_thread = Thread(target=update_running_proxies, args=(proxies,))
|
||||||
updater_thread.daemon = True
|
updater_thread.daemon = True
|
||||||
updater_thread.start()
|
updater_thread.start()
|
||||||
|
|
||||||
def get_random_proxy():
|
|
||||||
while True:
|
def stop_proxy_updater():
|
||||||
if runningproxies:
|
global updater_thread
|
||||||
proxy = random.choice(runningproxies)
|
update_event.set()
|
||||||
print(f"Selected proxy: {proxy}") # Debugging-Ausgabe
|
if updater_thread and updater_thread.is_alive():
|
||||||
return proxy
|
updater_thread.join(timeout=1)
|
||||||
else:
|
updater_thread = None
|
||||||
print("No running proxies available.") # Debugging-Ausgabe
|
|
||||||
time.sleep(1) # Warte kurz, bevor erneut versucht wird
|
def get_random_proxy(timeout=10):
|
||||||
|
deadline = time.time() + timeout
|
||||||
|
while time.time() < deadline:
|
||||||
|
with runningproxies_lock:
|
||||||
|
available = list(runningproxies)
|
||||||
|
if available:
|
||||||
|
return random.choice(available)
|
||||||
|
if update_event.wait(1):
|
||||||
|
break
|
||||||
|
raise RuntimeError("No running proxies available")
|
||||||
|
|
||||||
def get_total_pages(query):
|
def get_total_pages(query):
|
||||||
url = f"https://www.kleinanzeigen.de/s-suchanfrage.html?keywords={query}&page=1"
|
url = f"https://www.kleinanzeigen.de/s-suchanfrage.html?keywords={query}&page=1"
|
||||||
response = requests.get(url)
|
try:
|
||||||
|
response = requests.get(url, timeout=10)
|
||||||
|
response.raise_for_status()
|
||||||
|
except requests.RequestException:
|
||||||
|
return 1
|
||||||
|
|
||||||
soup = BeautifulSoup(response.text, 'html.parser')
|
soup = BeautifulSoup(response.text, 'html.parser')
|
||||||
pagination = soup.find('div', class_='pagination')
|
pagination = soup.find('div', class_='pagination')
|
||||||
if pagination:
|
if pagination:
|
||||||
@@ -81,19 +124,35 @@ def get_total_pages(query):
|
|||||||
total_pages = 1
|
total_pages = 1
|
||||||
return total_pages
|
return total_pages
|
||||||
|
|
||||||
def fetch_page(query, page, unwanted_words, minprice, maxprice, unwanted_prices, proxies):
|
def fetch_page(query, page, unwanted_words, minprice, maxprice, unwanted_prices, use_proxies):
|
||||||
while True:
|
url = f"https://www.kleinanzeigen.de/s-suchanfrage.html?keywords={query}&page={page}"
|
||||||
proxy = get_random_proxy()
|
attempts = 0
|
||||||
|
max_attempts = 5
|
||||||
|
last_error = None
|
||||||
|
|
||||||
|
while attempts < max_attempts:
|
||||||
|
proxies = None
|
||||||
|
if use_proxies:
|
||||||
|
try:
|
||||||
|
proxy = get_random_proxy()
|
||||||
|
proxy_type = proxy.split("://")[0]
|
||||||
|
proxies = {proxy_type: proxy}
|
||||||
|
except RuntimeError as exc:
|
||||||
|
use_proxies = False
|
||||||
|
last_error = exc
|
||||||
|
continue
|
||||||
|
|
||||||
try:
|
try:
|
||||||
proxy_type = proxy.split("://")[0]
|
response = requests.get(url, proxies=proxies, timeout=10)
|
||||||
url = f"https://www.kleinanzeigen.de/s-suchanfrage.html?keywords={query}&page={page}"
|
|
||||||
response = requests.get(url, proxies={proxy_type: proxy})
|
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
print(f"Successfully fetched page {page} using proxy {proxy}.") # Debugging-Ausgabe
|
|
||||||
break
|
break
|
||||||
except requests.exceptions.RequestException as e:
|
except requests.RequestException as exc:
|
||||||
print(f"Failed to fetch page {page} using proxy {proxy}. Error: {e}") # Debugging-Ausgabe
|
last_error = exc
|
||||||
continue
|
attempts += 1
|
||||||
|
time.sleep(1)
|
||||||
|
else:
|
||||||
|
print(f"Failed to fetch page {page}: {last_error}")
|
||||||
|
return []
|
||||||
|
|
||||||
soup = BeautifulSoup(response.text, 'html.parser')
|
soup = BeautifulSoup(response.text, 'html.parser')
|
||||||
items = []
|
items = []
|
||||||
@@ -124,12 +183,30 @@ def search_item(query, unwanted_words, minprice, maxprice, maxpages):
|
|||||||
total_pages = get_total_pages(query)
|
total_pages = get_total_pages(query)
|
||||||
pages_to_search = min(total_pages, maxpages)
|
pages_to_search = min(total_pages, maxpages)
|
||||||
proxies = load_proxies()
|
proxies = load_proxies()
|
||||||
start_proxy_updater(proxies)
|
use_proxies = bool(proxies)
|
||||||
|
if use_proxies:
|
||||||
|
start_proxy_updater(proxies)
|
||||||
|
|
||||||
with ThreadPoolExecutor(max_workers=10) as executor:
|
try:
|
||||||
futures = [executor.submit(fetch_page, query, page, unwanted_words, minprice, maxprice, ["Zu verschenken"], proxies) for page in range(1, pages_to_search + 1)]
|
with ThreadPoolExecutor(max_workers=10) as executor:
|
||||||
for future in futures:
|
futures = [
|
||||||
items.extend(future.result())
|
executor.submit(
|
||||||
|
fetch_page,
|
||||||
|
query,
|
||||||
|
page,
|
||||||
|
unwanted_words,
|
||||||
|
minprice,
|
||||||
|
maxprice,
|
||||||
|
["Zu verschenken"],
|
||||||
|
use_proxies,
|
||||||
|
)
|
||||||
|
for page in range(1, pages_to_search + 1)
|
||||||
|
]
|
||||||
|
for future in futures:
|
||||||
|
items.extend(future.result())
|
||||||
|
finally:
|
||||||
|
if use_proxies:
|
||||||
|
stop_proxy_updater()
|
||||||
|
|
||||||
# Ergebnisse nach Preis sortieren
|
# Ergebnisse nach Preis sortieren
|
||||||
items.sort(key=lambda x: x[3])
|
items.sort(key=lambda x: x[3])
|
||||||
@@ -156,34 +233,42 @@ def search():
|
|||||||
maxprice = float(request.form.get('maxprice') or float('inf'))
|
maxprice = float(request.form.get('maxprice') or float('inf'))
|
||||||
maxpages = int(request.form.get('maxpages') or 0)
|
maxpages = int(request.form.get('maxpages') or 0)
|
||||||
filename = request.form.get('filename') or 'kleinanzeigen_results'
|
filename = request.form.get('filename') or 'kleinanzeigen_results'
|
||||||
|
filename = secure_filename(filename)
|
||||||
|
if not filename:
|
||||||
|
filename = 'kleinanzeigen_results'
|
||||||
|
if not filename.lower().endswith('.xlsx'):
|
||||||
|
filename = f"{filename}.xlsx"
|
||||||
|
|
||||||
session_id = request.cookies.get('session')
|
session_progress = session.setdefault('progress', {'current_item': 0, 'total_items': 0})
|
||||||
if session_id not in progress_data:
|
|
||||||
progress_data[session_id] = {'current_item': 0, 'total_items': 0}
|
|
||||||
|
|
||||||
all_items = []
|
all_items = []
|
||||||
progress_data[session_id]['total_items'] = len(queries)
|
session_progress['total_items'] = len([q for q in queries if q.strip()])
|
||||||
|
|
||||||
for i, query in enumerate(queries):
|
processed_queries = 0
|
||||||
|
for query in queries:
|
||||||
query = query.strip()
|
query = query.strip()
|
||||||
if query:
|
if query:
|
||||||
items = search_item(query, unwanted_words, minprice, maxprice, maxpages)
|
items = search_item(query, unwanted_words, minprice, maxprice, maxpages)
|
||||||
all_items.extend(items)
|
all_items.extend(items)
|
||||||
progress_data[session_id]['current_item'] = i + 1
|
processed_queries += 1
|
||||||
|
session_progress['current_item'] = processed_queries
|
||||||
|
session['progress'] = session_progress
|
||||||
|
session.modified = True
|
||||||
time.sleep(0.1) # Füge eine kurze Verzögerung hinzu, um die Fortschrittsanzeige zu aktualisieren
|
time.sleep(0.1) # Füge eine kurze Verzögerung hinzu, um die Fortschrittsanzeige zu aktualisieren
|
||||||
|
|
||||||
df = pd.DataFrame(all_items, columns=['Artikel', 'Preis', 'Link', 'PriceValue'])
|
df = pd.DataFrame(all_items, columns=['Artikel', 'Preis', 'Link', 'PriceValue'])
|
||||||
df = df.drop(columns=['PriceValue'])
|
df = df.drop(columns=['PriceValue'])
|
||||||
filepath = f'uploads/{filename}.xlsx'
|
os.makedirs('uploads', exist_ok=True)
|
||||||
|
filepath = os.path.join('uploads', filename)
|
||||||
df.to_excel(filepath, index=False)
|
df.to_excel(filepath, index=False)
|
||||||
|
|
||||||
session['filepath'] = filepath
|
session['filepath'] = filepath
|
||||||
|
session.modified = True
|
||||||
return jsonify(all_items)
|
return jsonify(all_items)
|
||||||
|
|
||||||
@app.route('/progress')
|
@app.route('/progress')
|
||||||
def progress():
|
def progress():
|
||||||
session_id = request.cookies.get('session')
|
return jsonify(session.get('progress', {'current_item': 0, 'total_items': 0}))
|
||||||
return jsonify(progress_data.get(session_id, {'current_item': 0, 'total_items': 0}))
|
|
||||||
|
|
||||||
@app.route('/download')
|
@app.route('/download')
|
||||||
def download():
|
def download():
|
||||||
@@ -195,4 +280,4 @@ def download():
|
|||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
if not os.path.exists('uploads'):
|
if not os.path.exists('uploads'):
|
||||||
os.makedirs('uploads')
|
os.makedirs('uploads')
|
||||||
app.run(debug=True)
|
app.run()
|
||||||
|
|||||||
Reference in New Issue
Block a user