diff --git a/requirements.txt b/requirements.txt index 7593449..1e2eb4b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,3 +7,4 @@ requests==2.32.3 soupsieve==2.6 typing_extensions==4.12.2 urllib3==2.3.0 +playwright==1.45.0 diff --git a/src/bgl/bgl.py b/src/bgl/bgl.py index f361e49..f44880a 100644 --- a/src/bgl/bgl.py +++ b/src/bgl/bgl.py @@ -34,7 +34,8 @@ def fetch_all_properties(): 'Upgrade-Insecure-Requests': '1', 'Cache-Control': 'max-age=0', } - + # debug print url + # print(url) response = requests.request("GET", url, headers=headers) soup = BeautifulSoup(response.text, "html.parser") diff --git a/src/lwb/lwb_form.xml b/src/lwb/lwb_form.xml index b638942..14a921c 100644 --- a/src/lwb/lwb_form.xml +++ b/src/lwb/lwb_form.xml @@ -1,217 +1 @@ - - -
\ No newline at end of file +\n\r \ No newline at end of file diff --git a/src/lwb/scrape_image.py b/src/lwb/scrape_image.py index 60fccff..f4d629e 100644 --- a/src/lwb/scrape_image.py +++ b/src/lwb/scrape_image.py @@ -1,5 +1,5 @@ import requests -from src.lwb.scraper import EASYSQUARE_HEADERS, EASYSQUARE_PARAMS +from src.lwb.scraper import EASYSQUARE_HEADERS def scrape_image(url, owner): session = requests.Session() @@ -19,10 +19,14 @@ def scrape_image(url, owner): # return empty image return b'' - if owner == "LWB": - response = session.get(url, headers=EASYSQUARE_HEADERS, params=EASYSQUARE_PARAMS) + if owner == "LWB": + # Image URL already carries the required params; only send headers + response = session.get(url, headers=EASYSQUARE_HEADERS) if response.status_code != 200: - print(f"Fehler beim Abrufen von Easysquare: {response.status_code}") + print( + f"Fehler beim Abrufen von Easysquare: " + f"{response.status_code}" + ) # return empty image return b'' @@ -30,8 +34,10 @@ def scrape_image(url, owner): if response is None: response = session.get(url) if response.status_code != 200: - print(f"Fehler beim Abrufen der Standardquelle: {response.status_code}") + print( + f"Fehler beim Abrufen der Standardquelle: " + f"{response.status_code}" + ) return b'' - - + return response.content diff --git a/src/lwb/scraper.py b/src/lwb/scraper.py index 7f41ffe..5c8f3fc 100644 --- a/src/lwb/scraper.py +++ b/src/lwb/scraper.py @@ -3,81 +3,136 @@ import xml.etree.ElementTree as ET import src.lwb.format as format import hashlib import os +import time from dotenv import load_dotenv load_dotenv() -SESSION_CREATE_URL = "https://portal1s.easysquare.com/meinelwb/index.html?deeplink=%2FESQ_IA_REOBJ%2FESQ_VM_REOBJ_ALL" +SESSION_CREATE_URL = ( + "https://portal1s.easysquare.com/meinelwb/index.html" + "?deeplink=%2FESQ_IA_REOBJ%2FESQ_VM_REOBJ_ALL" +) +# Session tokens (from .env as fallback; will be refreshed dynamically) SAP_SESSIONID = os.getenv("SAP_SESSIONID") COOKIE_SESSION = 
os.getenv("COOKIE_SESSION") EASYSQUARE_URL = "https://portal1s.easysquare.com/prorex/xmlforms" + +# Mutable headers so imports see updates when we refresh session values EASYSQUARE_HEADERS = { "DNT": "1", "Host": "portal1s.easysquare.com", - "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", - "Cookie": f"SAP_SESSIONID_PP0_581={SAP_SESSIONID}; sap-usercontext=sap-language=D&sap-client=581; cookiesession1={COOKIE_SESSION}", - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:134.0) Gecko/20100101 Firefox/134.0", + "Accept": ( + "text/html,application/xhtml+xml," + "application/xml;q=0.9,*/*;q=0.8" + ), + # Cookie gets filled by _update_cookie_header() + "Cookie": "", + "User-Agent": ( + "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:135.0) " + "Gecko/20100101 Firefox/135.0" + ), "Accept-Encoding": "gzip, deflate, br, zstd", "Accept-Language": "de,en-US;q=0.7,en;q=0.3", - "Upgrade-Insecure-Requests": "1" + "Upgrade-Insecure-Requests": "1", + # Helpful headers observed from network calls + "X-Requested-With": "XMLHttpRequest", + "Referer": SESSION_CREATE_URL, } EASYSQUARE_PARAMS = { "application": "ESQ_IA_REOBJ", "sap-client": "581", "command": "action", "name": "boxlist", - "api": "6.169", - "head-oppc-version": "6.169.22", - "_": "1736761256321" + "api": "6.249", + "head-oppc-version": "6.249.1", + # dynamic '_' gets applied at request time; leave placeholder + "_": "1755259702945", } -SETUP_QUERY_PARAMS_URL = "https://portal1s.easysquare.com/prorex/xmlforms?application=ESQ_IA_REOBJ&sap-client=581&command=action&name=save&id=4B76A3C8-3E4D-4217-B54A-7C28C920748C&api=6.169&head-oppc-version=6.169.22&originalId=842F0073-DC21-A841-4E80-B1BD5E404E35&resourceOrigin=form" +SETUP_QUERY_PARAMS_URL = ( + "https://portal1s.easysquare.com/prorex/xmlforms" + "?application=ESQ_IA_REOBJ" + "&sap-client=581" + "&command=action" + "&name=save" + "&id=E3920A27-432A-4127-96FC-6433ED32FDDE" + "&api=6.249" + "&head-oppc-version=6.249.1" + 
"&originalId=3C9DAA99-1C5D-4810-5B5E-AFE704639EF5" + "&resourceOrigin=form" +) -# curl --location 'https://portal1s.easysquare.com/prorex/xmlforms?application=ESQ_IA_REOBJ&sap-client=581&command=action&name=boxlist&api=6.169&head-oppc-version=6.169.22&_=1736761255682' \ -# --header 'DNT: 1' \ -# --header 'UTC: 1736761256321' \ -# --header 'Host: portal1s.easysquare.com' \ -# --header 'host: portal1s.easysquare.com' \ -# --header 'Accept: text/plain, */*; q=0.01' \ -# --header 'Cookie: cookiesession1=678ADA67ADF0813997206FE9F4133118; sap-usercontext=sap-language=de&sap-client=581; SAP_SESSIONID_PP0_581=Vg3w4pn8whD76BldaU2wvP-YzyrRkRHvhWoKELG5Agg%3d' \ -# --header 'Referer: https://portal1s.easysquare.com/meinelwb/index.html?deeplink=%2FESQ_IA_REOBJ%2FESQ_VM_REOBJ_ALL' \ -# --header 'Sec-GPC: 1' \ -# --header 'oppc-id: D9925A2D-4ED9-4911-8AD3-2626DA41FBB0' \ -# --header 'Connection: keep-alive' \ -# --header 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:134.0) Gecko/20100101 Firefox/134.0' \ -# --header 'Content-Type: text/plain;charset=UTF-8' \ -# --header 'Sec-Fetch-Dest: empty' \ -# --header 'Sec-Fetch-Mode: cors' \ -# --header 'Sec-Fetch-Site: same-origin' \ -# --header 'Accept-Encoding: gzip, deflate, br, zstd' \ -# --header 'Accept-Language: de,en-US;q=0.7,en;q=0.3' \ -# --header 'X-Requested-With: XMLHttpRequest' +# Example curl reference omitted for brevity. 
-# setup query params for lwb session -def setup_query_params(): +def _now_ms() -> str: + return str(int(time.time() * 1000)) - # request this url with POST an xml form - # load xml form from file - xml_form = "" - with open("src/lwb/lwb_form.xml", "r") as file: - xml_form = file.read() - - # post xml form to SETUP_QUERY_PARAMS_URL - response = requests.post(SETUP_QUERY_PARAMS_URL, data=xml_form, headers=EASYSQUARE_HEADERS) +def _update_cookie_header(): + """Update Cookie header so other modules can see fresh values.""" + cookie = [] + if SAP_SESSIONID: + cookie.append(f"SAP_SESSIONID_PP0_581={SAP_SESSIONID}") + cookie.append("sap-usercontext=sap-language=D&sap-client=581") + if COOKIE_SESSION: + cookie.append(f"cookiesession1={COOKIE_SESSION}") + EASYSQUARE_HEADERS["Cookie"] = "; ".join(cookie) - if response.status_code != 200: - print(f"Fehler beim Abrufen von Easysquare: {response.status_code}") - return [] - - print(response.content) - return response.content +# initialize cookie header from any .env-provided values +_update_cookie_header() + + +def _fetch_csrf_token(session: requests.Session) -> str: + """Fetch an X-CSRF-Token by calling the list endpoint with 'fetch'.""" + _update_cookie_header() + headers = dict(EASYSQUARE_HEADERS) + headers["X-CSRF-Token"] = "fetch" + headers["Accept"] = "text/plain, */*; q=0.01" + headers["UTC"] = _now_ms() + params = dict(EASYSQUARE_PARAMS) + params["_"] = _now_ms() + r = session.get(EASYSQUARE_URL, headers=headers, params=params) + return r.headers.get("x-csrf-token") or r.headers.get("X-CSRF-Token") or "" + + +# Set up query params for the LWB session (drives the UI via Playwright) +def setup_query_params(headless: bool = True) -> bool: + """Initialize search parameters via Playwright by driving the UI. + + Replaces the previous XML POST approach.
It will: + - click "Ich suche eine Wohnung" + - open the "Immobiliensuche" card and click "MEHR ANZEIGEN" + - set "Maximale Trefferanzahl" to 1000 + - click "Suchen" + Updates in-memory cookies if Playwright yields fresher values. + Returns True if the flow ran; False otherwise. + """ + try: + from src.lwb.session_bootstrap import apply_search_via_ui + except Exception as e: + print(f"Playwright-Setup nicht verfügbar: {e}") + return False + + try: + vals = apply_search_via_ui(headless=headless, save_to_env=False) + global COOKIE_SESSION, SAP_SESSIONID + if vals.get("COOKIE_SESSION"): + COOKIE_SESSION = vals["COOKIE_SESSION"] + if vals.get("SAP_SESSIONID"): + SAP_SESSIONID = vals["SAP_SESSIONID"] + _update_cookie_header() + return True + except Exception as e: + print(f"Fehler beim Setzen der Suchparameter per UI: {e}") + return False + +# Call Session Create and get the session from the response cookies + -# Call Session Create and get the session from teh response cookies def create_session(): # request url with chromium browser and get the cookies session = requests.Session() @@ -86,91 +141,142 @@ def create_session(): if response.status_code != 200: print(f"Fehler Session von Easysquare: {response.status_code}") return [] - + # get the cookies from the response cookies = response.cookies + global COOKIE_SESSION COOKIE_SESSION = cookies.get("cookiesession1") - print(COOKIE_SESSION) - - - url = "https://portal1s.easysquare.com/meinelwb/api5/authenticate?api=6.169&sap-language=de" + _update_cookie_header() + url = ( + "https://portal1s.easysquare.com/meinelwb/api5/authenticate" + "?api=6.169&sap-language=de" + ) payload = { 'sap-field_b64': "dXNlcj1ERU1PJnBhc3N3b3JkPXByb21vczE2" } + headers = { - 'DNT': '1', - 'UTC': '1738713279005', - 'Host': 'portal1s.easysquare.com', - 'host': 'portal1s.easysquare.com', - 'Accept': 'text/html, */*; q=0.01', - 'Cookie': f'esq-alias=%2fmeinelwb; sap-usercontext=sap-language=de&sap-client=581; cookiesession1={COOKIE_SESSION}', - 
'Origin': 'https://portal1s.easysquare.com', - 'Referer': 'https://portal1s.easysquare.com/meinelwb/index.html?deeplink=%2FESQ_IA_REOBJ%2FESQ_VM_REOBJ_ALL', - 'Sec-GPC': '1', - 'Connection': 'keep-alive', - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:135.0) Gecko/20100101 Firefox/135.0', - 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', - 'X-CSRF-Token': 'fetch', - 'Sec-Fetch-Dest': 'empty', - 'Sec-Fetch-Mode': 'cors', - 'Sec-Fetch-Site': 'same-origin', - 'Accept-Encoding': 'gzip, deflate, br, zstd', - 'Accept-Language': 'de,en-US;q=0.7,en;q=0.3', - 'X-Requested-With': 'XMLHttpRequest' + 'DNT': '1', + 'UTC': _now_ms(), + 'Host': 'portal1s.easysquare.com', + 'host': 'portal1s.easysquare.com', + 'Accept': 'text/html, */*; q=0.01', + 'Cookie': ( + 'esq-alias=%2fmeinelwb; ' + 'sap-usercontext=sap-language=de&sap-client=581; ' + f'cookiesession1={COOKIE_SESSION}' + ), + 'Origin': 'https://portal1s.easysquare.com', + 'Referer': SESSION_CREATE_URL, + 'Sec-GPC': '1', + 'Connection': 'keep-alive', + 'User-Agent': EASYSQUARE_HEADERS['User-Agent'], + 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', + 'X-CSRF-Token': 'fetch', + 'Sec-Fetch-Dest': 'empty', + 'Sec-Fetch-Mode': 'cors', + 'Sec-Fetch-Site': 'same-origin', + 'Accept-Encoding': 'gzip, deflate, br, zstd', + 'Accept-Language': 'de,en-US;q=0.7,en;q=0.3', + 'X-Requested-With': 'XMLHttpRequest' } - print(headers) - - response = requests.request("POST", url, headers=headers, data=payload) - - print(response.text) + response = requests.request( + "POST", + url, + headers=headers, + data=payload, + ) if response.status_code != 200: - print(f"Fehler beim Session Erstellen via Easysquare: {response.status_code}") + print( + f"Fehler beim Session Erstellen via Easysquare: " + f"{response.status_code}" + ) return [] - + # get the cookies from the response cookies = response.cookies global SAP_SESSIONID SAP_SESSIONID = cookies.get("SAP_SESSIONID_PP0_581") - - 
print(SAP_SESSIONID) + _update_cookie_header() + print(f"SAP_SESSIONID_PP0_581: {SAP_SESSIONID}") - # Funktion: Scrape von Easysquare +def _ensure_session_ready(): + # If missing tokens, try to create session + global COOKIE_SESSION, SAP_SESSIONID + if not COOKIE_SESSION or not SAP_SESSIONID: + # First try headless browser bootstrap for robust cookie capture + try: + from src.lwb.session_bootstrap import fetch_session + + vals = fetch_session(headless=True, save_to_env=False) + got_cookie = vals.get("COOKIE_SESSION") + got_sap = vals.get("SAP_SESSIONID") + if got_cookie: + COOKIE_SESSION = got_cookie + if got_sap: + SAP_SESSIONID = got_sap + _update_cookie_header() + except Exception: + # Fallback to legacy request-based bootstrap + create_session() + + def scrape_easysquare(): + _ensure_session_ready() + + # Drive the search UI via Playwright so the listing context is ready + try: + setup_query_params() + except Exception as e: + # non-fatal; we'll still attempt list fetch + print(f"Warnung: Setup der Suchparameter fehlgeschlagen: {e}") + + # Build params with fresh timestamp + params = dict(EASYSQUARE_PARAMS) + params["_"] = _now_ms() + session = requests.Session() - response = session.get(EASYSQUARE_URL, headers=EASYSQUARE_HEADERS, params=EASYSQUARE_PARAMS) - + response = session.get( + EASYSQUARE_URL, + headers=EASYSQUARE_HEADERS, + params=params, + ) + if response.status_code != 200: print(f"Fehler beim Abrufen von Easysquare: {response.status_code}") - # print("Versuche Session zu erstellen") - # create_session() return [] # XML-Daten parsen root = ET.fromstring(response.content) namespace = {"ns": "http://www.openpromos.com/OPPC/XMLForms"} - + properties = [] for head in root.findall(".//ns:head", namespace): - prop_title = head.find("ns:title", namespace).text - subtitle = head.find("ns:subtitle", namespace).text - abstract = head.find("ns:abstract", namespace).text.strip() + title_el = head.find("ns:title", namespace) + subtitle_el = 
head.find("ns:subtitle", namespace) + abstract_el = head.find("ns:abstract", namespace) + prop_title = title_el.text if title_el is not None else "" + subtitle = subtitle_el.text if subtitle_el is not None else "" + abstract = ( + abstract_el.text.strip() + if abstract_el is not None and abstract_el.text + else "" + ) - # get adress lat and long - # - - adress = head.find("ns:address", namespace) - lat = adress.get("lat") - lon = adress.get("lon") + # get address lat/lon + address_el = head.find("ns:address", namespace) + lat = address_el.get("lat") if address_el is not None else "" + lon = address_el.get("lon") if address_el is not None else "" image = head.find("ns:image", namespace) - iamge_resourceId = image.get("resourceId") - - id = head.find("ns:id", namespace).text + image_resource_id = ( + image.get("resourceId") if image is not None else "" + ) # Details extrahieren rooms = "N/A" @@ -191,28 +297,42 @@ def scrape_easysquare(): availability = value # link create google maps link with lat and long - link = f"https://www.google.com/maps/search/?api=1&query={lat},{lon}" + link = ( + f"https://www.google.com/maps/search/?api=1&query={lat},{lon}" + if lat and lon + else "" + ) - # https://portal1s.easysquare.com/prorex/xmlforms/image.jpg?application=ESQ_IA_REOBJ&command=action&id=1EC8D4E6-191A-A827-47FF-72D8C5379070&name=get - image_url = f"https://portal1s.easysquare.com/prorex/xmlforms/image.jpg?application=ESQ_IA_REOBJ&command=action&id={iamge_resourceId}&name=get" + # image url for listing + base_img = "https://portal1s.easysquare.com/prorex/xmlforms/image.jpg" + image_url = ( + f"{base_img}?application=ESQ_IA_REOBJ&command=action" + f"&id={image_resource_id}&name=get" + if image_resource_id + else "" + ) - # the id should be a hash create by the title, subtitle, rooms, size, rent, availability - hashID = f"{prop_title}{subtitle}{rooms}{size}{rent}{availability}" - id = hashlib.sha256(hashID.encode('utf-8')).hexdigest() + # Hash from title, subtitle, rooms, 
size, rent, availability + hash_id = ( + f"{prop_title}{subtitle}{rooms}{size}{rent}{availability}" + ) + prop_id = hashlib.sha256(hash_id.encode('utf-8')).hexdigest() - properties.append({ - "id": id, - "title": "LWB - " + prop_title, - "subtitle": subtitle, - "rooms": format.format_room(rooms), - "size": format.format_roomSize(size), - "rent": format.format_money(rent), - "link": link, - "abstract": abstract, - "warm_rent": "", - "availability": format.format_date(availability), - "image_url": image_url, - "owner": "LWB", - }) + properties.append( + { + "id": prop_id, + "title": f"LWB - {prop_title}", + "subtitle": subtitle, + "rooms": format.format_room(rooms), + "size": format.format_roomSize(size), + "rent": format.format_money(rent), + "link": link, + "abstract": abstract, + "warm_rent": "", + "availability": format.format_date(availability), + "image_url": image_url, + "owner": "LWB", + } + ) - return properties \ No newline at end of file + return properties diff --git a/src/lwb/session_bootstrap.py b/src/lwb/session_bootstrap.py new file mode 100644 index 0000000..e56f403 --- /dev/null +++ b/src/lwb/session_bootstrap.py @@ -0,0 +1,341 @@ +import os +import sys +import time +from typing import Dict, Optional + +from dotenv import load_dotenv + + +SESSION_CREATE_URL = ( + "https://portal1s.easysquare.com/meinelwb/index.html" + "?deeplink=%2FESQ_IA_REOBJ%2FESQ_VM_REOBJ_ALL" +) + + +def _update_env_file(values: Dict[str, str], env_path: str = ".env") -> None: + """Create or update .env with given key/value pairs.""" + existing = {} + if os.path.exists(env_path): + with open(env_path, "r", encoding="utf-8") as f: + for line in f: + line = line.strip() + if not line or line.startswith("#") or "=" not in line: + continue + k, v = line.split("=", 1) + existing[k] = v + existing.update(values) + lines = [f"{k}={v}\n" for k, v in existing.items()] + with open(env_path, "w", encoding="utf-8") as f: + f.writelines(lines) + + +def fetch_session( + headless: bool = 
True, + save_to_env: bool = True, +) -> Dict[str, Optional[str]]: + """Open the LWB portal in a headless browser and extract cookies. + + Returns a dict with keys: COOKIE_SESSION, SAP_SESSIONID. + When save_to_env=True, writes/updates these in .env (relative to the cwd). + """ + try: + from playwright.sync_api import sync_playwright + except Exception: # pragma: no cover - dependency missing + print( + "Playwright not installed. Install: pip install playwright " + "&& python -m playwright install" + ) + raise + + load_dotenv() + result: Dict[str, Optional[str]] = { + "COOKIE_SESSION": None, + "SAP_SESSIONID": None, + } + + with sync_playwright() as p: + browser = p.chromium.launch(headless=headless) + context = browser.new_context() + page = context.new_page() + page.goto(SESSION_CREATE_URL, wait_until="networkidle") + + # Click the entry button: "Ich suche eine Wohnung" + try: + page.get_by_role( + "button", name="Ich suche eine Wohnung" + ).click(timeout=3000) + except Exception: + try: + page.get_by_role( + "link", name="Ich suche eine Wohnung" + ).click(timeout=3000) + except Exception: + try: + page.get_by_text( + "Ich suche eine Wohnung", exact=True + ).click(timeout=3000) + except Exception: + pass + + # Let the portal finish any navigation after the click + try: + page.wait_for_load_state("networkidle", timeout=5000) + except Exception: + pass + + # Try to read cookiesession1 quickly (set by the portal) + cookie_session = None + sap_session = None + for _ in range(10): + cookies = {c["name"]: c for c in context.cookies()} + if not cookie_session and "cookiesession1" in cookies: + cookie_session = cookies["cookiesession1"]["value"] + if not sap_session and "SAP_SESSIONID_PP0_581" in cookies: + sap_session = cookies["SAP_SESSIONID_PP0_581"]["value"] + if cookie_session and sap_session: + break + time.sleep(0.5) + + # If SAP session not present, trigger authenticate from the page context + if not sap_session: + try: + js_lines = [ + "async () => {", + " const base = 
'/meinelwb/api5/authenticate';", + " const url = base + '?api=6.169&sap-language=de';", + " const body = new URLSearchParams({", + " 'sap-field_b64':", + " 'dXNlcj1ERU1PJnBhc3N3b3JkPXByb21vczE2'", + " }).toString();", + " await fetch(url, {", + " method: 'POST',", + " headers: {", + " 'Content-Type':", + " 'application/x-www-form-urlencoded;' +", + " ' charset=UTF-8',", + " 'X-CSRF-Token': 'fetch',", + " 'X-Requested-With': 'XMLHttpRequest'", + " },", + " body", + " });", + " return true;", + "}", + ] + page.evaluate("\n".join(js_lines)) + except Exception: + pass + + # Re-check cookies for SAP session + for _ in range(10): + cookies = {c["name"]: c for c in context.cookies()} + if "SAP_SESSIONID_PP0_581" in cookies: + sap_session = cookies["SAP_SESSIONID_PP0_581"]["value"] + break + time.sleep(0.5) + + # Best-effort read of localStorage (useful for debugging) + try: + storage_dump = page.evaluate( + "() => JSON.stringify(window.localStorage)" + ) + if storage_dump and len(storage_dump) > 2: + pass # Not strictly needed; kept for debugging future issues + except Exception: + pass + + result["COOKIE_SESSION"] = cookie_session + result["SAP_SESSIONID"] = sap_session + + browser.close() + + if save_to_env and (result["COOKIE_SESSION"] or result["SAP_SESSIONID"]): + to_write = {} + if result["COOKIE_SESSION"]: + to_write["COOKIE_SESSION"] = result["COOKIE_SESSION"] + if result["SAP_SESSIONID"]: + to_write["SAP_SESSIONID"] = result["SAP_SESSIONID"] + if to_write: + _update_env_file(to_write) + + return result + + +def apply_search_via_ui( + headless: bool = True, + save_to_env: bool = True, +) -> Dict[str, Optional[str]]: + """Drive the UI to initialize the search context. + + Steps: + - Open portal, click "Ich suche eine Wohnung". + - In services section with title "Immobiliensuche", click "MEHR ANZEIGEN". + - Set "Maximale Trefferanzahl" to 1000 and click "Suchen". + Returns latest cookies (COOKIE_SESSION, SAP_SESSIONID) and optionally + writes them to .env. 
+ """ + try: + from playwright.sync_api import sync_playwright + except Exception: # pragma: no cover + print( + "Playwright not installed. Install: pip install playwright " + "&& python -m playwright install" + ) + raise + + load_dotenv() + result: Dict[str, Optional[str]] = { + "COOKIE_SESSION": None, + "SAP_SESSIONID": None, + } + + with sync_playwright() as p: + browser = p.chromium.launch(headless=headless) + context = browser.new_context() + page = context.new_page() + page.goto(SESSION_CREATE_URL, wait_until="networkidle") + + # Click entry button (robust tries) + clicked = False + for sel in [ + lambda: page.get_by_role( + "button", name="Ich suche eine Wohnung" + ).click(timeout=3000), + lambda: page.get_by_role( + "link", name="Ich suche eine Wohnung" + ).click(timeout=3000), + lambda: page.get_by_text( + "Ich suche eine Wohnung", exact=True + ).click(timeout=3000), + ]: + try: + sel() + clicked = True + break + except Exception: + pass + if clicked: + try: + page.wait_for_load_state("networkidle", timeout=5000) + except Exception: + pass + + # Click MEHR ANZEIGEN in the Immobiliensuche container + try: + container = page.locator( + "div.easy-services-service-container" + ).filter(has_text="Immobiliensuche") + container.get_by_role( + "button", name="MEHR ANZEIGEN" + ).first.click(timeout=5000) + except Exception: + try: + page.get_by_role("button", name="MEHR ANZEIGEN").click( + timeout=5000 + ) + except Exception: + pass + + # Fill "Maximale Trefferanzahl" to 1000 + def _fill_max_results(): + # Try common label variations + for label in ( + "Maximale Trefferanzahl", + "Maximale Trefferzahl", + "Maximale\u00A0Trefferanzahl", + "Maximale\u00A0Trefferzahl", + ): + try: + page.get_by_label(label).fill("1000", timeout=3000) + return True + except Exception: + continue + # Fallback: find inputs near the label text + try: + near = page.get_by_text("Maximale") + near_locator = near.locator( + "xpath=following::input[1]" + ) + near_locator.fill("1000", 
timeout=3000) + return True + except Exception: + return False + + try: + _fill_max_results() + except Exception: + pass + + # Click Suchen + for name in ["Suchen", "SUCHEN"]: + try: + page.get_by_role("button", name=name).click(timeout=4000) + break + except Exception: + try: + page.get_by_text(name, exact=True).click(timeout=4000) + break + except Exception: + continue + + try: + page.wait_for_load_state("networkidle", timeout=6000) + except Exception: + pass + + # Collect cookies + cookies = {c["name"]: c for c in context.cookies()} + if "cookiesession1" in cookies: + result["COOKIE_SESSION"] = cookies["cookiesession1"].get( + "value" + ) + if "SAP_SESSIONID_PP0_581" in cookies: + result["SAP_SESSIONID"] = cookies["SAP_SESSIONID_PP0_581"].get( + "value" + ) + + # render next page and keep it open for 10 seconds + page.wait_for_timeout(10000) + + browser.close() + + if save_to_env and (result["COOKIE_SESSION"] or result["SAP_SESSIONID"]): + to_write = {} + if result["COOKIE_SESSION"]: + to_write["COOKIE_SESSION"] = result["COOKIE_SESSION"] + if result["SAP_SESSIONID"]: + to_write["SAP_SESSIONID"] = result["SAP_SESSIONID"] + if to_write: + _update_env_file(to_write) + + return result + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser( + description="Fetch LWB session cookies via headless browser" + ) + parser.add_argument( + "--headed", action="store_true", help="Run browser in headed mode" + ) + parser.add_argument( + "--no-save", action="store_true", help="Do not write values into .env" + ) + args = parser.parse_args() + + try: + vals = fetch_session( + headless=not args.headed, + save_to_env=not args.no_save, + ) + print("COOKIE_SESSION=", vals.get("COOKIE_SESSION")) + print("SAP_SESSIONID=", vals.get("SAP_SESSIONID")) + if not (vals.get("COOKIE_SESSION") and vals.get("SAP_SESSIONID")): + print( + "Warning: One or both values are missing. Try --headed to " + "complete any prompts." 
+ ) + except Exception as e: + print(f"Error while fetching session: {e}") + sys.exit(1) diff --git a/src/vlw/scraper.py b/src/vlw/scraper.py index e317d1f..8d45cb4 100644 --- a/src/vlw/scraper.py +++ b/src/vlw/scraper.py @@ -17,10 +17,11 @@ def scrape_vlw(): "senden": "suchen", } + # debug print url + # print(f"Fetching VLW properties from: {url} with params: {parameter}") response = requests.get(url=url, params=parameter) soup = BeautifulSoup(response.content, 'html.parser') - properties = [] # get div with class "estate-result-list"