Merge branch '5-automate-lwb-scraping' into 'main'

Resolve "automate lwb scraping"

Closes #5

See merge request fsr-im/tools/flatscraper!2
This commit is contained in:
Elmar Kresse
2025-08-15 16:01:20 +02:00
11 changed files with 614 additions and 362 deletions

View File

@@ -37,8 +37,7 @@ You can run the bot natively on your machine or use a Docker image. The requirem
### 1. Environment Setup
Ensure that the `.env` file is configured correctly. An example is available in the `sample.env` file. Copy it to `.env` and fill in the required values.
The `SAP_SESSIONID` and `COOKIE_SESSION` are obtained after performing a search on the LWB website. Use your browser's developer tools to locate them in local storage.
*Future versions will include automatic form processing to obtain a valid session ID.*
You no longer need to set `SAP_SESSIONID` or `COOKIE_SESSION` manually. The scraper opens the LWB portal with a headless browser, clicks “Ich suche eine Wohnung”, opens “Immobiliensuche” → “MEHR ANZEIGEN”, sets the maximum results to 1000, clicks “Suchen”, and extracts session cookies automatically.
### 2. Python Environment

13
main.py
View File

@@ -1,4 +1,3 @@
from bs4 import BeautifulSoup
import json
import time
import src.wogetra.scraper as wogetra_scraper
@@ -14,6 +13,7 @@ TARGET_URL = "https://www.wogetra.de/immobilien-vermarktungsart/miete/"
# Store known property IDs to avoid duplicate notifications
known_properties = set()
# Main loop to periodically check for new listings
def main():
global known_properties
@@ -28,11 +28,11 @@ def main():
while True:
current_time = time.strftime("%H:%M:%S", time.localtime())
print("Scraping properties at " + current_time)
properties_wogetra = wogetra_scraper.scrape_wogetra()
print("Scraped " + str(len(properties_wogetra)) + " properties from Wogetra")
properties = properties_wogetra
properties_lwb = lwb_scraper.scrape_easysquare()
print("Scraped " + str(len(properties_lwb)) + " properties from LWB")
properties += properties_lwb
@@ -40,15 +40,14 @@ def main():
properties_lipsia = lipsia_scraper.scrape_lipsia()
print("Scraped " + str(len(properties_lipsia)) + " properties from Lipsia")
properties += properties_lipsia
properties_bgl = bgl_scraper.fetch_all_properties()
print("Scraped " + str(len(properties_bgl)) + " properties from BGL")
properties += properties_bgl
properties_vlw = vlw_scraper.scrape_vlw()
print("Scraped " + str(len(properties_vlw)) + " properties from VLW")
properties = properties_vlw
properties += properties_vlw
for prop in properties:
if prop["id"] not in known_properties:
@@ -56,7 +55,6 @@ def main():
localwebhook.send_to_discord(prop)
known_properties.add(prop["id"])
# save known properties to file
with open("known_properties.json", "w") as file:
json.dump(list(known_properties), file)
@@ -65,5 +63,6 @@ def main():
print("Waiting for the next check...")
time.sleep(300) # Check every 5 minutes
if __name__ == "__main__":
main()

View File

@@ -7,3 +7,4 @@ requests==2.32.3
soupsieve==2.6
typing_extensions==4.12.2
urllib3==2.3.0
playwright==1.45.0

View File

@@ -1,3 +1 @@
SAP_SESSIONID=UrN6nRbjuCBe4dkLw7vkJLcpV5zniRHvhkwAAKG5Agg%3d
COOKIE_SESSION=678ADA67ADF0813997206FE9F4132819
WEBHOOK_URL=https://discord.com/api/webhooks/<your-webhook-id>/<your-webhook-token>

View File

@@ -34,7 +34,8 @@ def fetch_all_properties():
'Upgrade-Insecure-Requests': '1',
'Cache-Control': 'max-age=0',
}
# debug print url
# print(url)
response = requests.request("GET", url, headers=headers)
soup = BeautifulSoup(response.text, "html.parser")

View File

@@ -8,6 +8,7 @@ load_dotenv()
# Webhook URL from Discord
WEBHOOK_URL = os.getenv("WEBHOOK_URL")
# Funktion: Nachricht an Discord senden
def send_to_discord(property_data):
@@ -23,10 +24,9 @@ def send_to_discord(property_data):
f"**Beschreibung:** {property_data['abstract']}"
)
# Set headers
headers = {"Content-Type": "application/json"}
# Check for optional image URL
if "image_url" in property_data and property_data["image_url"]:
try:

File diff suppressed because one or more lines are too long

View File

@@ -1,5 +1,5 @@
import requests
from src.lwb.scraper import EASYSQUARE_HEADERS, EASYSQUARE_PARAMS
from src.lwb.scraper import EASYSQUARE_HEADERS
def scrape_image(url, owner):
session = requests.Session()
@@ -19,10 +19,14 @@ def scrape_image(url, owner):
# return empty image
return b''
if owner == "LWB":
response = session.get(url, headers=EASYSQUARE_HEADERS, params=EASYSQUARE_PARAMS)
if owner == "LWB":
# Image URL already carries the required params; only send headers
response = session.get(url, headers=EASYSQUARE_HEADERS)
if response.status_code != 200:
print(f"Fehler beim Abrufen von Easysquare: {response.status_code}")
print(
f"Fehler beim Abrufen von Easysquare: "
f"{response.status_code}"
)
# return empty image
return b''
@@ -30,8 +34,10 @@ def scrape_image(url, owner):
if response is None:
response = session.get(url)
if response.status_code != 200:
print(f"Fehler beim Abrufen der Standardquelle: {response.status_code}")
print(
f"Fehler beim Abrufen der Standardquelle: "
f"{response.status_code}"
)
return b''
return response.content

View File

@@ -2,82 +2,133 @@ import requests
import xml.etree.ElementTree as ET
import src.lwb.format as format
import hashlib
import os
from dotenv import load_dotenv
import time
load_dotenv()
SESSION_CREATE_URL = (
"https://portal1s.easysquare.com/meinelwb/index.html"
"?deeplink=%2FESQ_IA_REOBJ%2FESQ_VM_REOBJ_ALL"
)
SESSION_CREATE_URL = "https://portal1s.easysquare.com/meinelwb/index.html?deeplink=%2FESQ_IA_REOBJ%2FESQ_VM_REOBJ_ALL"
SAP_SESSIONID = os.getenv("SAP_SESSIONID")
COOKIE_SESSION = os.getenv("COOKIE_SESSION")
# Session tokens (resolved dynamically via session bootstrap; no .env needed)
SAP_SESSIONID = None
COOKIE_SESSION = None
EASYSQUARE_URL = "https://portal1s.easysquare.com/prorex/xmlforms"
# Mutable headers so imports see updates when we refresh session values
EASYSQUARE_HEADERS = {
"DNT": "1",
"Host": "portal1s.easysquare.com",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Cookie": f"SAP_SESSIONID_PP0_581={SAP_SESSIONID}; sap-usercontext=sap-language=D&sap-client=581; cookiesession1={COOKIE_SESSION}",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:134.0) Gecko/20100101 Firefox/134.0",
"Accept": (
"text/html,application/xhtml+xml,"
"application/xml;q=0.9,*/*;q=0.8"
),
# Cookie gets filled by _update_cookie_header()
"Cookie": "",
"User-Agent": (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:135.0) "
"Gecko/20100101 Firefox/135.0"
),
"Accept-Encoding": "gzip, deflate, br, zstd",
"Accept-Language": "de,en-US;q=0.7,en;q=0.3",
"Upgrade-Insecure-Requests": "1"
"Upgrade-Insecure-Requests": "1",
# Helpful headers observed from network calls
"X-Requested-With": "XMLHttpRequest",
"Referer": SESSION_CREATE_URL,
}
EASYSQUARE_PARAMS = {
"application": "ESQ_IA_REOBJ",
"sap-client": "581",
"command": "action",
"name": "boxlist",
"api": "6.169",
"head-oppc-version": "6.169.22",
"_": "1736761256321"
"api": "6.249",
"head-oppc-version": "6.249.1",
# dynamic '_' gets applied at request time; leave placeholder
"_": "1755259702945",
}
SETUP_QUERY_PARAMS_URL = "https://portal1s.easysquare.com/prorex/xmlforms?application=ESQ_IA_REOBJ&sap-client=581&command=action&name=save&id=4B76A3C8-3E4D-4217-B54A-7C28C920748C&api=6.169&head-oppc-version=6.169.22&originalId=842F0073-DC21-A841-4E80-B1BD5E404E35&resourceOrigin=form"
SETUP_QUERY_PARAMS_URL = (
"https://portal1s.easysquare.com/prorex/xmlforms"
"?application=ESQ_IA_REOBJ"
"&sap-client=581"
"&command=action"
"&name=save"
"&id=E3920A27-432A-4127-96FC-6433ED32FDDE"
"&api=6.249"
"&head-oppc-version=6.249.1"
"&originalId=3C9DAA99-1C5D-4810-5B5E-AFE704639EF5"
"&resourceOrigin=form"
)
# curl --location 'https://portal1s.easysquare.com/prorex/xmlforms?application=ESQ_IA_REOBJ&sap-client=581&command=action&name=boxlist&api=6.169&head-oppc-version=6.169.22&_=1736761255682' \
# --header 'DNT: 1' \
# --header 'UTC: 1736761256321' \
# --header 'Host: portal1s.easysquare.com' \
# --header 'host: portal1s.easysquare.com' \
# --header 'Accept: text/plain, */*; q=0.01' \
# --header 'Cookie: cookiesession1=678ADA67ADF0813997206FE9F4133118; sap-usercontext=sap-language=de&sap-client=581; SAP_SESSIONID_PP0_581=Vg3w4pn8whD76BldaU2wvP-YzyrRkRHvhWoKELG5Agg%3d' \
# --header 'Referer: https://portal1s.easysquare.com/meinelwb/index.html?deeplink=%2FESQ_IA_REOBJ%2FESQ_VM_REOBJ_ALL' \
# --header 'Sec-GPC: 1' \
# --header 'oppc-id: D9925A2D-4ED9-4911-8AD3-2626DA41FBB0' \
# --header 'Connection: keep-alive' \
# --header 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:134.0) Gecko/20100101 Firefox/134.0' \
# --header 'Content-Type: text/plain;charset=UTF-8' \
# --header 'Sec-Fetch-Dest: empty' \
# --header 'Sec-Fetch-Mode: cors' \
# --header 'Sec-Fetch-Site: same-origin' \
# --header 'Accept-Encoding: gzip, deflate, br, zstd' \
# --header 'Accept-Language: de,en-US;q=0.7,en;q=0.3' \
# --header 'X-Requested-With: XMLHttpRequest'
# Example curl reference omitted for brevity.
# setup query params for lwb session
def setup_query_params():
def _now_ms() -> str:
return str(int(time.time() * 1000))
# request this url with POST an xml form
# load xml form from file
xml_form = ""
with open("src/lwb/lwb_form.xml", "r") as file:
xml_form = file.read()
# post xml form to SETUP_QUERY_PARAMS_URL
response = requests.post(SETUP_QUERY_PARAMS_URL, data=xml_form, headers=EASYSQUARE_HEADERS)
def _update_cookie_header():
    """Rebuild the shared ``Cookie`` header from the current session globals.

    Mutates EASYSQUARE_HEADERS in place so that every module importing it
    sees refreshed SAP/cookiesession values.
    """
    sap = f"SAP_SESSIONID_PP0_581={SAP_SESSIONID}" if SAP_SESSIONID else ""
    ctx = "sap-usercontext=sap-language=D&sap-client=581"
    sess = f"cookiesession1={COOKIE_SESSION}" if COOKIE_SESSION else ""
    EASYSQUARE_HEADERS["Cookie"] = "; ".join(p for p in (sap, ctx, sess) if p)
if response.status_code != 200:
print(f"Fehler beim Abrufen von Easysquare: {response.status_code}")
return []
print(response.content)
return response.content
# initialize cookie header from any .env-provided values
_update_cookie_header()
def _fetch_csrf_token(session: requests.Session) -> str:
    """Obtain an X-CSRF-Token by priming the boxlist endpoint with 'fetch'.

    The portal echoes the real token in the response headers; an empty
    string is returned when no token header is present.
    """
    _update_cookie_header()
    req_headers = {
        **EASYSQUARE_HEADERS,
        "X-CSRF-Token": "fetch",
        "Accept": "text/plain, */*; q=0.01",
        "UTC": _now_ms(),
    }
    req_params = {**EASYSQUARE_PARAMS, "_": _now_ms()}
    resp = session.get(EASYSQUARE_URL, headers=req_headers, params=req_params)
    token = resp.headers.get("x-csrf-token")
    if not token:
        token = resp.headers.get("X-CSRF-Token")
    return token or ""
# setup query params for lwb session (submit stored filter form)
def setup_query_params(headless: bool = True) -> bool:
    """Initialize the LWB search context by driving the portal UI.

    Delegates to ``session_bootstrap.apply_search_via_ui`` (Playwright),
    which clicks "Ich suche eine Wohnung", opens "Immobiliensuche" ->
    "MEHR ANZEIGEN", raises the max-results field to 1000 and submits the
    search. Any fresher session cookies reported by the browser are copied
    into this module's globals and the shared Cookie header.

    Returns True when the UI flow ran, False otherwise.
    """
    global COOKIE_SESSION, SAP_SESSIONID
    try:
        from src.lwb.session_bootstrap import apply_search_via_ui
    except Exception as e:
        print(f"Playwright-Setup nicht verfügbar: {e}")
        return False
    try:
        fresh = apply_search_via_ui(headless=headless, save_to_env=False)
        COOKIE_SESSION = fresh.get("COOKIE_SESSION") or COOKIE_SESSION
        SAP_SESSIONID = fresh.get("SAP_SESSIONID") or SAP_SESSIONID
        _update_cookie_header()
        return True
    except Exception as e:
        print(f"Fehler beim Setzen der Suchparameter per UI: {e}")
        return False
# Call Session Create and get the session from the response cookies
# Call Session Create and get the session from the response cookies
def create_session():
# request url with chromium browser and get the cookies
session = requests.Session()
@@ -86,91 +137,142 @@ def create_session():
if response.status_code != 200:
print(f"Fehler Session von Easysquare: {response.status_code}")
return []
# get the cookies from the response
cookies = response.cookies
global COOKIE_SESSION
COOKIE_SESSION = cookies.get("cookiesession1")
print(COOKIE_SESSION)
url = "https://portal1s.easysquare.com/meinelwb/api5/authenticate?api=6.169&sap-language=de"
_update_cookie_header()
url = (
"https://portal1s.easysquare.com/meinelwb/api5/authenticate"
"?api=6.169&sap-language=de"
)
payload = {
'sap-field_b64': "dXNlcj1ERU1PJnBhc3N3b3JkPXByb21vczE2"
}
headers = {
'DNT': '1',
'UTC': '1738713279005',
'Host': 'portal1s.easysquare.com',
'host': 'portal1s.easysquare.com',
'Accept': 'text/html, */*; q=0.01',
'Cookie': f'esq-alias=%2fmeinelwb; sap-usercontext=sap-language=de&sap-client=581; cookiesession1={COOKIE_SESSION}',
'Origin': 'https://portal1s.easysquare.com',
'Referer': 'https://portal1s.easysquare.com/meinelwb/index.html?deeplink=%2FESQ_IA_REOBJ%2FESQ_VM_REOBJ_ALL',
'Sec-GPC': '1',
'Connection': 'keep-alive',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:135.0) Gecko/20100101 Firefox/135.0',
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
'X-CSRF-Token': 'fetch',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-origin',
'Accept-Encoding': 'gzip, deflate, br, zstd',
'Accept-Language': 'de,en-US;q=0.7,en;q=0.3',
'X-Requested-With': 'XMLHttpRequest'
'DNT': '1',
'UTC': _now_ms(),
'Host': 'portal1s.easysquare.com',
'host': 'portal1s.easysquare.com',
'Accept': 'text/html, */*; q=0.01',
'Cookie': (
'esq-alias=%2fmeinelwb; '
'sap-usercontext=sap-language=de&sap-client=581; '
f'cookiesession1={COOKIE_SESSION}'
),
'Origin': 'https://portal1s.easysquare.com',
'Referer': SESSION_CREATE_URL,
'Sec-GPC': '1',
'Connection': 'keep-alive',
'User-Agent': EASYSQUARE_HEADERS['User-Agent'],
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
'X-CSRF-Token': 'fetch',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-origin',
'Accept-Encoding': 'gzip, deflate, br, zstd',
'Accept-Language': 'de,en-US;q=0.7,en;q=0.3',
'X-Requested-With': 'XMLHttpRequest'
}
print(headers)
response = requests.request("POST", url, headers=headers, data=payload)
print(response.text)
response = requests.request(
"POST",
url,
headers=headers,
data=payload,
)
if response.status_code != 200:
print(f"Fehler beim Session Erstellen via Easysquare: {response.status_code}")
print(
f"Fehler beim Session Erstellen via Easysquare: "
f"{response.status_code}"
)
return []
# get the cookies from the response
cookies = response.cookies
global SAP_SESSIONID
SAP_SESSIONID = cookies.get("SAP_SESSIONID_PP0_581")
print(SAP_SESSIONID)
_update_cookie_header()
print(f"SAP_SESSIONID_PP0_581: {SAP_SESSIONID}")
# Funktion: Scrape von Easysquare
def _ensure_session_ready():
    """Make sure both session tokens exist before hitting the XML API.

    Prefers the Playwright bootstrap for robust cookie capture; falls back
    to the legacy request-based create_session() when that fails.
    """
    global COOKIE_SESSION, SAP_SESSIONID
    if COOKIE_SESSION and SAP_SESSIONID:
        return  # both tokens already present - nothing to do
    try:
        from src.lwb.session_bootstrap import fetch_session
        bootstrap = fetch_session(headless=True, save_to_env=False)
        COOKIE_SESSION = bootstrap.get("COOKIE_SESSION") or COOKIE_SESSION
        SAP_SESSIONID = bootstrap.get("SAP_SESSIONID") or SAP_SESSIONID
        _update_cookie_header()
    except Exception:
        # Fallback to legacy request-based bootstrap
        create_session()
def scrape_easysquare():
_ensure_session_ready()
# Submit stored search/filter form to ensure listing context is ready
try:
setup_query_params()
except Exception as e:
# non-fatal; we'll still attempt list fetch
print(f"Warnung: Setup der Suchparameter fehlgeschlagen: {e}")
# Build params with fresh timestamp
params = dict(EASYSQUARE_PARAMS)
params["_"] = _now_ms()
session = requests.Session()
response = session.get(EASYSQUARE_URL, headers=EASYSQUARE_HEADERS, params=EASYSQUARE_PARAMS)
response = session.get(
EASYSQUARE_URL,
headers=EASYSQUARE_HEADERS,
params=params,
)
if response.status_code != 200:
print(f"Fehler beim Abrufen von Easysquare: {response.status_code}")
# print("Versuche Session zu erstellen")
# create_session()
return []
# XML-Daten parsen
root = ET.fromstring(response.content)
namespace = {"ns": "http://www.openpromos.com/OPPC/XMLForms"}
properties = []
for head in root.findall(".//ns:head", namespace):
prop_title = head.find("ns:title", namespace).text
subtitle = head.find("ns:subtitle", namespace).text
abstract = head.find("ns:abstract", namespace).text.strip()
title_el = head.find("ns:title", namespace)
subtitle_el = head.find("ns:subtitle", namespace)
abstract_el = head.find("ns:abstract", namespace)
prop_title = title_el.text if title_el is not None else ""
subtitle = subtitle_el.text if subtitle_el is not None else ""
abstract = (
abstract_el.text.strip()
if abstract_el is not None and abstract_el.text
else ""
)
# get adress lat and long
# <address city="" lat="51.346061" lon="12.3774656" postcode="" street=""/>
adress = head.find("ns:address", namespace)
lat = adress.get("lat")
lon = adress.get("lon")
# get address lat/lon
address_el = head.find("ns:address", namespace)
lat = address_el.get("lat") if address_el is not None else ""
lon = address_el.get("lon") if address_el is not None else ""
image = head.find("ns:image", namespace)
iamge_resourceId = image.get("resourceId")
id = head.find("ns:id", namespace).text
image_resource_id = (
image.get("resourceId") if image is not None else ""
)
# Details extrahieren
rooms = "N/A"
@@ -191,28 +293,42 @@ def scrape_easysquare():
availability = value
# link create google maps link with lat and long
link = f"https://www.google.com/maps/search/?api=1&query={lat},{lon}"
link = (
f"https://www.google.com/maps/search/?api=1&query={lat},{lon}"
if lat and lon
else ""
)
# https://portal1s.easysquare.com/prorex/xmlforms/image.jpg?application=ESQ_IA_REOBJ&command=action&id=1EC8D4E6-191A-A827-47FF-72D8C5379070&name=get
image_url = f"https://portal1s.easysquare.com/prorex/xmlforms/image.jpg?application=ESQ_IA_REOBJ&command=action&id={iamge_resourceId}&name=get"
# image url for listing
base_img = "https://portal1s.easysquare.com/prorex/xmlforms/image.jpg"
image_url = (
f"{base_img}?application=ESQ_IA_REOBJ&command=action"
f"&id={image_resource_id}&name=get"
if image_resource_id
else ""
)
# the id should be a hash create by the title, subtitle, rooms, size, rent, availability
hashID = f"{prop_title}{subtitle}{rooms}{size}{rent}{availability}"
id = hashlib.sha256(hashID.encode('utf-8')).hexdigest()
# Hash from title, subtitle, rooms, size, rent, availability
hash_id = (
f"{prop_title}{subtitle}{rooms}{size}{rent}{availability}"
)
prop_id = hashlib.sha256(hash_id.encode('utf-8')).hexdigest()
properties.append({
"id": id,
"title": "LWB - " + prop_title,
"subtitle": subtitle,
"rooms": format.format_room(rooms),
"size": format.format_roomSize(size),
"rent": format.format_money(rent),
"link": link,
"abstract": abstract,
"warm_rent": "",
"availability": format.format_date(availability),
"image_url": image_url,
"owner": "LWB",
})
properties.append(
{
"id": prop_id,
"title": f"LWB - {prop_title}",
"subtitle": subtitle,
"rooms": format.format_room(rooms),
"size": format.format_roomSize(size),
"rent": format.format_money(rent),
"link": link,
"abstract": abstract,
"warm_rent": "",
"availability": format.format_date(availability),
"image_url": image_url,
"owner": "LWB",
}
)
return properties
return properties

View File

@@ -0,0 +1,347 @@
import os
import sys
import time
from typing import Dict, Optional
from dotenv import load_dotenv
SESSION_CREATE_URL = (
"https://portal1s.easysquare.com/meinelwb/index.html"
"?deeplink=%2FESQ_IA_REOBJ%2FESQ_VM_REOBJ_ALL"
)
# UI string constants
BTN_SEARCH_ENTRY = "Ich suche eine Wohnung"
BTN_MORE = "MEHR ANZEIGEN"
BTN_SEARCH = ("Suchen", "SUCHEN")
LABEL_MAX_RESULTS = (
"Maximale Trefferanzahl",
"Maximale Trefferzahl",
"Maximale\u00A0Trefferanzahl",
"Maximale\u00A0Trefferzahl",
)
def _cookie_map(context) -> dict:
"""Return a name->cookie dict safely from Playwright context cookies."""
mapping = {}
try:
for c in context.cookies(): # type: ignore[attr-defined]
name = c.get("name") if isinstance(c, dict) else None
if name:
mapping[name] = c
except Exception:
pass
return mapping
def _update_env_file(values: Dict[str, str], env_path: str = ".env") -> None:
"""Create or update .env with given key/value pairs."""
existing = {}
if os.path.exists(env_path):
with open(env_path, "r", encoding="utf-8") as f:
for line in f:
line = line.strip()
if not line or line.startswith("#") or "=" not in line:
continue
k, v = line.split("=", 1)
existing[k] = v
existing.update(values)
lines = [f"{k}={v}\n" for k, v in existing.items()]
with open(env_path, "w", encoding="utf-8") as f:
f.writelines(lines)
def fetch_session(
    headless: bool = True,
    save_to_env: bool = False,
) -> Dict[str, Optional[str]]:
    """Open the LWB portal in a (headless) browser and extract session cookies.

    Drives Playwright Chromium to the portal deeplink, clicks the
    "Ich suche eine Wohnung" entry control, then polls the browser context
    for the two session cookies the portal sets.

    Args:
        headless: Launch Chromium without a visible window when True.
        save_to_env: Also persist the captured values into the .env file.

    Returns:
        Dict with keys COOKIE_SESSION and SAP_SESSIONID; either value may
        be None when the portal did not set the corresponding cookie.

    Raises:
        Exception: Re-raises the import error when Playwright is missing.
    """
    try:
        from playwright.sync_api import sync_playwright
    except Exception:  # pragma: no cover - dependency missing
        print(
            "Playwright not installed. Install: pip install playwright "
            "&& python -m playwright install"
        )
        raise
    # .env not required for cookie fetch; loading is harmless but optional
    try:
        load_dotenv()
    except Exception:
        pass
    result: Dict[str, Optional[str]] = {
        "COOKIE_SESSION": None,
        "SAP_SESSIONID": None,
    }
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=headless)
        context = browser.new_context()
        page = context.new_page()
        page.goto(SESSION_CREATE_URL, wait_until="networkidle")
        # Click the entry control "Ich suche eine Wohnung"; the portal may
        # render it as a button, a link, or plain text, so try each in turn.
        try:
            page.get_by_role("button", name=BTN_SEARCH_ENTRY).click(timeout=3000)
        except Exception:
            try:
                page.get_by_role("link", name=BTN_SEARCH_ENTRY).click(timeout=3000)
            except Exception:
                try:
                    page.get_by_text(BTN_SEARCH_ENTRY, exact=True).click(timeout=3000)
                except Exception:
                    pass
        # Let the portal finish any navigation triggered by the click
        try:
            page.wait_for_load_state("networkidle", timeout=5000)
        except Exception:
            pass
        # Poll (up to ~5s) for cookiesession1 / SAP_SESSIONID_PP0_581
        cookie_session = None
        sap_session = None
        for _ in range(10):
            cookies = _cookie_map(context)
            if not cookie_session and "cookiesession1" in cookies:
                cookie_session = cookies.get("cookiesession1", {}).get("value")
            if not sap_session and "SAP_SESSIONID_PP0_581" in cookies:
                sap_session = cookies.get("SAP_SESSIONID_PP0_581", {}).get("value")
            if cookie_session and sap_session:
                break
            time.sleep(0.5)
        # If the SAP session cookie is still missing, call the portal's
        # authenticate endpoint from inside the page (same-origin fetch) so
        # the cookie lands in the browser context.
        if not sap_session:
            try:
                js_lines = [
                    "async () => {",
                    "  const base = '/meinelwb/api5/authenticate';",
                    "  const url = base + '?api=6.169&sap-language=de';",
                    "  const body = new URLSearchParams({",
                    "    'sap-field_b64':",
                    "      'dXNlcj1ERU1PJnBhc3N3b3JkPXByb21vczE2'",
                    "  }).toString();",
                    "  await fetch(url, {",
                    "    method: 'POST',",
                    "    headers: {",
                    "      'Content-Type':",
                    "        'application/x-www-form-urlencoded;' +",
                    "        ' charset=UTF-8',",
                    "      'X-CSRF-Token': 'fetch',",
                    "      'X-Requested-With': 'XMLHttpRequest'",
                    "    },",
                    "    body",
                    "  });",
                    "  return true;",
                    "}",
                ]
                page.evaluate("\n".join(js_lines))
            except Exception:
                pass
            # Re-check cookies for the SAP session after the authenticate call
            for _ in range(10):
                cookies = _cookie_map(context)
                if "SAP_SESSIONID_PP0_581" in cookies:
                    sap_session = cookies.get("SAP_SESSIONID_PP0_581", {}).get("value")
                    break
                time.sleep(0.5)
        # Best-effort read of localStorage (useful for debugging)
        try:
            storage_dump = page.evaluate(
                "() => JSON.stringify(window.localStorage)"
            )
            if storage_dump and len(storage_dump) > 2:
                pass  # Not strictly needed; kept for debugging future issues
        except Exception:
            pass
        result["COOKIE_SESSION"] = cookie_session
        result["SAP_SESSIONID"] = sap_session
        browser.close()
    # Optionally persist whatever was captured (partial results allowed)
    if save_to_env and (result["COOKIE_SESSION"] or result["SAP_SESSIONID"]):
        to_write = {}
        if result["COOKIE_SESSION"]:
            to_write["COOKIE_SESSION"] = result["COOKIE_SESSION"]
        if result["SAP_SESSIONID"]:
            to_write["SAP_SESSIONID"] = result["SAP_SESSIONID"]
        if to_write:
            _update_env_file(to_write)
    return result
def apply_search_via_ui(
    headless: bool = True,
    save_to_env: bool = False,
) -> Dict[str, Optional[str]]:
    """Drive the portal UI to initialize the search context.

    Steps:
      - Open the portal and click "Ich suche eine Wohnung".
      - In the services section titled "Immobiliensuche", click
        "MEHR ANZEIGEN".
      - Set "Maximale Trefferanzahl" to 1000 and click "Suchen".

    Args:
        headless: Launch Chromium without a visible window when True.
        save_to_env: Also persist captured cookie values into .env.

    Returns:
        Dict with the latest COOKIE_SESSION / SAP_SESSIONID values; either
        may be None when the portal did not set the cookie.

    Raises:
        Exception: Re-raises the import error when Playwright is missing.
    """
    try:
        from playwright.sync_api import sync_playwright
    except Exception:  # pragma: no cover
        print(
            "Playwright not installed. Install: pip install playwright "
            "&& python -m playwright install"
        )
        raise
    try:
        load_dotenv()
    except Exception:
        pass
    result: Dict[str, Optional[str]] = {
        "COOKIE_SESSION": None,
        "SAP_SESSIONID": None,
    }
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=headless)
        context = browser.new_context()
        page = context.new_page()
        page.goto(SESSION_CREATE_URL, wait_until="networkidle")
        # Click entry button (robust tries: button role, link role, raw text)
        clicked = False
        for sel in [
            lambda: page.get_by_role("button", name=BTN_SEARCH_ENTRY).click(timeout=3000),
            lambda: page.get_by_role("link", name=BTN_SEARCH_ENTRY).click(timeout=3000),
            lambda: page.get_by_text(BTN_SEARCH_ENTRY, exact=True).click(timeout=3000),
        ]:
            try:
                sel()
                clicked = True
                break
            except Exception:
                pass
        if clicked:
            try:
                page.wait_for_load_state("networkidle", timeout=5000)
            except Exception:
                pass
        # Click MEHR ANZEIGEN inside the "Immobiliensuche" container; fall
        # back to the first matching button anywhere on the page.
        try:
            container = page.locator(
                "div.easy-services-service-container"
            ).filter(has_text="Immobiliensuche")
            container.get_by_role("button", name=BTN_MORE).first.click(timeout=5000)
        except Exception:
            try:
                page.get_by_role("button", name=BTN_MORE).click(timeout=5000)
            except Exception:
                pass
        # Fill "Maximale Trefferanzahl" with 1000
        def _fill_max_results():
            # Try the known label variations first
            for label in LABEL_MAX_RESULTS:
                try:
                    page.get_by_label(label).fill("1000", timeout=3000)
                    return True
                except Exception:
                    continue
            # Fallback: the first input element following the label text
            try:
                near = page.get_by_text("Maximale")
                near_locator = near.locator(
                    "xpath=following::input[1]"
                )
                near_locator.fill("1000", timeout=3000)
                return True
            except Exception:
                return False
        try:
            _fill_max_results()
        except Exception:
            pass
        # Click "Suchen" (button role first, then raw text; both casings)
        for name in BTN_SEARCH:
            try:
                page.get_by_role("button", name=name).click(timeout=4000)
                break
            except Exception:
                try:
                    page.get_by_text(name, exact=True).click(timeout=4000)
                    break
                except Exception:
                    continue
        try:
            page.wait_for_load_state("networkidle", timeout=6000)
        except Exception:
            pass
        # Collect the session cookies that are now set
        cookies = _cookie_map(context)
        if "cookiesession1" in cookies:
            result["COOKIE_SESSION"] = cookies.get("cookiesession1", {}).get("value")
        if "SAP_SESSIONID_PP0_581" in cookies:
            result["SAP_SESSIONID"] = cookies.get("SAP_SESSIONID_PP0_581", {}).get("value")
        # render next page and keep it open for 10 seconds
        # NOTE(review): fixed 10s wait slows every scrape cycle - confirm
        # whether the portal actually needs it for the cookies to validate.
        page.wait_for_timeout(10000)
        browser.close()
    # Optionally persist whatever was captured (partial results allowed)
    if save_to_env and (result["COOKIE_SESSION"] or result["SAP_SESSIONID"]):
        to_write = {}
        if result["COOKIE_SESSION"]:
            to_write["COOKIE_SESSION"] = result["COOKIE_SESSION"]
        if result["SAP_SESSIONID"]:
            to_write["SAP_SESSIONID"] = result["SAP_SESSIONID"]
        if to_write:
            _update_env_file(to_write)
    return result
if __name__ == "__main__":
    # CLI entry point: fetch the LWB session cookies and print them.
    import argparse

    cli = argparse.ArgumentParser(
        description="Fetch LWB session cookies via headless browser"
    )
    cli.add_argument(
        "--headed", action="store_true", help="Run browser in headed mode"
    )
    cli.add_argument(
        "--no-save", action="store_true", help="Do not write values into .env"
    )
    opts = cli.parse_args()
    try:
        session_values = fetch_session(
            headless=not opts.headed,
            save_to_env=not opts.no_save,
        )
        print("COOKIE_SESSION=", session_values.get("COOKIE_SESSION"))
        print("SAP_SESSIONID=", session_values.get("SAP_SESSIONID"))
        has_both = bool(
            session_values.get("COOKIE_SESSION")
            and session_values.get("SAP_SESSIONID")
        )
        if not has_both:
            print(
                "Warning: One or both values are missing. Try --headed to "
                "complete any prompts."
            )
    except Exception as e:
        print(f"Error while fetching session: {e}")
        sys.exit(1)

View File

@@ -17,10 +17,11 @@ def scrape_vlw():
"senden": "suchen",
}
#debug print url
# print(f"Fetching VLW properties from: {url} with params: {parameter}")
response = requests.get(url=url, params=parameter)
soup = BeautifulSoup(response.content, 'html.parser')
properties = []
# get div with class "estate-result-list"