mirror of
https://gitlab.dit.htwk-leipzig.de/fsr-im/tools/flatscraper.git
synced 2025-08-29 17:43:50 +02:00
added playwright lwb scraping with chromium headless
This commit is contained in:
@@ -7,3 +7,4 @@ requests==2.32.3
|
||||
soupsieve==2.6
|
||||
typing_extensions==4.12.2
|
||||
urllib3==2.3.0
|
||||
playwright==1.45.0
|
||||
|
@@ -34,7 +34,8 @@ def fetch_all_properties():
|
||||
'Upgrade-Insecure-Requests': '1',
|
||||
'Cache-Control': 'max-age=0',
|
||||
}
|
||||
|
||||
# debug print url
|
||||
# print(url)
|
||||
response = requests.request("GET", url, headers=headers)
|
||||
|
||||
soup = BeautifulSoup(response.text, "html.parser")
|
||||
|
File diff suppressed because one or more lines are too long
@@ -1,5 +1,5 @@
|
||||
import requests
|
||||
from src.lwb.scraper import EASYSQUARE_HEADERS, EASYSQUARE_PARAMS
|
||||
from src.lwb.scraper import EASYSQUARE_HEADERS
|
||||
|
||||
def scrape_image(url, owner):
|
||||
session = requests.Session()
|
||||
@@ -19,10 +19,14 @@ def scrape_image(url, owner):
|
||||
# return empty image
|
||||
return b''
|
||||
|
||||
if owner == "LWB":
|
||||
response = session.get(url, headers=EASYSQUARE_HEADERS, params=EASYSQUARE_PARAMS)
|
||||
if owner == "LWB":
|
||||
# Image URL already carries the required params; only send headers
|
||||
response = session.get(url, headers=EASYSQUARE_HEADERS)
|
||||
if response.status_code != 200:
|
||||
print(f"Fehler beim Abrufen von Easysquare: {response.status_code}")
|
||||
print(
|
||||
f"Fehler beim Abrufen von Easysquare: "
|
||||
f"{response.status_code}"
|
||||
)
|
||||
# return empty image
|
||||
return b''
|
||||
|
||||
@@ -30,8 +34,10 @@ def scrape_image(url, owner):
|
||||
if response is None:
|
||||
response = session.get(url)
|
||||
if response.status_code != 200:
|
||||
print(f"Fehler beim Abrufen der Standardquelle: {response.status_code}")
|
||||
print(
|
||||
f"Fehler beim Abrufen der Standardquelle: "
|
||||
f"{response.status_code}"
|
||||
)
|
||||
return b''
|
||||
|
||||
|
||||
|
||||
return response.content
|
||||
|
@@ -3,81 +3,136 @@ import xml.etree.ElementTree as ET
|
||||
import src.lwb.format as format
|
||||
import hashlib
|
||||
import os
|
||||
import time
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv()
|
||||
|
||||
SESSION_CREATE_URL = "https://portal1s.easysquare.com/meinelwb/index.html?deeplink=%2FESQ_IA_REOBJ%2FESQ_VM_REOBJ_ALL"
|
||||
SESSION_CREATE_URL = (
|
||||
"https://portal1s.easysquare.com/meinelwb/index.html"
|
||||
"?deeplink=%2FESQ_IA_REOBJ%2FESQ_VM_REOBJ_ALL"
|
||||
)
|
||||
|
||||
# Session tokens (from .env as fallback; will be refreshed dynamically)
|
||||
SAP_SESSIONID = os.getenv("SAP_SESSIONID")
|
||||
COOKIE_SESSION = os.getenv("COOKIE_SESSION")
|
||||
|
||||
EASYSQUARE_URL = "https://portal1s.easysquare.com/prorex/xmlforms"
|
||||
|
||||
# Mutable headers so imports see updates when we refresh session values
|
||||
EASYSQUARE_HEADERS = {
|
||||
"DNT": "1",
|
||||
"Host": "portal1s.easysquare.com",
|
||||
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
||||
"Cookie": f"SAP_SESSIONID_PP0_581={SAP_SESSIONID}; sap-usercontext=sap-language=D&sap-client=581; cookiesession1={COOKIE_SESSION}",
|
||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:134.0) Gecko/20100101 Firefox/134.0",
|
||||
"Accept": (
|
||||
"text/html,application/xhtml+xml,"
|
||||
"application/xml;q=0.9,*/*;q=0.8"
|
||||
),
|
||||
# Cookie gets filled by _update_cookie_header()
|
||||
"Cookie": "",
|
||||
"User-Agent": (
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:135.0) "
|
||||
"Gecko/20100101 Firefox/135.0"
|
||||
),
|
||||
"Accept-Encoding": "gzip, deflate, br, zstd",
|
||||
"Accept-Language": "de,en-US;q=0.7,en;q=0.3",
|
||||
"Upgrade-Insecure-Requests": "1"
|
||||
"Upgrade-Insecure-Requests": "1",
|
||||
# Helpful headers observed from network calls
|
||||
"X-Requested-With": "XMLHttpRequest",
|
||||
"Referer": SESSION_CREATE_URL,
|
||||
}
|
||||
EASYSQUARE_PARAMS = {
|
||||
"application": "ESQ_IA_REOBJ",
|
||||
"sap-client": "581",
|
||||
"command": "action",
|
||||
"name": "boxlist",
|
||||
"api": "6.169",
|
||||
"head-oppc-version": "6.169.22",
|
||||
"_": "1736761256321"
|
||||
"api": "6.249",
|
||||
"head-oppc-version": "6.249.1",
|
||||
# dynamic '_' gets applied at request time; leave placeholder
|
||||
"_": "1755259702945",
|
||||
}
|
||||
|
||||
SETUP_QUERY_PARAMS_URL = "https://portal1s.easysquare.com/prorex/xmlforms?application=ESQ_IA_REOBJ&sap-client=581&command=action&name=save&id=4B76A3C8-3E4D-4217-B54A-7C28C920748C&api=6.169&head-oppc-version=6.169.22&originalId=842F0073-DC21-A841-4E80-B1BD5E404E35&resourceOrigin=form"
|
||||
SETUP_QUERY_PARAMS_URL = (
|
||||
"https://portal1s.easysquare.com/prorex/xmlforms"
|
||||
"?application=ESQ_IA_REOBJ"
|
||||
"&sap-client=581"
|
||||
"&command=action"
|
||||
"&name=save"
|
||||
"&id=E3920A27-432A-4127-96FC-6433ED32FDDE"
|
||||
"&api=6.249"
|
||||
"&head-oppc-version=6.249.1"
|
||||
"&originalId=3C9DAA99-1C5D-4810-5B5E-AFE704639EF5"
|
||||
"&resourceOrigin=form"
|
||||
)
|
||||
|
||||
# curl --location 'https://portal1s.easysquare.com/prorex/xmlforms?application=ESQ_IA_REOBJ&sap-client=581&command=action&name=boxlist&api=6.169&head-oppc-version=6.169.22&_=1736761255682' \
|
||||
# --header 'DNT: 1' \
|
||||
# --header 'UTC: 1736761256321' \
|
||||
# --header 'Host: portal1s.easysquare.com' \
|
||||
# --header 'host: portal1s.easysquare.com' \
|
||||
# --header 'Accept: text/plain, */*; q=0.01' \
|
||||
# --header 'Cookie: cookiesession1=678ADA67ADF0813997206FE9F4133118; sap-usercontext=sap-language=de&sap-client=581; SAP_SESSIONID_PP0_581=Vg3w4pn8whD76BldaU2wvP-YzyrRkRHvhWoKELG5Agg%3d' \
|
||||
# --header 'Referer: https://portal1s.easysquare.com/meinelwb/index.html?deeplink=%2FESQ_IA_REOBJ%2FESQ_VM_REOBJ_ALL' \
|
||||
# --header 'Sec-GPC: 1' \
|
||||
# --header 'oppc-id: D9925A2D-4ED9-4911-8AD3-2626DA41FBB0' \
|
||||
# --header 'Connection: keep-alive' \
|
||||
# --header 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:134.0) Gecko/20100101 Firefox/134.0' \
|
||||
# --header 'Content-Type: text/plain;charset=UTF-8' \
|
||||
# --header 'Sec-Fetch-Dest: empty' \
|
||||
# --header 'Sec-Fetch-Mode: cors' \
|
||||
# --header 'Sec-Fetch-Site: same-origin' \
|
||||
# --header 'Accept-Encoding: gzip, deflate, br, zstd' \
|
||||
# --header 'Accept-Language: de,en-US;q=0.7,en;q=0.3' \
|
||||
# --header 'X-Requested-With: XMLHttpRequest'
|
||||
# Example curl reference omitted for brevity.
|
||||
|
||||
|
||||
# setup query params for lwb session
|
||||
def setup_query_params():
|
||||
def _now_ms() -> str:
|
||||
return str(int(time.time() * 1000))
|
||||
|
||||
# request this url with POST an xml form
|
||||
|
||||
# load xml form from file
|
||||
xml_form = ""
|
||||
with open("src/lwb/lwb_form.xml", "r") as file:
|
||||
xml_form = file.read()
|
||||
|
||||
# post xml form to SETUP_QUERY_PARAMS_URL
|
||||
response = requests.post(SETUP_QUERY_PARAMS_URL, data=xml_form, headers=EASYSQUARE_HEADERS)
|
||||
def _update_cookie_header():
|
||||
"""Update Cookie header so other modules can see fresh values."""
|
||||
cookie = []
|
||||
if SAP_SESSIONID:
|
||||
cookie.append(f"SAP_SESSIONID_PP0_581={SAP_SESSIONID}")
|
||||
cookie.append("sap-usercontext=sap-language=D&sap-client=581")
|
||||
if COOKIE_SESSION:
|
||||
cookie.append(f"cookiesession1={COOKIE_SESSION}")
|
||||
EASYSQUARE_HEADERS["Cookie"] = "; ".join(cookie)
|
||||
|
||||
if response.status_code != 200:
|
||||
print(f"Fehler beim Abrufen von Easysquare: {response.status_code}")
|
||||
return []
|
||||
|
||||
print(response.content)
|
||||
|
||||
return response.content
|
||||
# initialize cookie header from any .env-provided values
|
||||
_update_cookie_header()
|
||||
|
||||
|
||||
def _fetch_csrf_token(session: requests.Session) -> str:
    """Request a CSRF token from the list endpoint.

    The shared cookie header is refreshed first, then the list endpoint is
    queried with ``X-CSRF-Token: fetch`` and a fresh millisecond timestamp
    in both the ``UTC`` header and the ``_`` query parameter.

    Returns the token found in the response headers, or an empty string
    when the response carries none.
    """
    _update_cookie_header()

    # Copy the shared dicts so the module-level defaults stay untouched.
    request_headers = {
        **EASYSQUARE_HEADERS,
        "X-CSRF-Token": "fetch",
        "Accept": "text/plain, */*; q=0.01",
        "UTC": _now_ms(),
    }
    query = {**EASYSQUARE_PARAMS, "_": _now_ms()}

    reply = session.get(EASYSQUARE_URL, headers=request_headers, params=query)

    for header_name in ("x-csrf-token", "X-CSRF-Token"):
        token = reply.headers.get(header_name)
        if token:
            return token
    return ""
|
||||
|
||||
|
||||
# setup query params for lwb session (submit stored filter form)
|
||||
def setup_query_params(headless: bool = True) -> bool:
    """Prepare the portal's search context by driving the UI with Playwright.

    This supersedes the earlier XML POST approach. The UI flow
    (``apply_search_via_ui``) is expected to:
    - click "Ich suche eine Wohnung"
    - open the "Immobiliensuche" card and click "MEHR ANZEIGEN"
    - set "Maximale Trefferanzahl" to 1000
    - click "Suchen"

    Cookie values returned by the flow replace the in-memory session
    tokens, and the shared Cookie header is rebuilt afterwards.

    Returns True when the flow ran, False when the bootstrap module could
    not be imported or the flow raised.
    """
    global COOKIE_SESSION, SAP_SESSIONID

    try:
        from src.lwb.session_bootstrap import apply_search_via_ui
    except Exception as import_error:
        print(f"Playwright-Setup nicht verfügbar: {import_error}")
        return False

    try:
        fresh = apply_search_via_ui(headless=headless, save_to_env=False)

        # Only overwrite a token when the browser actually delivered one.
        new_cookie = fresh.get("COOKIE_SESSION")
        if new_cookie:
            COOKIE_SESSION = new_cookie
        new_sap = fresh.get("SAP_SESSIONID")
        if new_sap:
            SAP_SESSIONID = new_sap

        _update_cookie_header()
    except Exception as ui_error:
        print(f"Fehler beim Setzen der Suchparameter per UI: {ui_error}")
        return False
    return True
|
||||
|
||||
# Call Session Create and get the session from the response cookies
|
||||
|
||||
|
||||
# Call Session Create and get the session from teh response cookies
|
||||
def create_session():
|
||||
# request url with chromium browser and get the cookies
|
||||
session = requests.Session()
|
||||
@@ -86,91 +141,142 @@ def create_session():
|
||||
if response.status_code != 200:
|
||||
print(f"Fehler Session von Easysquare: {response.status_code}")
|
||||
return []
|
||||
|
||||
|
||||
# get the cookies from the response
|
||||
cookies = response.cookies
|
||||
global COOKIE_SESSION
|
||||
COOKIE_SESSION = cookies.get("cookiesession1")
|
||||
print(COOKIE_SESSION)
|
||||
|
||||
|
||||
url = "https://portal1s.easysquare.com/meinelwb/api5/authenticate?api=6.169&sap-language=de"
|
||||
_update_cookie_header()
|
||||
url = (
|
||||
"https://portal1s.easysquare.com/meinelwb/api5/authenticate"
|
||||
"?api=6.169&sap-language=de"
|
||||
)
|
||||
|
||||
payload = {
|
||||
'sap-field_b64': "dXNlcj1ERU1PJnBhc3N3b3JkPXByb21vczE2"
|
||||
}
|
||||
|
||||
headers = {
|
||||
'DNT': '1',
|
||||
'UTC': '1738713279005',
|
||||
'Host': 'portal1s.easysquare.com',
|
||||
'host': 'portal1s.easysquare.com',
|
||||
'Accept': 'text/html, */*; q=0.01',
|
||||
'Cookie': f'esq-alias=%2fmeinelwb; sap-usercontext=sap-language=de&sap-client=581; cookiesession1={COOKIE_SESSION}',
|
||||
'Origin': 'https://portal1s.easysquare.com',
|
||||
'Referer': 'https://portal1s.easysquare.com/meinelwb/index.html?deeplink=%2FESQ_IA_REOBJ%2FESQ_VM_REOBJ_ALL',
|
||||
'Sec-GPC': '1',
|
||||
'Connection': 'keep-alive',
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:135.0) Gecko/20100101 Firefox/135.0',
|
||||
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
|
||||
'X-CSRF-Token': 'fetch',
|
||||
'Sec-Fetch-Dest': 'empty',
|
||||
'Sec-Fetch-Mode': 'cors',
|
||||
'Sec-Fetch-Site': 'same-origin',
|
||||
'Accept-Encoding': 'gzip, deflate, br, zstd',
|
||||
'Accept-Language': 'de,en-US;q=0.7,en;q=0.3',
|
||||
'X-Requested-With': 'XMLHttpRequest'
|
||||
'DNT': '1',
|
||||
'UTC': _now_ms(),
|
||||
'Host': 'portal1s.easysquare.com',
|
||||
'host': 'portal1s.easysquare.com',
|
||||
'Accept': 'text/html, */*; q=0.01',
|
||||
'Cookie': (
|
||||
'esq-alias=%2fmeinelwb; '
|
||||
'sap-usercontext=sap-language=de&sap-client=581; '
|
||||
f'cookiesession1={COOKIE_SESSION}'
|
||||
),
|
||||
'Origin': 'https://portal1s.easysquare.com',
|
||||
'Referer': SESSION_CREATE_URL,
|
||||
'Sec-GPC': '1',
|
||||
'Connection': 'keep-alive',
|
||||
'User-Agent': EASYSQUARE_HEADERS['User-Agent'],
|
||||
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
|
||||
'X-CSRF-Token': 'fetch',
|
||||
'Sec-Fetch-Dest': 'empty',
|
||||
'Sec-Fetch-Mode': 'cors',
|
||||
'Sec-Fetch-Site': 'same-origin',
|
||||
'Accept-Encoding': 'gzip, deflate, br, zstd',
|
||||
'Accept-Language': 'de,en-US;q=0.7,en;q=0.3',
|
||||
'X-Requested-With': 'XMLHttpRequest'
|
||||
}
|
||||
|
||||
print(headers)
|
||||
|
||||
response = requests.request("POST", url, headers=headers, data=payload)
|
||||
|
||||
print(response.text)
|
||||
response = requests.request(
|
||||
"POST",
|
||||
url,
|
||||
headers=headers,
|
||||
data=payload,
|
||||
)
|
||||
|
||||
if response.status_code != 200:
|
||||
print(f"Fehler beim Session Erstellen via Easysquare: {response.status_code}")
|
||||
print(
|
||||
f"Fehler beim Session Erstellen via Easysquare: "
|
||||
f"{response.status_code}"
|
||||
)
|
||||
return []
|
||||
|
||||
|
||||
# get the cookies from the response
|
||||
cookies = response.cookies
|
||||
global SAP_SESSIONID
|
||||
SAP_SESSIONID = cookies.get("SAP_SESSIONID_PP0_581")
|
||||
|
||||
print(SAP_SESSIONID)
|
||||
_update_cookie_header()
|
||||
print(f"SAP_SESSIONID_PP0_581: {SAP_SESSIONID}")
|
||||
|
||||
|
||||
|
||||
# Funktion: Scrape von Easysquare
|
||||
def _ensure_session_ready():
    """Make sure both session tokens are set before the portal is queried.

    Does nothing when COOKIE_SESSION and SAP_SESSIONID are already present.
    Otherwise the Playwright bootstrap (``fetch_session``) is tried first;
    any values it returns replace the in-memory tokens and the shared
    Cookie header is rebuilt. If a token is still missing afterwards,
    whether the bootstrap raised or simply found no cookie, the legacy
    request-based ``create_session()`` is used as fallback.
    """
    global COOKIE_SESSION, SAP_SESSIONID
    if COOKIE_SESSION and SAP_SESSIONID:
        return

    # First try the browser bootstrap for robust cookie capture.
    try:
        from src.lwb.session_bootstrap import fetch_session

        vals = fetch_session(headless=True, save_to_env=False)
        got_cookie = vals.get("COOKIE_SESSION")
        got_sap = vals.get("SAP_SESSIONID")
        if got_cookie:
            COOKIE_SESSION = got_cookie
        if got_sap:
            SAP_SESSIONID = got_sap
        _update_cookie_header()
    except Exception as e:
        # Report the reason instead of swallowing it; the fallback below
        # still runs.
        print(f"Session-Bootstrap per Browser fehlgeschlagen: {e}")

    # Fall back whenever a token is still missing, not only on exceptions.
    if not COOKIE_SESSION or not SAP_SESSIONID:
        create_session()
|
||||
|
||||
|
||||
def scrape_easysquare():
|
||||
_ensure_session_ready()
|
||||
|
||||
# Submit stored search/filter form to ensure listing context is ready
|
||||
try:
|
||||
setup_query_params()
|
||||
except Exception as e:
|
||||
# non-fatal; we'll still attempt list fetch
|
||||
print(f"Warnung: Setup der Suchparameter fehlgeschlagen: {e}")
|
||||
|
||||
# Build params with fresh timestamp
|
||||
params = dict(EASYSQUARE_PARAMS)
|
||||
params["_"] = _now_ms()
|
||||
|
||||
session = requests.Session()
|
||||
response = session.get(EASYSQUARE_URL, headers=EASYSQUARE_HEADERS, params=EASYSQUARE_PARAMS)
|
||||
|
||||
response = session.get(
|
||||
EASYSQUARE_URL,
|
||||
headers=EASYSQUARE_HEADERS,
|
||||
params=params,
|
||||
)
|
||||
|
||||
if response.status_code != 200:
|
||||
print(f"Fehler beim Abrufen von Easysquare: {response.status_code}")
|
||||
# print("Versuche Session zu erstellen")
|
||||
# create_session()
|
||||
return []
|
||||
|
||||
# XML-Daten parsen
|
||||
root = ET.fromstring(response.content)
|
||||
namespace = {"ns": "http://www.openpromos.com/OPPC/XMLForms"}
|
||||
|
||||
|
||||
properties = []
|
||||
for head in root.findall(".//ns:head", namespace):
|
||||
prop_title = head.find("ns:title", namespace).text
|
||||
subtitle = head.find("ns:subtitle", namespace).text
|
||||
abstract = head.find("ns:abstract", namespace).text.strip()
|
||||
title_el = head.find("ns:title", namespace)
|
||||
subtitle_el = head.find("ns:subtitle", namespace)
|
||||
abstract_el = head.find("ns:abstract", namespace)
|
||||
prop_title = title_el.text if title_el is not None else ""
|
||||
subtitle = subtitle_el.text if subtitle_el is not None else ""
|
||||
abstract = (
|
||||
abstract_el.text.strip()
|
||||
if abstract_el is not None and abstract_el.text
|
||||
else ""
|
||||
)
|
||||
|
||||
# get adress lat and long
|
||||
# <address city="" lat="51.346061" lon="12.3774656" postcode="" street=""/>
|
||||
|
||||
adress = head.find("ns:address", namespace)
|
||||
lat = adress.get("lat")
|
||||
lon = adress.get("lon")
|
||||
# get address lat/lon
|
||||
address_el = head.find("ns:address", namespace)
|
||||
lat = address_el.get("lat") if address_el is not None else ""
|
||||
lon = address_el.get("lon") if address_el is not None else ""
|
||||
|
||||
image = head.find("ns:image", namespace)
|
||||
iamge_resourceId = image.get("resourceId")
|
||||
|
||||
id = head.find("ns:id", namespace).text
|
||||
image_resource_id = (
|
||||
image.get("resourceId") if image is not None else ""
|
||||
)
|
||||
|
||||
# Details extrahieren
|
||||
rooms = "N/A"
|
||||
@@ -191,28 +297,42 @@ def scrape_easysquare():
|
||||
availability = value
|
||||
|
||||
# link create google maps link with lat and long
|
||||
link = f"https://www.google.com/maps/search/?api=1&query={lat},{lon}"
|
||||
link = (
|
||||
f"https://www.google.com/maps/search/?api=1&query={lat},{lon}"
|
||||
if lat and lon
|
||||
else ""
|
||||
)
|
||||
|
||||
# https://portal1s.easysquare.com/prorex/xmlforms/image.jpg?application=ESQ_IA_REOBJ&command=action&id=1EC8D4E6-191A-A827-47FF-72D8C5379070&name=get
|
||||
image_url = f"https://portal1s.easysquare.com/prorex/xmlforms/image.jpg?application=ESQ_IA_REOBJ&command=action&id={iamge_resourceId}&name=get"
|
||||
# image url for listing
|
||||
base_img = "https://portal1s.easysquare.com/prorex/xmlforms/image.jpg"
|
||||
image_url = (
|
||||
f"{base_img}?application=ESQ_IA_REOBJ&command=action"
|
||||
f"&id={image_resource_id}&name=get"
|
||||
if image_resource_id
|
||||
else ""
|
||||
)
|
||||
|
||||
# the id should be a hash create by the title, subtitle, rooms, size, rent, availability
|
||||
hashID = f"{prop_title}{subtitle}{rooms}{size}{rent}{availability}"
|
||||
id = hashlib.sha256(hashID.encode('utf-8')).hexdigest()
|
||||
# Hash from title, subtitle, rooms, size, rent, availability
|
||||
hash_id = (
|
||||
f"{prop_title}{subtitle}{rooms}{size}{rent}{availability}"
|
||||
)
|
||||
prop_id = hashlib.sha256(hash_id.encode('utf-8')).hexdigest()
|
||||
|
||||
properties.append({
|
||||
"id": id,
|
||||
"title": "LWB - " + prop_title,
|
||||
"subtitle": subtitle,
|
||||
"rooms": format.format_room(rooms),
|
||||
"size": format.format_roomSize(size),
|
||||
"rent": format.format_money(rent),
|
||||
"link": link,
|
||||
"abstract": abstract,
|
||||
"warm_rent": "",
|
||||
"availability": format.format_date(availability),
|
||||
"image_url": image_url,
|
||||
"owner": "LWB",
|
||||
})
|
||||
properties.append(
|
||||
{
|
||||
"id": prop_id,
|
||||
"title": f"LWB - {prop_title}",
|
||||
"subtitle": subtitle,
|
||||
"rooms": format.format_room(rooms),
|
||||
"size": format.format_roomSize(size),
|
||||
"rent": format.format_money(rent),
|
||||
"link": link,
|
||||
"abstract": abstract,
|
||||
"warm_rent": "",
|
||||
"availability": format.format_date(availability),
|
||||
"image_url": image_url,
|
||||
"owner": "LWB",
|
||||
}
|
||||
)
|
||||
|
||||
return properties
|
||||
return properties
|
||||
|
341
src/lwb/session_bootstrap.py
Normal file
341
src/lwb/session_bootstrap.py
Normal file
@@ -0,0 +1,341 @@
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from typing import Dict, Optional
|
||||
|
||||
from dotenv import load_dotenv
|
||||
|
||||
|
||||
SESSION_CREATE_URL = (
|
||||
"https://portal1s.easysquare.com/meinelwb/index.html"
|
||||
"?deeplink=%2FESQ_IA_REOBJ%2FESQ_VM_REOBJ_ALL"
|
||||
)
|
||||
|
||||
|
||||
def _update_env_file(values: Dict[str, str], env_path: str = ".env") -> None:
    """Create or update a dotenv file with the given key/value pairs.

    ``KEY=value`` lines whose key appears in *values* are rewritten in
    place with the new value; a repeated occurrence of such a key is
    dropped so a stale duplicate cannot shadow the update. Other
    assignments are kept (normalized to ``KEY=value``), and comments and
    blank lines are preserved. Keys not yet in the file are appended.

    Args:
        values: Mapping of variable names to the values to store.
        env_path: Path of the dotenv file; created when it does not exist.
    """
    written = set()
    output = []

    if os.path.exists(env_path):
        with open(env_path, "r", encoding="utf-8") as f:
            for raw in f:
                line = raw.strip()
                if not line or line.startswith("#") or "=" not in line:
                    # Comment, blank or unparsable line: keep it, and make
                    # sure it ends with a newline before anything follows.
                    output.append(raw.rstrip("\n") + "\n")
                    continue
                k, v = line.split("=", 1)
                if k in values:
                    if k in written:
                        continue
                    written.add(k)
                    v = values[k]
                output.append(f"{k}={v}\n")

    output.extend(
        f"{k}={v}\n" for k, v in values.items() if k not in written
    )

    with open(env_path, "w", encoding="utf-8") as f:
        f.writelines(output)
|
||||
|
||||
|
||||
def fetch_session(
    headless: bool = True,
    save_to_env: bool = True,
) -> Dict[str, Optional[str]]:
    """Open the LWB portal in Chromium and read its session cookies.

    The portal page is loaded, the "Ich suche eine Wohnung" entry is
    clicked on a best-effort basis, and the browser context is polled for
    the ``cookiesession1`` and ``SAP_SESSIONID_PP0_581`` cookies. When the
    SAP cookie does not appear, an authenticate request is issued from
    inside the page and the cookie is polled for once more.

    Args:
        headless: Launch the browser without a visible window.
        save_to_env: Write the cookie values that were found to ``.env``
            through ``_update_env_file``.

    Returns:
        A dict with the keys ``COOKIE_SESSION`` and ``SAP_SESSIONID``; a
        value is ``None`` when the matching cookie never showed up.

    Raises:
        Exception: The error raised while importing Playwright is re-raised
            after an installation hint is printed. Errors from launching
            the browser or from the initial navigation propagate too.
    """
    try:
        from playwright.sync_api import sync_playwright
    except Exception:  # pragma: no cover - dependency missing
        print(
            "Playwright not installed. Install: pip install playwright "
            "&& python -m playwright install"
        )
        raise

    load_dotenv()
    result: Dict[str, Optional[str]] = {
        "COOKIE_SESSION": None,
        "SAP_SESSIONID": None,
    }

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=headless)
        # Close the browser even when navigation or a later step raises.
        try:
            context = browser.new_context()
            page = context.new_page()
            page.goto(SESSION_CREATE_URL, wait_until="networkidle")

            # Click the entry button; the element may be exposed as a
            # button, a link or plain text, so try each in turn.
            entry_label = "Ich suche eine Wohnung"
            locators = (
                lambda: page.get_by_role("button", name=entry_label),
                lambda: page.get_by_role("link", name=entry_label),
                lambda: page.get_by_text(entry_label, exact=True),
            )
            for locate in locators:
                try:
                    locate().click(timeout=3000)
                    break
                except Exception:
                    # Best effort: fall through to the next locator.
                    continue

            # Let the portal finish any navigation after the click.
            try:
                page.wait_for_load_state("networkidle", timeout=5000)
            except Exception:
                pass  # best effort; cookies are polled below anyway

            # Poll for both cookies (up to 10 attempts, 0.5 s apart).
            cookie_session = None
            sap_session = None
            for _ in range(10):
                jar = {c["name"]: c["value"] for c in context.cookies()}
                if not cookie_session and "cookiesession1" in jar:
                    cookie_session = jar["cookiesession1"]
                if not sap_session and "SAP_SESSIONID_PP0_581" in jar:
                    sap_session = jar["SAP_SESSIONID_PP0_581"]
                if cookie_session and sap_session:
                    break
                time.sleep(0.5)

            # If the SAP session is not present, trigger authenticate from
            # the page context so the browser's own cookies are used.
            if not sap_session:
                try:
                    js_lines = [
                        "async () => {",
                        " const base = '/meinelwb/api5/authenticate';",
                        " const url = base + '?api=6.169&sap-language=de';",
                        " const body = new URLSearchParams({",
                        " 'sap-field_b64':",
                        " 'dXNlcj1ERU1PJnBhc3N3b3JkPXByb21vczE2'",
                        " }).toString();",
                        " await fetch(url, {",
                        " method: 'POST',",
                        " headers: {",
                        " 'Content-Type':",
                        " 'application/x-www-form-urlencoded;' +",
                        " ' charset=UTF-8',",
                        " 'X-CSRF-Token': 'fetch',",
                        " 'X-Requested-With': 'XMLHttpRequest'",
                        " },",
                        " body",
                        " });",
                        " return true;",
                        "}",
                    ]
                    page.evaluate("\n".join(js_lines))
                except Exception:
                    pass  # best effort; the re-check below decides

                # Re-check cookies for the SAP session.
                for _ in range(10):
                    jar = {c["name"]: c["value"] for c in context.cookies()}
                    if "SAP_SESSIONID_PP0_581" in jar:
                        sap_session = jar["SAP_SESSIONID_PP0_581"]
                        break
                    time.sleep(0.5)

            result["COOKIE_SESSION"] = cookie_session
            result["SAP_SESSIONID"] = sap_session
        finally:
            browser.close()

    if save_to_env:
        # Persist only the values that were actually found.
        to_write = {key: value for key, value in result.items() if value}
        if to_write:
            _update_env_file(to_write)

    return result
|
||||
|
||||
|
||||
def apply_search_via_ui(
    headless: bool = True,
    save_to_env: bool = True,
) -> Dict[str, Optional[str]]:
    """Drive the portal UI to initialize the search context.

    Steps (each one is best effort; a failing step does not abort the run):
    - Open the portal and click "Ich suche eine Wohnung".
    - In the services section titled "Immobiliensuche", click
      "MEHR ANZEIGEN".
    - Set "Maximale Trefferanzahl" to 1000 and click "Suchen".

    Args:
        headless: Launch the browser without a visible window.
        save_to_env: Write the cookie values that were found to ``.env``
            through ``_update_env_file``.

    Returns:
        A dict with the keys ``COOKIE_SESSION`` and ``SAP_SESSIONID`` taken
        from the browser context after the search click; a value is
        ``None`` when the matching cookie is absent.

    Raises:
        Exception: The error raised while importing Playwright is re-raised
            after an installation hint is printed. Errors from launching
            the browser or from the initial navigation propagate too.
    """
    try:
        from playwright.sync_api import sync_playwright
    except Exception:  # pragma: no cover
        print(
            "Playwright not installed. Install: pip install playwright "
            "&& python -m playwright install"
        )
        raise

    load_dotenv()
    result: Dict[str, Optional[str]] = {
        "COOKIE_SESSION": None,
        "SAP_SESSIONID": None,
    }

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=headless)
        # Close the browser even when navigation or a later step raises.
        try:
            context = browser.new_context()
            page = context.new_page()
            page.goto(SESSION_CREATE_URL, wait_until="networkidle")

            # Click the entry button; it may be exposed as a button, a link
            # or plain text, so try each locator in turn.
            clicked = False
            for sel in [
                lambda: page.get_by_role(
                    "button", name="Ich suche eine Wohnung"
                ).click(timeout=3000),
                lambda: page.get_by_role(
                    "link", name="Ich suche eine Wohnung"
                ).click(timeout=3000),
                lambda: page.get_by_text(
                    "Ich suche eine Wohnung", exact=True
                ).click(timeout=3000),
            ]:
                try:
                    sel()
                    clicked = True
                    break
                except Exception:
                    pass
            if clicked:
                try:
                    page.wait_for_load_state("networkidle", timeout=5000)
                except Exception:
                    pass

            # Click MEHR ANZEIGEN in the Immobiliensuche container, falling
            # back to any button with that name on the page.
            try:
                container = page.locator(
                    "div.easy-services-service-container"
                ).filter(has_text="Immobiliensuche")
                container.get_by_role(
                    "button", name="MEHR ANZEIGEN"
                ).first.click(timeout=5000)
            except Exception:
                try:
                    page.get_by_role("button", name="MEHR ANZEIGEN").click(
                        timeout=5000
                    )
                except Exception:
                    pass

            def _fill_max_results() -> bool:
                """Set the max-results field to 1000; report success."""
                # Try common label variations (incl. non-breaking space).
                for label in (
                    "Maximale Trefferanzahl",
                    "Maximale Trefferzahl",
                    "Maximale\u00A0Trefferanzahl",
                    "Maximale\u00A0Trefferzahl",
                ):
                    try:
                        page.get_by_label(label).fill("1000", timeout=3000)
                        return True
                    except Exception:
                        continue
                # Fallback: first input following the label text.
                try:
                    near = page.get_by_text("Maximale")
                    near_locator = near.locator(
                        "xpath=following::input[1]"
                    )
                    near_locator.fill("1000", timeout=3000)
                    return True
                except Exception:
                    return False

            # The helper handles its own errors, so no outer try is needed.
            _fill_max_results()

            # Click Suchen (button role first, exact text as fallback).
            for name in ["Suchen", "SUCHEN"]:
                try:
                    page.get_by_role("button", name=name).click(timeout=4000)
                    break
                except Exception:
                    try:
                        page.get_by_text(name, exact=True).click(
                            timeout=4000
                        )
                        break
                    except Exception:
                        continue

            try:
                page.wait_for_load_state("networkidle", timeout=6000)
            except Exception:
                pass

            # Collect cookies
            cookies = {c["name"]: c for c in context.cookies()}
            if "cookiesession1" in cookies:
                result["COOKIE_SESSION"] = cookies["cookiesession1"].get(
                    "value"
                )
            if "SAP_SESSIONID_PP0_581" in cookies:
                result["SAP_SESSIONID"] = cookies[
                    "SAP_SESSIONID_PP0_581"
                ].get("value")

            # Render the next page and keep it open for 10 seconds.
            page.wait_for_timeout(10000)
        finally:
            browser.close()

    if save_to_env:
        # Persist only the values that were actually found.
        to_write = {key: value for key, value in result.items() if value}
        if to_write:
            _update_env_file(to_write)

    return result
|
||||
|
||||
|
||||
if __name__ == "__main__":
    import argparse

    # Command-line entry point: fetch the cookies once and print them.
    cli = argparse.ArgumentParser(
        description="Fetch LWB session cookies via headless browser"
    )
    for flag, help_text in (
        ("--headed", "Run browser in headed mode"),
        ("--no-save", "Do not write values into .env"),
    ):
        cli.add_argument(flag, action="store_true", help=help_text)
    options = cli.parse_args()

    try:
        session_values = fetch_session(
            headless=not options.headed,
            save_to_env=not options.no_save,
        )
        cookie_value = session_values.get("COOKIE_SESSION")
        sap_value = session_values.get("SAP_SESSIONID")
        print("COOKIE_SESSION=", cookie_value)
        print("SAP_SESSIONID=", sap_value)
        if not cookie_value or not sap_value:
            print(
                "Warning: One or both values are missing. Try --headed to "
                "complete any prompts."
            )
    except Exception as exc:
        # Any failure ends the script with a non-zero exit status.
        print(f"Error while fetching session: {exc}")
        sys.exit(1)
|
@@ -17,10 +17,11 @@ def scrape_vlw():
|
||||
"senden": "suchen",
|
||||
}
|
||||
|
||||
#debug print url
|
||||
# print(f"Fetching VLW properties from: {url} with params: {parameter}")
|
||||
response = requests.get(url=url, params=parameter)
|
||||
soup = BeautifulSoup(response.content, 'html.parser')
|
||||
|
||||
|
||||
properties = []
|
||||
|
||||
# get div with class "estate-result-list"
|
||||
|
Reference in New Issue
Block a user