mirror of
https://gitlab.dit.htwk-leipzig.de/fsr-im/tools/flatscraper.git
synced 2025-08-30 01:53:52 +02:00
feat: automate session cookie retrieval and remove manual session ID setup
This commit is contained in:
@@ -37,8 +37,7 @@ You can run the bot natively on your machine or use a Docker image. The requirem
|
|||||||
### 1. Environment Setup
|
### 1. Environment Setup
|
||||||
|
|
||||||
Ensure that the `.env` file is configured correctly. An example is available in the `sample.env` file. Copy it to `.env` and fill in the required values.
|
Ensure that the `.env` file is configured correctly. An example is available in the `sample.env` file. Copy it to `.env` and fill in the required values.
|
||||||
The `SAP_SESSIONID` and `COOKIE_SESSION` are obtained after performing a search on the LWB website. Use your browser's developer tools to locate them in local storage.
|
You no longer need to set `SAP_SESSIONID` or `COOKIE_SESSION` manually. The scraper opens the LWB portal with a headless browser, clicks “Ich suche eine Wohnung”, opens “Immobiliensuche” → “MEHR ANZEIGEN”, sets the maximum results to 1000, clicks “Suchen”, and extracts session cookies automatically.
|
||||||
*Future versions will include automatic form processing to obtain a valid session ID.*
|
|
||||||
|
|
||||||
### 2. Python Environment
|
### 2. Python Environment
|
||||||
|
|
||||||
|
@@ -1,3 +1 @@
|
|||||||
SAP_SESSIONID=UrN6nRbjuCBe4dkLw7vkJLcpV5zniRHvhkwAAKG5Agg%3d
|
|
||||||
COOKIE_SESSION=678ADA67ADF0813997206FE9F4132819
|
|
||||||
WEBHOOK_URL=https://discord.com/api/webhooks/1327600813367432462/goqeWDyYwi13-6F0yopUzFkHVaZs01bCe-2SI8bPJLj3WNMhxLOlIYBRIGyTpSzLKAaa
|
WEBHOOK_URL=https://discord.com/api/webhooks/1327600813367432462/goqeWDyYwi13-6F0yopUzFkHVaZs01bCe-2SI8bPJLj3WNMhxLOlIYBRIGyTpSzLKAaa
|
@@ -2,20 +2,16 @@ import requests
|
|||||||
import xml.etree.ElementTree as ET
|
import xml.etree.ElementTree as ET
|
||||||
import src.lwb.format as format
|
import src.lwb.format as format
|
||||||
import hashlib
|
import hashlib
|
||||||
import os
|
|
||||||
import time
|
import time
|
||||||
from dotenv import load_dotenv
|
|
||||||
|
|
||||||
load_dotenv()
|
|
||||||
|
|
||||||
SESSION_CREATE_URL = (
|
SESSION_CREATE_URL = (
|
||||||
"https://portal1s.easysquare.com/meinelwb/index.html"
|
"https://portal1s.easysquare.com/meinelwb/index.html"
|
||||||
"?deeplink=%2FESQ_IA_REOBJ%2FESQ_VM_REOBJ_ALL"
|
"?deeplink=%2FESQ_IA_REOBJ%2FESQ_VM_REOBJ_ALL"
|
||||||
)
|
)
|
||||||
|
|
||||||
# Session tokens (from .env as fallback; will be refreshed dynamically)
|
# Session tokens (resolved dynamically via session bootstrap; no .env needed)
|
||||||
SAP_SESSIONID = os.getenv("SAP_SESSIONID")
|
SAP_SESSIONID = None
|
||||||
COOKIE_SESSION = os.getenv("COOKIE_SESSION")
|
COOKIE_SESSION = None
|
||||||
|
|
||||||
EASYSQUARE_URL = "https://portal1s.easysquare.com/prorex/xmlforms"
|
EASYSQUARE_URL = "https://portal1s.easysquare.com/prorex/xmlforms"
|
||||||
|
|
||||||
|
@@ -11,6 +11,30 @@ SESSION_CREATE_URL = (
|
|||||||
"?deeplink=%2FESQ_IA_REOBJ%2FESQ_VM_REOBJ_ALL"
|
"?deeplink=%2FESQ_IA_REOBJ%2FESQ_VM_REOBJ_ALL"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# UI string constants
|
||||||
|
BTN_SEARCH_ENTRY = "Ich suche eine Wohnung"
|
||||||
|
BTN_MORE = "MEHR ANZEIGEN"
|
||||||
|
BTN_SEARCH = ("Suchen", "SUCHEN")
|
||||||
|
LABEL_MAX_RESULTS = (
|
||||||
|
"Maximale Trefferanzahl",
|
||||||
|
"Maximale Trefferzahl",
|
||||||
|
"Maximale\u00A0Trefferanzahl",
|
||||||
|
"Maximale\u00A0Trefferzahl",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _cookie_map(context) -> dict:
|
||||||
|
"""Return a name->cookie dict safely from Playwright context cookies."""
|
||||||
|
mapping = {}
|
||||||
|
try:
|
||||||
|
for c in context.cookies(): # type: ignore[attr-defined]
|
||||||
|
name = c.get("name") if isinstance(c, dict) else None
|
||||||
|
if name:
|
||||||
|
mapping[name] = c
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
return mapping
|
||||||
|
|
||||||
|
|
||||||
def _update_env_file(values: Dict[str, str], env_path: str = ".env") -> None:
|
def _update_env_file(values: Dict[str, str], env_path: str = ".env") -> None:
|
||||||
"""Create or update .env with given key/value pairs."""
|
"""Create or update .env with given key/value pairs."""
|
||||||
@@ -31,7 +55,7 @@ def _update_env_file(values: Dict[str, str], env_path: str = ".env") -> None:
|
|||||||
|
|
||||||
def fetch_session(
|
def fetch_session(
|
||||||
headless: bool = True,
|
headless: bool = True,
|
||||||
save_to_env: bool = True,
|
save_to_env: bool = False,
|
||||||
) -> Dict[str, Optional[str]]:
|
) -> Dict[str, Optional[str]]:
|
||||||
"""Open the LWB portal in a headless browser and extract cookies.
|
"""Open the LWB portal in a headless browser and extract cookies.
|
||||||
|
|
||||||
@@ -47,7 +71,11 @@ def fetch_session(
|
|||||||
)
|
)
|
||||||
raise
|
raise
|
||||||
|
|
||||||
load_dotenv()
|
# .env not required for cookie fetch; loading is harmless but optional
|
||||||
|
try:
|
||||||
|
load_dotenv()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
result: Dict[str, Optional[str]] = {
|
result: Dict[str, Optional[str]] = {
|
||||||
"COOKIE_SESSION": None,
|
"COOKIE_SESSION": None,
|
||||||
"SAP_SESSIONID": None,
|
"SAP_SESSIONID": None,
|
||||||
@@ -61,19 +89,13 @@ def fetch_session(
|
|||||||
|
|
||||||
# Click the entry button: "Ich suche eine Wohnung"
|
# Click the entry button: "Ich suche eine Wohnung"
|
||||||
try:
|
try:
|
||||||
page.get_by_role(
|
page.get_by_role("button", name=BTN_SEARCH_ENTRY).click(timeout=3000)
|
||||||
"button", name="Ich suche eine Wohnung"
|
|
||||||
).click(timeout=3000)
|
|
||||||
except Exception:
|
except Exception:
|
||||||
try:
|
try:
|
||||||
page.get_by_role(
|
page.get_by_role("link", name=BTN_SEARCH_ENTRY).click(timeout=3000)
|
||||||
"link", name="Ich suche eine Wohnung"
|
|
||||||
).click(timeout=3000)
|
|
||||||
except Exception:
|
except Exception:
|
||||||
try:
|
try:
|
||||||
page.get_by_text(
|
page.get_by_text(BTN_SEARCH_ENTRY, exact=True).click(timeout=3000)
|
||||||
"Ich suche eine Wohnung", exact=True
|
|
||||||
).click(timeout=3000)
|
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@@ -87,11 +109,11 @@ def fetch_session(
|
|||||||
cookie_session = None
|
cookie_session = None
|
||||||
sap_session = None
|
sap_session = None
|
||||||
for _ in range(10):
|
for _ in range(10):
|
||||||
cookies = {c["name"]: c for c in context.cookies()}
|
cookies = _cookie_map(context)
|
||||||
if not cookie_session and "cookiesession1" in cookies:
|
if not cookie_session and "cookiesession1" in cookies:
|
||||||
cookie_session = cookies["cookiesession1"]["value"]
|
cookie_session = cookies.get("cookiesession1", {}).get("value")
|
||||||
if not sap_session and "SAP_SESSIONID_PP0_581" in cookies:
|
if not sap_session and "SAP_SESSIONID_PP0_581" in cookies:
|
||||||
sap_session = cookies["SAP_SESSIONID_PP0_581"]["value"]
|
sap_session = cookies.get("SAP_SESSIONID_PP0_581", {}).get("value")
|
||||||
if cookie_session and sap_session:
|
if cookie_session and sap_session:
|
||||||
break
|
break
|
||||||
time.sleep(0.5)
|
time.sleep(0.5)
|
||||||
@@ -127,9 +149,9 @@ def fetch_session(
|
|||||||
|
|
||||||
# Re-check cookies for SAP session
|
# Re-check cookies for SAP session
|
||||||
for _ in range(10):
|
for _ in range(10):
|
||||||
cookies = {c["name"]: c for c in context.cookies()}
|
cookies = _cookie_map(context)
|
||||||
if "SAP_SESSIONID_PP0_581" in cookies:
|
if "SAP_SESSIONID_PP0_581" in cookies:
|
||||||
sap_session = cookies["SAP_SESSIONID_PP0_581"]["value"]
|
sap_session = cookies.get("SAP_SESSIONID_PP0_581", {}).get("value")
|
||||||
break
|
break
|
||||||
time.sleep(0.5)
|
time.sleep(0.5)
|
||||||
|
|
||||||
@@ -162,7 +184,7 @@ def fetch_session(
|
|||||||
|
|
||||||
def apply_search_via_ui(
|
def apply_search_via_ui(
|
||||||
headless: bool = True,
|
headless: bool = True,
|
||||||
save_to_env: bool = True,
|
save_to_env: bool = False,
|
||||||
) -> Dict[str, Optional[str]]:
|
) -> Dict[str, Optional[str]]:
|
||||||
"""Drive the UI to initialize the search context.
|
"""Drive the UI to initialize the search context.
|
||||||
|
|
||||||
@@ -182,7 +204,10 @@ def apply_search_via_ui(
|
|||||||
)
|
)
|
||||||
raise
|
raise
|
||||||
|
|
||||||
load_dotenv()
|
try:
|
||||||
|
load_dotenv()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
result: Dict[str, Optional[str]] = {
|
result: Dict[str, Optional[str]] = {
|
||||||
"COOKIE_SESSION": None,
|
"COOKIE_SESSION": None,
|
||||||
"SAP_SESSIONID": None,
|
"SAP_SESSIONID": None,
|
||||||
@@ -197,15 +222,9 @@ def apply_search_via_ui(
|
|||||||
# Click entry button (robust tries)
|
# Click entry button (robust tries)
|
||||||
clicked = False
|
clicked = False
|
||||||
for sel in [
|
for sel in [
|
||||||
lambda: page.get_by_role(
|
lambda: page.get_by_role("button", name=BTN_SEARCH_ENTRY).click(timeout=3000),
|
||||||
"button", name="Ich suche eine Wohnung"
|
lambda: page.get_by_role("link", name=BTN_SEARCH_ENTRY).click(timeout=3000),
|
||||||
).click(timeout=3000),
|
lambda: page.get_by_text(BTN_SEARCH_ENTRY, exact=True).click(timeout=3000),
|
||||||
lambda: page.get_by_role(
|
|
||||||
"link", name="Ich suche eine Wohnung"
|
|
||||||
).click(timeout=3000),
|
|
||||||
lambda: page.get_by_text(
|
|
||||||
"Ich suche eine Wohnung", exact=True
|
|
||||||
).click(timeout=3000),
|
|
||||||
]:
|
]:
|
||||||
try:
|
try:
|
||||||
sel()
|
sel()
|
||||||
@@ -224,26 +243,17 @@ def apply_search_via_ui(
|
|||||||
container = page.locator(
|
container = page.locator(
|
||||||
"div.easy-services-service-container"
|
"div.easy-services-service-container"
|
||||||
).filter(has_text="Immobiliensuche")
|
).filter(has_text="Immobiliensuche")
|
||||||
container.get_by_role(
|
container.get_by_role("button", name=BTN_MORE).first.click(timeout=5000)
|
||||||
"button", name="MEHR ANZEIGEN"
|
|
||||||
).first.click(timeout=5000)
|
|
||||||
except Exception:
|
except Exception:
|
||||||
try:
|
try:
|
||||||
page.get_by_role("button", name="MEHR ANZEIGEN").click(
|
page.get_by_role("button", name=BTN_MORE).click(timeout=5000)
|
||||||
timeout=5000
|
|
||||||
)
|
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
# Fill "Maximale Trefferanzahl" to 1000
|
# Fill "Maximale Trefferanzahl" to 1000
|
||||||
def _fill_max_results():
|
def _fill_max_results():
|
||||||
# Try common label variations
|
# Try common label variations
|
||||||
for label in (
|
for label in LABEL_MAX_RESULTS:
|
||||||
"Maximale Trefferanzahl",
|
|
||||||
"Maximale Trefferzahl",
|
|
||||||
"Maximale\u00A0Trefferanzahl",
|
|
||||||
"Maximale\u00A0Trefferzahl",
|
|
||||||
):
|
|
||||||
try:
|
try:
|
||||||
page.get_by_label(label).fill("1000", timeout=3000)
|
page.get_by_label(label).fill("1000", timeout=3000)
|
||||||
return True
|
return True
|
||||||
@@ -266,7 +276,7 @@ def apply_search_via_ui(
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
# Click Suchen
|
# Click Suchen
|
||||||
for name in ["Suchen", "SUCHEN"]:
|
for name in BTN_SEARCH:
|
||||||
try:
|
try:
|
||||||
page.get_by_role("button", name=name).click(timeout=4000)
|
page.get_by_role("button", name=name).click(timeout=4000)
|
||||||
break
|
break
|
||||||
@@ -283,15 +293,11 @@ def apply_search_via_ui(
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
# Collect cookies
|
# Collect cookies
|
||||||
cookies = {c["name"]: c for c in context.cookies()}
|
cookies = _cookie_map(context)
|
||||||
if "cookiesession1" in cookies:
|
if "cookiesession1" in cookies:
|
||||||
result["COOKIE_SESSION"] = cookies["cookiesession1"].get(
|
result["COOKIE_SESSION"] = cookies.get("cookiesession1", {}).get("value")
|
||||||
"value"
|
|
||||||
)
|
|
||||||
if "SAP_SESSIONID_PP0_581" in cookies:
|
if "SAP_SESSIONID_PP0_581" in cookies:
|
||||||
result["SAP_SESSIONID"] = cookies["SAP_SESSIONID_PP0_581"].get(
|
result["SAP_SESSIONID"] = cookies.get("SAP_SESSIONID_PP0_581", {}).get("value")
|
||||||
"value"
|
|
||||||
)
|
|
||||||
|
|
||||||
# render next page and keep it open for 10 seconds
|
# render next page and keep it open for 10 seconds
|
||||||
page.wait_for_timeout(10000)
|
page.wait_for_timeout(10000)
|
||||||
|
Reference in New Issue
Block a user