mirror of
https://gitlab.dit.htwk-leipzig.de/fsr-im/tools/flatscraper.git
synced 2025-08-30 01:53:52 +02:00
Compare commits
4 Commits
4f62e2709a
...
main
Author | SHA1 | Date | |
---|---|---|---|
![]() |
825ffe7743 | ||
![]() |
401eafcca5
|
||
![]() |
2c8cea7645
|
||
![]() |
78152dafc0
|
@@ -37,8 +37,7 @@ You can run the bot natively on your machine or use a Docker image. The requirem
|
|||||||
### 1. Environment Setup
|
### 1. Environment Setup
|
||||||
|
|
||||||
Ensure that the `.env` file is configured correctly. An example is available in the `sample.env` file. Copy it to `.env` and fill in the required values.
|
Ensure that the `.env` file is configured correctly. An example is available in the `sample.env` file. Copy it to `.env` and fill in the required values.
|
||||||
The `SAP_SESSIONID` and `COOKIE_SESSSION` are obtained after performing a search on the LWB website. Use your browser's developer tools to locate them in local storage.
|
You no longer need to set `SAP_SESSIONID` or `COOKIE_SESSION` manually. The scraper opens the LWB portal with a headless browser, clicks “Ich suche eine Wohnung”, opens “Immobiliensuche” → “MEHR ANZEIGEN”, sets the maximum results to 1000, clicks “Suchen”, and extracts session cookies automatically.
|
||||||
*Future versions will include automatic form processing to obtain a valid session ID.*
|
|
||||||
|
|
||||||
### 2. Python Environment
|
### 2. Python Environment
|
||||||
|
|
||||||
|
13
main.py
13
main.py
@@ -1,4 +1,3 @@
|
|||||||
from bs4 import BeautifulSoup
|
|
||||||
import json
|
import json
|
||||||
import time
|
import time
|
||||||
import src.wogetra.scraper as wogetra_scraper
|
import src.wogetra.scraper as wogetra_scraper
|
||||||
@@ -14,6 +13,7 @@ TARGET_URL = "https://www.wogetra.de/immobilien-vermarktungsart/miete/"
|
|||||||
# Store known property IDs to avoid duplicate notifications
|
# Store known property IDs to avoid duplicate notifications
|
||||||
known_properties = set()
|
known_properties = set()
|
||||||
|
|
||||||
|
|
||||||
# Main loop to periodically check for new listings
|
# Main loop to periodically check for new listings
|
||||||
def main():
|
def main():
|
||||||
global known_properties
|
global known_properties
|
||||||
@@ -28,11 +28,11 @@ def main():
|
|||||||
while True:
|
while True:
|
||||||
current_time = time.strftime("%H:%M:%S", time.localtime())
|
current_time = time.strftime("%H:%M:%S", time.localtime())
|
||||||
print("Scraping properties at " + current_time)
|
print("Scraping properties at " + current_time)
|
||||||
|
|
||||||
properties_wogetra = wogetra_scraper.scrape_wogetra()
|
properties_wogetra = wogetra_scraper.scrape_wogetra()
|
||||||
print("Scraped " + str(len(properties_wogetra)) + " properties from Wogetra")
|
print("Scraped " + str(len(properties_wogetra)) + " properties from Wogetra")
|
||||||
properties = properties_wogetra
|
properties = properties_wogetra
|
||||||
|
|
||||||
properties_lwb = lwb_scraper.scrape_easysquare()
|
properties_lwb = lwb_scraper.scrape_easysquare()
|
||||||
print("Scraped " + str(len(properties_lwb)) + " properties from LWB")
|
print("Scraped " + str(len(properties_lwb)) + " properties from LWB")
|
||||||
properties += properties_lwb
|
properties += properties_lwb
|
||||||
@@ -40,15 +40,14 @@ def main():
|
|||||||
properties_lipsia = lipsia_scraper.scrape_lipsia()
|
properties_lipsia = lipsia_scraper.scrape_lipsia()
|
||||||
print("Scraped " + str(len(properties_lipsia)) + " properties from Lipsia")
|
print("Scraped " + str(len(properties_lipsia)) + " properties from Lipsia")
|
||||||
properties += properties_lipsia
|
properties += properties_lipsia
|
||||||
|
|
||||||
properties_bgl = bgl_scraper.fetch_all_properties()
|
properties_bgl = bgl_scraper.fetch_all_properties()
|
||||||
print("Scraped " + str(len(properties_bgl)) + " properties from BGL")
|
print("Scraped " + str(len(properties_bgl)) + " properties from BGL")
|
||||||
properties += properties_bgl
|
properties += properties_bgl
|
||||||
|
|
||||||
properties_vlw = vlw_scraper.scrape_vlw()
|
properties_vlw = vlw_scraper.scrape_vlw()
|
||||||
print("Scraped " + str(len(properties_vlw)) + " properties from VLW")
|
print("Scraped " + str(len(properties_vlw)) + " properties from VLW")
|
||||||
properties = properties_vlw
|
properties += properties_vlw
|
||||||
|
|
||||||
|
|
||||||
for prop in properties:
|
for prop in properties:
|
||||||
if prop["id"] not in known_properties:
|
if prop["id"] not in known_properties:
|
||||||
@@ -56,7 +55,6 @@ def main():
|
|||||||
localwebhook.send_to_discord(prop)
|
localwebhook.send_to_discord(prop)
|
||||||
known_properties.add(prop["id"])
|
known_properties.add(prop["id"])
|
||||||
|
|
||||||
|
|
||||||
# save known properties to file
|
# save known properties to file
|
||||||
with open("known_properties.json", "w") as file:
|
with open("known_properties.json", "w") as file:
|
||||||
json.dump(list(known_properties), file)
|
json.dump(list(known_properties), file)
|
||||||
@@ -65,5 +63,6 @@ def main():
|
|||||||
print("Waiting for the next check...")
|
print("Waiting for the next check...")
|
||||||
time.sleep(300) # Check every 5 minutes
|
time.sleep(300) # Check every 5 minutes
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
|
@@ -7,3 +7,4 @@ requests==2.32.3
|
|||||||
soupsieve==2.6
|
soupsieve==2.6
|
||||||
typing_extensions==4.12.2
|
typing_extensions==4.12.2
|
||||||
urllib3==2.3.0
|
urllib3==2.3.0
|
||||||
|
playwright==1.45.0
|
||||||
|
@@ -1,3 +1 @@
|
|||||||
SAP_SESSIONID=UrN6nRbjuCBe4dkLw7vkJLcpV5zniRHvhkwAAKG5Agg%3d
|
|
||||||
COOKIE_SESSION=678ADA67ADF0813997206FE9F4132819
|
|
||||||
WEBHOOK_URL=https://discord.com/api/webhooks/1327600813367432462/goqeWDyYwi13-6F0yopUzFkHVaZs01bCe-2SI8bPJLj3WNMhxLOlIYBRIGyTpSzLKAaa
|
WEBHOOK_URL=https://discord.com/api/webhooks/1327600813367432462/goqeWDyYwi13-6F0yopUzFkHVaZs01bCe-2SI8bPJLj3WNMhxLOlIYBRIGyTpSzLKAaa
|
@@ -34,7 +34,8 @@ def fetch_all_properties():
|
|||||||
'Upgrade-Insecure-Requests': '1',
|
'Upgrade-Insecure-Requests': '1',
|
||||||
'Cache-Control': 'max-age=0',
|
'Cache-Control': 'max-age=0',
|
||||||
}
|
}
|
||||||
|
# debug print url
|
||||||
|
# print(url)
|
||||||
response = requests.request("GET", url, headers=headers)
|
response = requests.request("GET", url, headers=headers)
|
||||||
|
|
||||||
soup = BeautifulSoup(response.text, "html.parser")
|
soup = BeautifulSoup(response.text, "html.parser")
|
||||||
|
@@ -8,6 +8,7 @@ load_dotenv()
|
|||||||
# Webhook URL from Discord
|
# Webhook URL from Discord
|
||||||
WEBHOOK_URL = os.getenv("WEBHOOK_URL")
|
WEBHOOK_URL = os.getenv("WEBHOOK_URL")
|
||||||
|
|
||||||
|
|
||||||
# Funktion: Nachricht an Discord senden
|
# Funktion: Nachricht an Discord senden
|
||||||
def send_to_discord(property_data):
|
def send_to_discord(property_data):
|
||||||
|
|
||||||
@@ -23,10 +24,9 @@ def send_to_discord(property_data):
|
|||||||
f"**Beschreibung:** {property_data['abstract']}"
|
f"**Beschreibung:** {property_data['abstract']}"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
# Set headers
|
# Set headers
|
||||||
headers = {"Content-Type": "application/json"}
|
headers = {"Content-Type": "application/json"}
|
||||||
|
|
||||||
# Check for optional image URL
|
# Check for optional image URL
|
||||||
if "image_url" in property_data and property_data["image_url"]:
|
if "image_url" in property_data and property_data["image_url"]:
|
||||||
try:
|
try:
|
||||||
|
File diff suppressed because one or more lines are too long
@@ -1,5 +1,5 @@
|
|||||||
import requests
|
import requests
|
||||||
from src.lwb.scraper import EASYSQUARE_HEADERS, EASYSQUARE_PARAMS
|
from src.lwb.scraper import EASYSQUARE_HEADERS
|
||||||
|
|
||||||
def scrape_image(url, owner):
|
def scrape_image(url, owner):
|
||||||
session = requests.Session()
|
session = requests.Session()
|
||||||
@@ -19,10 +19,14 @@ def scrape_image(url, owner):
|
|||||||
# return empty image
|
# return empty image
|
||||||
return b''
|
return b''
|
||||||
|
|
||||||
if owner == "LWB":
|
if owner == "LWB":
|
||||||
response = session.get(url, headers=EASYSQUARE_HEADERS, params=EASYSQUARE_PARAMS)
|
# Image URL already carries the required params; only send headers
|
||||||
|
response = session.get(url, headers=EASYSQUARE_HEADERS)
|
||||||
if response.status_code != 200:
|
if response.status_code != 200:
|
||||||
print(f"Fehler beim Abrufen von Easysquare: {response.status_code}")
|
print(
|
||||||
|
f"Fehler beim Abrufen von Easysquare: "
|
||||||
|
f"{response.status_code}"
|
||||||
|
)
|
||||||
# return empty image
|
# return empty image
|
||||||
return b''
|
return b''
|
||||||
|
|
||||||
@@ -30,8 +34,10 @@ def scrape_image(url, owner):
|
|||||||
if response is None:
|
if response is None:
|
||||||
response = session.get(url)
|
response = session.get(url)
|
||||||
if response.status_code != 200:
|
if response.status_code != 200:
|
||||||
print(f"Fehler beim Abrufen der Standardquelle: {response.status_code}")
|
print(
|
||||||
|
f"Fehler beim Abrufen der Standardquelle: "
|
||||||
|
f"{response.status_code}"
|
||||||
|
)
|
||||||
return b''
|
return b''
|
||||||
|
|
||||||
|
|
||||||
return response.content
|
return response.content
|
||||||
|
@@ -2,82 +2,133 @@ import requests
|
|||||||
import xml.etree.ElementTree as ET
|
import xml.etree.ElementTree as ET
|
||||||
import src.lwb.format as format
|
import src.lwb.format as format
|
||||||
import hashlib
|
import hashlib
|
||||||
import os
|
import time
|
||||||
from dotenv import load_dotenv
|
|
||||||
|
|
||||||
load_dotenv()
|
SESSION_CREATE_URL = (
|
||||||
|
"https://portal1s.easysquare.com/meinelwb/index.html"
|
||||||
|
"?deeplink=%2FESQ_IA_REOBJ%2FESQ_VM_REOBJ_ALL"
|
||||||
|
)
|
||||||
|
|
||||||
SESSION_CREATE_URL = "https://portal1s.easysquare.com/meinelwb/index.html?deeplink=%2FESQ_IA_REOBJ%2FESQ_VM_REOBJ_ALL"
|
# Session tokens (resolved dynamically via session bootstrap; no .env needed)
|
||||||
|
SAP_SESSIONID = None
|
||||||
SAP_SESSIONID = os.getenv("SAP_SESSIONID")
|
COOKIE_SESSION = None
|
||||||
COOKIE_SESSION = os.getenv("COOKIE_SESSION")
|
|
||||||
|
|
||||||
EASYSQUARE_URL = "https://portal1s.easysquare.com/prorex/xmlforms"
|
EASYSQUARE_URL = "https://portal1s.easysquare.com/prorex/xmlforms"
|
||||||
|
|
||||||
|
# Mutable headers so imports see updates when we refresh session values
|
||||||
EASYSQUARE_HEADERS = {
|
EASYSQUARE_HEADERS = {
|
||||||
"DNT": "1",
|
"DNT": "1",
|
||||||
"Host": "portal1s.easysquare.com",
|
"Host": "portal1s.easysquare.com",
|
||||||
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
"Accept": (
|
||||||
"Cookie": f"SAP_SESSIONID_PP0_581={SAP_SESSIONID}; sap-usercontext=sap-language=D&sap-client=581; cookiesession1={COOKIE_SESSION}",
|
"text/html,application/xhtml+xml,"
|
||||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:134.0) Gecko/20100101 Firefox/134.0",
|
"application/xml;q=0.9,*/*;q=0.8"
|
||||||
|
),
|
||||||
|
# Cookie gets filled by _update_cookie_header()
|
||||||
|
"Cookie": "",
|
||||||
|
"User-Agent": (
|
||||||
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:135.0) "
|
||||||
|
"Gecko/20100101 Firefox/135.0"
|
||||||
|
),
|
||||||
"Accept-Encoding": "gzip, deflate, br, zstd",
|
"Accept-Encoding": "gzip, deflate, br, zstd",
|
||||||
"Accept-Language": "de,en-US;q=0.7,en;q=0.3",
|
"Accept-Language": "de,en-US;q=0.7,en;q=0.3",
|
||||||
"Upgrade-Insecure-Requests": "1"
|
"Upgrade-Insecure-Requests": "1",
|
||||||
|
# Helpful headers observed from network calls
|
||||||
|
"X-Requested-With": "XMLHttpRequest",
|
||||||
|
"Referer": SESSION_CREATE_URL,
|
||||||
}
|
}
|
||||||
EASYSQUARE_PARAMS = {
|
EASYSQUARE_PARAMS = {
|
||||||
"application": "ESQ_IA_REOBJ",
|
"application": "ESQ_IA_REOBJ",
|
||||||
"sap-client": "581",
|
"sap-client": "581",
|
||||||
"command": "action",
|
"command": "action",
|
||||||
"name": "boxlist",
|
"name": "boxlist",
|
||||||
"api": "6.169",
|
"api": "6.249",
|
||||||
"head-oppc-version": "6.169.22",
|
"head-oppc-version": "6.249.1",
|
||||||
"_": "1736761256321"
|
# dynamic '_' gets applied at request time; leave placeholder
|
||||||
|
"_": "1755259702945",
|
||||||
}
|
}
|
||||||
|
|
||||||
SETUP_QUERY_PARAMS_URL = "https://portal1s.easysquare.com/prorex/xmlforms?application=ESQ_IA_REOBJ&sap-client=581&command=action&name=save&id=4B76A3C8-3E4D-4217-B54A-7C28C920748C&api=6.169&head-oppc-version=6.169.22&originalId=842F0073-DC21-A841-4E80-B1BD5E404E35&resourceOrigin=form"
|
SETUP_QUERY_PARAMS_URL = (
|
||||||
|
"https://portal1s.easysquare.com/prorex/xmlforms"
|
||||||
|
"?application=ESQ_IA_REOBJ"
|
||||||
|
"&sap-client=581"
|
||||||
|
"&command=action"
|
||||||
|
"&name=save"
|
||||||
|
"&id=E3920A27-432A-4127-96FC-6433ED32FDDE"
|
||||||
|
"&api=6.249"
|
||||||
|
"&head-oppc-version=6.249.1"
|
||||||
|
"&originalId=3C9DAA99-1C5D-4810-5B5E-AFE704639EF5"
|
||||||
|
"&resourceOrigin=form"
|
||||||
|
)
|
||||||
|
|
||||||
# curl --location 'https://portal1s.easysquare.com/prorex/xmlforms?application=ESQ_IA_REOBJ&sap-client=581&command=action&name=boxlist&api=6.169&head-oppc-version=6.169.22&_=1736761255682' \
|
# Example curl reference omitted for brevity.
|
||||||
# --header 'DNT: 1' \
|
|
||||||
# --header 'UTC: 1736761256321' \
|
|
||||||
# --header 'Host: portal1s.easysquare.com' \
|
|
||||||
# --header 'host: portal1s.easysquare.com' \
|
|
||||||
# --header 'Accept: text/plain, */*; q=0.01' \
|
|
||||||
# --header 'Cookie: cookiesession1=678ADA67ADF0813997206FE9F4133118; sap-usercontext=sap-language=de&sap-client=581; SAP_SESSIONID_PP0_581=Vg3w4pn8whD76BldaU2wvP-YzyrRkRHvhWoKELG5Agg%3d' \
|
|
||||||
# --header 'Referer: https://portal1s.easysquare.com/meinelwb/index.html?deeplink=%2FESQ_IA_REOBJ%2FESQ_VM_REOBJ_ALL' \
|
|
||||||
# --header 'Sec-GPC: 1' \
|
|
||||||
# --header 'oppc-id: D9925A2D-4ED9-4911-8AD3-2626DA41FBB0' \
|
|
||||||
# --header 'Connection: keep-alive' \
|
|
||||||
# --header 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:134.0) Gecko/20100101 Firefox/134.0' \
|
|
||||||
# --header 'Content-Type: text/plain;charset=UTF-8' \
|
|
||||||
# --header 'Sec-Fetch-Dest: empty' \
|
|
||||||
# --header 'Sec-Fetch-Mode: cors' \
|
|
||||||
# --header 'Sec-Fetch-Site: same-origin' \
|
|
||||||
# --header 'Accept-Encoding: gzip, deflate, br, zstd' \
|
|
||||||
# --header 'Accept-Language: de,en-US;q=0.7,en;q=0.3' \
|
|
||||||
# --header 'X-Requested-With: XMLHttpRequest'
|
|
||||||
|
|
||||||
|
|
||||||
# setup query params for lwb session
|
def _now_ms() -> str:
|
||||||
def setup_query_params():
|
return str(int(time.time() * 1000))
|
||||||
|
|
||||||
# request this url with POST an xml form
|
|
||||||
|
|
||||||
# load xml form from file
|
def _update_cookie_header():
|
||||||
xml_form = ""
|
"""Update Cookie header so other modules can see fresh values."""
|
||||||
with open("src/lwb/lwb_form.xml", "r") as file:
|
cookie = []
|
||||||
xml_form = file.read()
|
if SAP_SESSIONID:
|
||||||
|
cookie.append(f"SAP_SESSIONID_PP0_581={SAP_SESSIONID}")
|
||||||
# post xml form to SETUP_QUERY_PARAMS_URL
|
cookie.append("sap-usercontext=sap-language=D&sap-client=581")
|
||||||
response = requests.post(SETUP_QUERY_PARAMS_URL, data=xml_form, headers=EASYSQUARE_HEADERS)
|
if COOKIE_SESSION:
|
||||||
|
cookie.append(f"cookiesession1={COOKIE_SESSION}")
|
||||||
|
EASYSQUARE_HEADERS["Cookie"] = "; ".join(cookie)
|
||||||
|
|
||||||
if response.status_code != 200:
|
|
||||||
print(f"Fehler beim Abrufen von Easysquare: {response.status_code}")
|
|
||||||
return []
|
|
||||||
|
|
||||||
print(response.content)
|
|
||||||
|
|
||||||
return response.content
|
# initialize cookie header from any .env-provided values
|
||||||
|
_update_cookie_header()
|
||||||
|
|
||||||
|
|
||||||
|
def _fetch_csrf_token(session: requests.Session) -> str:
|
||||||
|
"""Fetch an X-CSRF-Token by calling the list endpoint with 'fetch'."""
|
||||||
|
_update_cookie_header()
|
||||||
|
headers = dict(EASYSQUARE_HEADERS)
|
||||||
|
headers["X-CSRF-Token"] = "fetch"
|
||||||
|
headers["Accept"] = "text/plain, */*; q=0.01"
|
||||||
|
headers["UTC"] = _now_ms()
|
||||||
|
params = dict(EASYSQUARE_PARAMS)
|
||||||
|
params["_"] = _now_ms()
|
||||||
|
r = session.get(EASYSQUARE_URL, headers=headers, params=params)
|
||||||
|
return r.headers.get("x-csrf-token") or r.headers.get("X-CSRF-Token") or ""
|
||||||
|
|
||||||
|
|
||||||
|
# setup query params for lwb session (submit stored filter form)
|
||||||
|
def setup_query_params(headless: bool = True) -> bool:
|
||||||
|
"""Initialize search parameters via Playwright by driving the UI.
|
||||||
|
|
||||||
|
Replaces the previous XML POST approach. It will:
|
||||||
|
- click "Ich suche eine Wohnung"
|
||||||
|
- open the "Immobiliensuche" card and click "MEHR ANZEIGEN"
|
||||||
|
- set "Maximale Trefferanzahl" to 1000
|
||||||
|
- click "Suchen"
|
||||||
|
Updates in-memory cookies if Playwright yields fresher values.
|
||||||
|
Returns True if the flow ran; False otherwise.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
from src.lwb.session_bootstrap import apply_search_via_ui
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Playwright-Setup nicht verfügbar: {e}")
|
||||||
|
return False
|
||||||
|
|
||||||
|
try:
|
||||||
|
vals = apply_search_via_ui(headless=headless, save_to_env=False)
|
||||||
|
global COOKIE_SESSION, SAP_SESSIONID
|
||||||
|
if vals.get("COOKIE_SESSION"):
|
||||||
|
COOKIE_SESSION = vals["COOKIE_SESSION"]
|
||||||
|
if vals.get("SAP_SESSIONID"):
|
||||||
|
SAP_SESSIONID = vals["SAP_SESSIONID"]
|
||||||
|
_update_cookie_header()
|
||||||
|
return True
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Fehler beim Setzen der Suchparameter per UI: {e}")
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Call Session Create and get the session from the response cookies
|
||||||
|
|
||||||
|
|
||||||
# Call Session Create and get the session from teh response cookies
|
|
||||||
def create_session():
|
def create_session():
|
||||||
# request url with chromium browser and get the cookies
|
# request url with chromium browser and get the cookies
|
||||||
session = requests.Session()
|
session = requests.Session()
|
||||||
@@ -86,91 +137,142 @@ def create_session():
|
|||||||
if response.status_code != 200:
|
if response.status_code != 200:
|
||||||
print(f"Fehler Session von Easysquare: {response.status_code}")
|
print(f"Fehler Session von Easysquare: {response.status_code}")
|
||||||
return []
|
return []
|
||||||
|
|
||||||
# get the cookies from the response
|
# get the cookies from the response
|
||||||
cookies = response.cookies
|
cookies = response.cookies
|
||||||
|
global COOKIE_SESSION
|
||||||
COOKIE_SESSION = cookies.get("cookiesession1")
|
COOKIE_SESSION = cookies.get("cookiesession1")
|
||||||
print(COOKIE_SESSION)
|
_update_cookie_header()
|
||||||
|
url = (
|
||||||
|
"https://portal1s.easysquare.com/meinelwb/api5/authenticate"
|
||||||
url = "https://portal1s.easysquare.com/meinelwb/api5/authenticate?api=6.169&sap-language=de"
|
"?api=6.169&sap-language=de"
|
||||||
|
)
|
||||||
|
|
||||||
payload = {
|
payload = {
|
||||||
'sap-field_b64': "dXNlcj1ERU1PJnBhc3N3b3JkPXByb21vczE2"
|
'sap-field_b64': "dXNlcj1ERU1PJnBhc3N3b3JkPXByb21vczE2"
|
||||||
}
|
}
|
||||||
|
|
||||||
headers = {
|
headers = {
|
||||||
'DNT': '1',
|
'DNT': '1',
|
||||||
'UTC': '1738713279005',
|
'UTC': _now_ms(),
|
||||||
'Host': 'portal1s.easysquare.com',
|
'Host': 'portal1s.easysquare.com',
|
||||||
'host': 'portal1s.easysquare.com',
|
'host': 'portal1s.easysquare.com',
|
||||||
'Accept': 'text/html, */*; q=0.01',
|
'Accept': 'text/html, */*; q=0.01',
|
||||||
'Cookie': f'esq-alias=%2fmeinelwb; sap-usercontext=sap-language=de&sap-client=581; cookiesession1={COOKIE_SESSION}',
|
'Cookie': (
|
||||||
'Origin': 'https://portal1s.easysquare.com',
|
'esq-alias=%2fmeinelwb; '
|
||||||
'Referer': 'https://portal1s.easysquare.com/meinelwb/index.html?deeplink=%2FESQ_IA_REOBJ%2FESQ_VM_REOBJ_ALL',
|
'sap-usercontext=sap-language=de&sap-client=581; '
|
||||||
'Sec-GPC': '1',
|
f'cookiesession1={COOKIE_SESSION}'
|
||||||
'Connection': 'keep-alive',
|
),
|
||||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:135.0) Gecko/20100101 Firefox/135.0',
|
'Origin': 'https://portal1s.easysquare.com',
|
||||||
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
|
'Referer': SESSION_CREATE_URL,
|
||||||
'X-CSRF-Token': 'fetch',
|
'Sec-GPC': '1',
|
||||||
'Sec-Fetch-Dest': 'empty',
|
'Connection': 'keep-alive',
|
||||||
'Sec-Fetch-Mode': 'cors',
|
'User-Agent': EASYSQUARE_HEADERS['User-Agent'],
|
||||||
'Sec-Fetch-Site': 'same-origin',
|
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
|
||||||
'Accept-Encoding': 'gzip, deflate, br, zstd',
|
'X-CSRF-Token': 'fetch',
|
||||||
'Accept-Language': 'de,en-US;q=0.7,en;q=0.3',
|
'Sec-Fetch-Dest': 'empty',
|
||||||
'X-Requested-With': 'XMLHttpRequest'
|
'Sec-Fetch-Mode': 'cors',
|
||||||
|
'Sec-Fetch-Site': 'same-origin',
|
||||||
|
'Accept-Encoding': 'gzip, deflate, br, zstd',
|
||||||
|
'Accept-Language': 'de,en-US;q=0.7,en;q=0.3',
|
||||||
|
'X-Requested-With': 'XMLHttpRequest'
|
||||||
}
|
}
|
||||||
|
|
||||||
print(headers)
|
response = requests.request(
|
||||||
|
"POST",
|
||||||
response = requests.request("POST", url, headers=headers, data=payload)
|
url,
|
||||||
|
headers=headers,
|
||||||
print(response.text)
|
data=payload,
|
||||||
|
)
|
||||||
|
|
||||||
if response.status_code != 200:
|
if response.status_code != 200:
|
||||||
print(f"Fehler beim Session Erstellen via Easysquare: {response.status_code}")
|
print(
|
||||||
|
f"Fehler beim Session Erstellen via Easysquare: "
|
||||||
|
f"{response.status_code}"
|
||||||
|
)
|
||||||
return []
|
return []
|
||||||
|
|
||||||
# get the cookies from the response
|
# get the cookies from the response
|
||||||
cookies = response.cookies
|
cookies = response.cookies
|
||||||
global SAP_SESSIONID
|
global SAP_SESSIONID
|
||||||
SAP_SESSIONID = cookies.get("SAP_SESSIONID_PP0_581")
|
SAP_SESSIONID = cookies.get("SAP_SESSIONID_PP0_581")
|
||||||
|
_update_cookie_header()
|
||||||
print(SAP_SESSIONID)
|
print(f"SAP_SESSIONID_PP0_581: {SAP_SESSIONID}")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# Funktion: Scrape von Easysquare
|
# Funktion: Scrape von Easysquare
|
||||||
|
def _ensure_session_ready():
|
||||||
|
# If missing tokens, try to create session
|
||||||
|
global COOKIE_SESSION, SAP_SESSIONID
|
||||||
|
if not COOKIE_SESSION or not SAP_SESSIONID:
|
||||||
|
# First try headless browser bootstrap for robust cookie capture
|
||||||
|
try:
|
||||||
|
from src.lwb.session_bootstrap import fetch_session
|
||||||
|
|
||||||
|
vals = fetch_session(headless=True, save_to_env=False)
|
||||||
|
got_cookie = vals.get("COOKIE_SESSION")
|
||||||
|
got_sap = vals.get("SAP_SESSIONID")
|
||||||
|
if got_cookie:
|
||||||
|
COOKIE_SESSION = got_cookie
|
||||||
|
if got_sap:
|
||||||
|
SAP_SESSIONID = got_sap
|
||||||
|
_update_cookie_header()
|
||||||
|
except Exception:
|
||||||
|
# Fallback to legacy request-based bootstrap
|
||||||
|
create_session()
|
||||||
|
|
||||||
|
|
||||||
def scrape_easysquare():
|
def scrape_easysquare():
|
||||||
|
_ensure_session_ready()
|
||||||
|
|
||||||
|
# Submit stored search/filter form to ensure listing context is ready
|
||||||
|
try:
|
||||||
|
setup_query_params()
|
||||||
|
except Exception as e:
|
||||||
|
# non-fatal; we'll still attempt list fetch
|
||||||
|
print(f"Warnung: Setup der Suchparameter fehlgeschlagen: {e}")
|
||||||
|
|
||||||
|
# Build params with fresh timestamp
|
||||||
|
params = dict(EASYSQUARE_PARAMS)
|
||||||
|
params["_"] = _now_ms()
|
||||||
|
|
||||||
session = requests.Session()
|
session = requests.Session()
|
||||||
response = session.get(EASYSQUARE_URL, headers=EASYSQUARE_HEADERS, params=EASYSQUARE_PARAMS)
|
response = session.get(
|
||||||
|
EASYSQUARE_URL,
|
||||||
|
headers=EASYSQUARE_HEADERS,
|
||||||
|
params=params,
|
||||||
|
)
|
||||||
|
|
||||||
if response.status_code != 200:
|
if response.status_code != 200:
|
||||||
print(f"Fehler beim Abrufen von Easysquare: {response.status_code}")
|
print(f"Fehler beim Abrufen von Easysquare: {response.status_code}")
|
||||||
# print("Versuche Session zu erstellen")
|
|
||||||
# create_session()
|
|
||||||
return []
|
return []
|
||||||
|
|
||||||
# XML-Daten parsen
|
# XML-Daten parsen
|
||||||
root = ET.fromstring(response.content)
|
root = ET.fromstring(response.content)
|
||||||
namespace = {"ns": "http://www.openpromos.com/OPPC/XMLForms"}
|
namespace = {"ns": "http://www.openpromos.com/OPPC/XMLForms"}
|
||||||
|
|
||||||
properties = []
|
properties = []
|
||||||
for head in root.findall(".//ns:head", namespace):
|
for head in root.findall(".//ns:head", namespace):
|
||||||
prop_title = head.find("ns:title", namespace).text
|
title_el = head.find("ns:title", namespace)
|
||||||
subtitle = head.find("ns:subtitle", namespace).text
|
subtitle_el = head.find("ns:subtitle", namespace)
|
||||||
abstract = head.find("ns:abstract", namespace).text.strip()
|
abstract_el = head.find("ns:abstract", namespace)
|
||||||
|
prop_title = title_el.text if title_el is not None else ""
|
||||||
|
subtitle = subtitle_el.text if subtitle_el is not None else ""
|
||||||
|
abstract = (
|
||||||
|
abstract_el.text.strip()
|
||||||
|
if abstract_el is not None and abstract_el.text
|
||||||
|
else ""
|
||||||
|
)
|
||||||
|
|
||||||
# get adress lat and long
|
# get address lat/lon
|
||||||
# <address city="" lat="51.346061" lon="12.3774656" postcode="" street=""/>
|
address_el = head.find("ns:address", namespace)
|
||||||
|
lat = address_el.get("lat") if address_el is not None else ""
|
||||||
adress = head.find("ns:address", namespace)
|
lon = address_el.get("lon") if address_el is not None else ""
|
||||||
lat = adress.get("lat")
|
|
||||||
lon = adress.get("lon")
|
|
||||||
|
|
||||||
image = head.find("ns:image", namespace)
|
image = head.find("ns:image", namespace)
|
||||||
iamge_resourceId = image.get("resourceId")
|
image_resource_id = (
|
||||||
|
image.get("resourceId") if image is not None else ""
|
||||||
id = head.find("ns:id", namespace).text
|
)
|
||||||
|
|
||||||
# Details extrahieren
|
# Details extrahieren
|
||||||
rooms = "N/A"
|
rooms = "N/A"
|
||||||
@@ -191,28 +293,42 @@ def scrape_easysquare():
|
|||||||
availability = value
|
availability = value
|
||||||
|
|
||||||
# link create google maps link with lat and long
|
# link create google maps link with lat and long
|
||||||
link = f"https://www.google.com/maps/search/?api=1&query={lat},{lon}"
|
link = (
|
||||||
|
f"https://www.google.com/maps/search/?api=1&query={lat},{lon}"
|
||||||
|
if lat and lon
|
||||||
|
else ""
|
||||||
|
)
|
||||||
|
|
||||||
# https://portal1s.easysquare.com/prorex/xmlforms/image.jpg?application=ESQ_IA_REOBJ&command=action&id=1EC8D4E6-191A-A827-47FF-72D8C5379070&name=get
|
# image url for listing
|
||||||
image_url = f"https://portal1s.easysquare.com/prorex/xmlforms/image.jpg?application=ESQ_IA_REOBJ&command=action&id={iamge_resourceId}&name=get"
|
base_img = "https://portal1s.easysquare.com/prorex/xmlforms/image.jpg"
|
||||||
|
image_url = (
|
||||||
|
f"{base_img}?application=ESQ_IA_REOBJ&command=action"
|
||||||
|
f"&id={image_resource_id}&name=get"
|
||||||
|
if image_resource_id
|
||||||
|
else ""
|
||||||
|
)
|
||||||
|
|
||||||
# the id should be a hash create by the title, subtitle, rooms, size, rent, availability
|
# Hash from title, subtitle, rooms, size, rent, availability
|
||||||
hashID = f"{prop_title}{subtitle}{rooms}{size}{rent}{availability}"
|
hash_id = (
|
||||||
id = hashlib.sha256(hashID.encode('utf-8')).hexdigest()
|
f"{prop_title}{subtitle}{rooms}{size}{rent}{availability}"
|
||||||
|
)
|
||||||
|
prop_id = hashlib.sha256(hash_id.encode('utf-8')).hexdigest()
|
||||||
|
|
||||||
properties.append({
|
properties.append(
|
||||||
"id": id,
|
{
|
||||||
"title": "LWB - " + prop_title,
|
"id": prop_id,
|
||||||
"subtitle": subtitle,
|
"title": f"LWB - {prop_title}",
|
||||||
"rooms": format.format_room(rooms),
|
"subtitle": subtitle,
|
||||||
"size": format.format_roomSize(size),
|
"rooms": format.format_room(rooms),
|
||||||
"rent": format.format_money(rent),
|
"size": format.format_roomSize(size),
|
||||||
"link": link,
|
"rent": format.format_money(rent),
|
||||||
"abstract": abstract,
|
"link": link,
|
||||||
"warm_rent": "",
|
"abstract": abstract,
|
||||||
"availability": format.format_date(availability),
|
"warm_rent": "",
|
||||||
"image_url": image_url,
|
"availability": format.format_date(availability),
|
||||||
"owner": "LWB",
|
"image_url": image_url,
|
||||||
})
|
"owner": "LWB",
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
return properties
|
return properties
|
||||||
|
347
src/lwb/session_bootstrap.py
Normal file
347
src/lwb/session_bootstrap.py
Normal file
@@ -0,0 +1,347 @@
|
|||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from typing import Dict, Optional
|
||||||
|
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
|
|
||||||
|
SESSION_CREATE_URL = (
|
||||||
|
"https://portal1s.easysquare.com/meinelwb/index.html"
|
||||||
|
"?deeplink=%2FESQ_IA_REOBJ%2FESQ_VM_REOBJ_ALL"
|
||||||
|
)
|
||||||
|
|
||||||
|
# UI string constants
|
||||||
|
BTN_SEARCH_ENTRY = "Ich suche eine Wohnung"
|
||||||
|
BTN_MORE = "MEHR ANZEIGEN"
|
||||||
|
BTN_SEARCH = ("Suchen", "SUCHEN")
|
||||||
|
LABEL_MAX_RESULTS = (
|
||||||
|
"Maximale Trefferanzahl",
|
||||||
|
"Maximale Trefferzahl",
|
||||||
|
"Maximale\u00A0Trefferanzahl",
|
||||||
|
"Maximale\u00A0Trefferzahl",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _cookie_map(context) -> dict:
|
||||||
|
"""Return a name->cookie dict safely from Playwright context cookies."""
|
||||||
|
mapping = {}
|
||||||
|
try:
|
||||||
|
for c in context.cookies(): # type: ignore[attr-defined]
|
||||||
|
name = c.get("name") if isinstance(c, dict) else None
|
||||||
|
if name:
|
||||||
|
mapping[name] = c
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
return mapping
|
||||||
|
|
||||||
|
|
||||||
|
def _update_env_file(values: Dict[str, str], env_path: str = ".env") -> None:
|
||||||
|
"""Create or update .env with given key/value pairs."""
|
||||||
|
existing = {}
|
||||||
|
if os.path.exists(env_path):
|
||||||
|
with open(env_path, "r", encoding="utf-8") as f:
|
||||||
|
for line in f:
|
||||||
|
line = line.strip()
|
||||||
|
if not line or line.startswith("#") or "=" not in line:
|
||||||
|
continue
|
||||||
|
k, v = line.split("=", 1)
|
||||||
|
existing[k] = v
|
||||||
|
existing.update(values)
|
||||||
|
lines = [f"{k}={v}\n" for k, v in existing.items()]
|
||||||
|
with open(env_path, "w", encoding="utf-8") as f:
|
||||||
|
f.writelines(lines)
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_session(
    headless: bool = True,
    save_to_env: bool = False,
) -> Dict[str, Optional[str]]:
    """Open the LWB portal in a headless browser and extract cookies.

    Drives the portal's entry page with Playwright, clicks the
    "Ich suche eine Wohnung" entry control, polls the browser context
    for the two session cookies, and falls back to firing the portal's
    authenticate endpoint from page JavaScript if the SAP session cookie
    never appears.

    Args:
        headless: Run Chromium without a visible window.
        save_to_env: When True, persist any found values into .env via
            _update_env_file.

    Returns:
        A dict with keys ``COOKIE_SESSION`` and ``SAP_SESSIONID``; each
        value is the cookie string, or None if it could not be obtained.

    Raises:
        ImportError (re-raised): if Playwright is not installed.
    """
    try:
        from playwright.sync_api import sync_playwright
    except Exception:  # pragma: no cover - dependency missing
        print(
            "Playwright not installed. Install: pip install playwright "
            "&& python -m playwright install"
        )
        raise

    # .env not required for cookie fetch; loading is harmless but optional
    try:
        load_dotenv()
    except Exception:
        pass
    result: Dict[str, Optional[str]] = {
        "COOKIE_SESSION": None,
        "SAP_SESSIONID": None,
    }

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=headless)
        context = browser.new_context()
        page = context.new_page()
        page.goto(SESSION_CREATE_URL, wait_until="networkidle")

        # Click the entry button: "Ich suche eine Wohnung".
        # The control's element type varies, so try role=button, then
        # role=link, then a plain text match; give up silently if none hit.
        try:
            page.get_by_role("button", name=BTN_SEARCH_ENTRY).click(timeout=3000)
        except Exception:
            try:
                page.get_by_role("link", name=BTN_SEARCH_ENTRY).click(timeout=3000)
            except Exception:
                try:
                    page.get_by_text(BTN_SEARCH_ENTRY, exact=True).click(timeout=3000)
                except Exception:
                    pass

        # Let the portal finish any navigation after the click
        try:
            page.wait_for_load_state("networkidle", timeout=5000)
        except Exception:
            pass

        # Try to read cookiesession1 quickly (set by the portal).
        # Poll up to ~5s (10 x 0.5s) for both cookies to appear.
        cookie_session = None
        sap_session = None
        for _ in range(10):
            cookies = _cookie_map(context)
            if not cookie_session and "cookiesession1" in cookies:
                cookie_session = cookies.get("cookiesession1", {}).get("value")
            if not sap_session and "SAP_SESSIONID_PP0_581" in cookies:
                sap_session = cookies.get("SAP_SESSIONID_PP0_581", {}).get("value")
            if cookie_session and sap_session:
                break
            time.sleep(0.5)

        # If SAP session not present, trigger authenticate from the page
        # context so the server sets the SAP_SESSIONID cookie.
        # NOTE(review): the sap-field_b64 payload appears to be base64 of
        # demo credentials (user=DEMO...) — confirm these are intentionally
        # public/guest credentials before relying on them.
        if not sap_session:
            try:
                js_lines = [
                    "async () => {",
                    " const base = '/meinelwb/api5/authenticate';",
                    " const url = base + '?api=6.169&sap-language=de';",
                    " const body = new URLSearchParams({",
                    " 'sap-field_b64':",
                    " 'dXNlcj1ERU1PJnBhc3N3b3JkPXByb21vczE2'",
                    " }).toString();",
                    " await fetch(url, {",
                    " method: 'POST',",
                    " headers: {",
                    " 'Content-Type':",
                    " 'application/x-www-form-urlencoded;' +",
                    " ' charset=UTF-8',",
                    " 'X-CSRF-Token': 'fetch',",
                    " 'X-Requested-With': 'XMLHttpRequest'",
                    " },",
                    " body",
                    " });",
                    " return true;",
                    "}",
                ]
                page.evaluate("\n".join(js_lines))
            except Exception:
                pass

            # Re-check cookies for SAP session after the authenticate call.
            for _ in range(10):
                cookies = _cookie_map(context)
                if "SAP_SESSIONID_PP0_581" in cookies:
                    sap_session = cookies.get("SAP_SESSIONID_PP0_581", {}).get("value")
                    break
                time.sleep(0.5)

        # Best-effort read of localStorage (useful for debugging)
        try:
            storage_dump = page.evaluate(
                "() => JSON.stringify(window.localStorage)"
            )
            if storage_dump and len(storage_dump) > 2:
                pass  # Not strictly needed; kept for debugging future issues
        except Exception:
            pass

        result["COOKIE_SESSION"] = cookie_session
        result["SAP_SESSIONID"] = sap_session

        browser.close()

    # Persist whichever values were found (partial results are written too).
    if save_to_env and (result["COOKIE_SESSION"] or result["SAP_SESSIONID"]):
        to_write = {}
        if result["COOKIE_SESSION"]:
            to_write["COOKIE_SESSION"] = result["COOKIE_SESSION"]
        if result["SAP_SESSIONID"]:
            to_write["SAP_SESSIONID"] = result["SAP_SESSIONID"]
        if to_write:
            _update_env_file(to_write)

    return result
|
def apply_search_via_ui(
    headless: bool = True,
    save_to_env: bool = False,
) -> Dict[str, Optional[str]]:
    """Drive the UI to initialize the search context.

    Steps:
    - Open portal, click "Ich suche eine Wohnung".
    - In services section with title "Immobiliensuche", click "MEHR ANZEIGEN".
    - Set "Maximale Trefferanzahl" to 1000 and click "Suchen".

    Each UI interaction is best-effort: failures are swallowed so a
    partially changed portal layout degrades gracefully rather than
    aborting the run.

    Args:
        headless: Run Chromium without a visible window.
        save_to_env: When True, write found cookies into .env.

    Returns:
        Latest cookies as a dict with keys ``COOKIE_SESSION`` and
        ``SAP_SESSIONID`` (values may be None).

    Raises:
        ImportError (re-raised): if Playwright is not installed.
    """
    try:
        from playwright.sync_api import sync_playwright
    except Exception:  # pragma: no cover
        print(
            "Playwright not installed. Install: pip install playwright "
            "&& python -m playwright install"
        )
        raise

    try:
        load_dotenv()
    except Exception:
        pass
    result: Dict[str, Optional[str]] = {
        "COOKIE_SESSION": None,
        "SAP_SESSIONID": None,
    }

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=headless)
        context = browser.new_context()
        page = context.new_page()
        page.goto(SESSION_CREATE_URL, wait_until="networkidle")

        # Click entry button (robust tries): role=button, role=link,
        # then plain text — first locator that clicks wins.
        clicked = False
        for sel in [
            lambda: page.get_by_role("button", name=BTN_SEARCH_ENTRY).click(timeout=3000),
            lambda: page.get_by_role("link", name=BTN_SEARCH_ENTRY).click(timeout=3000),
            lambda: page.get_by_text(BTN_SEARCH_ENTRY, exact=True).click(timeout=3000),
        ]:
            try:
                sel()
                clicked = True
                break
            except Exception:
                pass
        if clicked:
            try:
                page.wait_for_load_state("networkidle", timeout=5000)
            except Exception:
                pass

        # Click MEHR ANZEIGEN in the Immobiliensuche container; fall back
        # to any MEHR ANZEIGEN button on the page if the container lookup
        # fails.
        try:
            container = page.locator(
                "div.easy-services-service-container"
            ).filter(has_text="Immobiliensuche")
            container.get_by_role("button", name=BTN_MORE).first.click(timeout=5000)
        except Exception:
            try:
                page.get_by_role("button", name=BTN_MORE).click(timeout=5000)
            except Exception:
                pass

        # Fill "Maximale Trefferanzahl" to 1000
        def _fill_max_results():
            # Try common label variations
            for label in LABEL_MAX_RESULTS:
                try:
                    page.get_by_label(label).fill("1000", timeout=3000)
                    return True
                except Exception:
                    continue
            # Fallback: find inputs near the label text
            try:
                near = page.get_by_text("Maximale")
                near_locator = near.locator(
                    "xpath=following::input[1]"
                )
                near_locator.fill("1000", timeout=3000)
                return True
            except Exception:
                return False

        try:
            _fill_max_results()
        except Exception:
            pass

        # Click Suchen — try each capitalization as a button first,
        # then as plain text.
        for name in BTN_SEARCH:
            try:
                page.get_by_role("button", name=name).click(timeout=4000)
                break
            except Exception:
                try:
                    page.get_by_text(name, exact=True).click(timeout=4000)
                    break
                except Exception:
                    continue

        try:
            page.wait_for_load_state("networkidle", timeout=6000)
        except Exception:
            pass

        # Collect cookies
        cookies = _cookie_map(context)
        if "cookiesession1" in cookies:
            result["COOKIE_SESSION"] = cookies.get("cookiesession1", {}).get("value")
        if "SAP_SESSIONID_PP0_581" in cookies:
            result["SAP_SESSIONID"] = cookies.get("SAP_SESSIONID_PP0_581", {}).get("value")

        # render next page and keep it open for 10 seconds
        # NOTE(review): this fixed 10s wait presumably lets the result page
        # finish rendering server-side — confirm whether it is still needed.
        page.wait_for_timeout(10000)

        browser.close()

    # Persist whichever values were found (partial results are written too).
    if save_to_env and (result["COOKIE_SESSION"] or result["SAP_SESSIONID"]):
        to_write = {}
        if result["COOKIE_SESSION"]:
            to_write["COOKIE_SESSION"] = result["COOKIE_SESSION"]
        if result["SAP_SESSIONID"]:
            to_write["SAP_SESSIONID"] = result["SAP_SESSIONID"]
        if to_write:
            _update_env_file(to_write)

    return result
|
if __name__ == "__main__":
    import argparse

    # CLI entry point: fetch the session cookies once and print them.
    arg_parser = argparse.ArgumentParser(
        description="Fetch LWB session cookies via headless browser"
    )
    arg_parser.add_argument(
        "--headed", action="store_true", help="Run browser in headed mode"
    )
    arg_parser.add_argument(
        "--no-save", action="store_true", help="Do not write values into .env"
    )
    cli = arg_parser.parse_args()

    try:
        session = fetch_session(
            headless=not cli.headed,
            save_to_env=not cli.no_save,
        )
        print("COOKIE_SESSION=", session.get("COOKIE_SESSION"))
        print("SAP_SESSIONID=", session.get("SAP_SESSIONID"))
        # Warn when at least one of the two values could not be obtained.
        if not (session.get("COOKIE_SESSION") and session.get("SAP_SESSIONID")):
            print(
                "Warning: One or both values are missing. Try --headed to "
                "complete any prompts."
            )
    except Exception as e:
        print(f"Error while fetching session: {e}")
        sys.exit(1)
|
@@ -17,10 +17,11 @@ def scrape_vlw():
|
|||||||
"senden": "suchen",
|
"senden": "suchen",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#debug print url
|
||||||
|
# print(f"Fetching VLW properties from: {url} with params: {parameter}")
|
||||||
response = requests.get(url=url, params=parameter)
|
response = requests.get(url=url, params=parameter)
|
||||||
soup = BeautifulSoup(response.content, 'html.parser')
|
soup = BeautifulSoup(response.content, 'html.parser')
|
||||||
|
|
||||||
|
|
||||||
properties = []
|
properties = []
|
||||||
|
|
||||||
# get div with class "estate-result-list"
|
# get div with class "estate-result-list"
|
||||||
|
Reference in New Issue
Block a user