mirror of
https://gitlab.dit.htwk-leipzig.de/fsr-im/tools/flatscraper.git
synced 2025-07-16 03:28:48 +02:00
add initial scraping functionality and related utilities
This commit is contained in:
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
|||||||
|
.venv
|
1
known_properties.json
Normal file
1
known_properties.json
Normal file
@ -0,0 +1 @@
|
|||||||
|
["803DF7B0-1125-2AA4-90AC-CF0CAEAC625A", "id-193-1-13", "7421A7DD-D9B4-72D3-0A91-5C3DA60C26E4", "B4095706-A65C-F421-B02E-1D227B684B62", "BD41AC52-BADB-428F-AF4B-11BBC374F2F1", "id-154-2-71", "id-105-12-78", "id-88-5-45"]
|
49
main.py
Normal file
49
main.py
Normal file
@ -0,0 +1,49 @@
|
|||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
import src.wogetra.scraper as wogetra_scraper
|
||||||
|
import src.lwb.scraper as lwb_scraper
|
||||||
|
import src.discord.webhook as localwebhook
|
||||||
|
|
||||||
|
|
||||||
|
# URL of the website to scrape
# NOTE(review): this constant is not referenced anywhere in this module —
# the Wogetra scraper defines its own URL; presumably a leftover. Confirm
# before removing.
TARGET_URL = "https://www.wogetra.de/immobilien-vermarktungsart/miete/"

# Store known property IDs to avoid duplicate notifications
# (loaded from and persisted to known_properties.json by main()).
known_properties = set()

# Main loop to periodically check for new listings
|
def main():
    """Poll both scrapers forever and announce unseen listings on Discord.

    Known listing ids are persisted in known_properties.json so that a
    restart does not re-notify properties that were already announced.
    """
    global known_properties

    # Seed the dedup set from the previous run, if a state file exists.
    try:
        with open("known_properties.json", "r") as fh:
            known_properties = set(json.load(fh))
    except FileNotFoundError:
        print("No known properties file found. Starting fresh.")

    while True:
        print("Scraping properties...")
        print("Scraping properties from Wogetra...")
        listings = wogetra_scraper.scrape_wogetra()
        print("Scraping properties from LWB...")
        listings += lwb_scraper.scrape_easysquare()

        for listing in listings:
            if listing["id"] in known_properties:
                continue
            # Unseen listing: notify Discord, then remember the id so a
            # duplicate later in the same batch is not announced twice.
            localwebhook.send_to_discord(listing)
            known_properties.add(listing["id"])

        # Persist the dedup set after every pass.
        with open("known_properties.json", "w") as fh:
            json.dump(list(known_properties), fh)

        print("Waiting for the next check...")
        time.sleep(300)  # Check every 5 minutes


if __name__ == "__main__":
    main()
|
0
requirements.txt
Normal file
0
requirements.txt
Normal file
BIN
src/discord/__pycache__/webhook.cpython-310.pyc
Normal file
BIN
src/discord/__pycache__/webhook.cpython-310.pyc
Normal file
Binary file not shown.
55
src/discord/webhook.py
Normal file
55
src/discord/webhook.py
Normal file
@ -0,0 +1,55 @@
|
|||||||
|
import json
|
||||||
|
import requests
|
||||||
|
|
||||||
|
import src.lwb.scrape_image as scrape_image
|
||||||
|
|
||||||
|
# Webhook URL from Discord
# SECURITY NOTE(review): this webhook token is committed in plain text —
# anyone who reads the repository can post to the channel. Load it from an
# environment variable or config file and rotate the leaked webhook.
WEBHOOK_URL = "https://discord.com/api/webhooks/1327600813367099462/goqeWDyYwi13-6F0yopUzFkHVaZs01bCe-2SI8bPJLj3WNMhxLOlIYBRIGyTpSzGCSru"
|
def send_to_discord(property_data):
    """Post a property listing to the Discord webhook.

    Builds a markdown summary from the listing dict and sends it. When the
    listing carries a non-empty "image_url", the image is downloaded and
    attached as a file; if that download fails the message is sent without
    the attachment. The delivery result is reported on stdout.
    """
    message = "\n".join([
        f"**{property_data['title']}**",
        f"{property_data['subtitle']}",
        f"**Zimmer:** {property_data['rooms']}",
        f"**Wohnfläche:** {property_data['size']}",
        f"**Gesamtmiete:** {property_data['rent']}",
        f"**Warmmiete:** {property_data['warm_rent']}",
        f"**Verfügbar ab:** {property_data['availability']}",
        f"**Link:** {property_data['link']}",
        f"**Beschreibung:** {property_data['abstract']}",
    ])

    # JSON headers are only used for the no-attachment requests; the
    # multipart upload lets requests pick its own content type.
    headers = {"Content-Type": "application/json"}

    image_url = property_data.get("image_url")
    if image_url:
        try:
            # Fetch the exposé image and attach it to the message.
            image_bytes = scrape_image.scrape_image(image_url)
            files = {"file": ("image.jpg", image_bytes)}
            response = requests.post(WEBHOOK_URL, data={"content": message}, files=files)
        except requests.exceptions.RequestException as e:
            # Download failed — fall back to a plain text notification.
            print(f"Fehler beim Herunterladen des Bildes: {e}")
            payload = {"content": message}
            response = requests.post(WEBHOOK_URL, data=json.dumps(payload), headers=headers)
    else:
        # No image available — send the message on its own.
        payload = {"content": message}
        response = requests.post(WEBHOOK_URL, data=json.dumps(payload), headers=headers)

    # Any 2xx status counts as a successful delivery.
    if 200 <= response.status_code < 300:
        print(f"Benachrichtigung gesendet: {property_data['title']} - response: {response.status_code}")
    else:
        print(f"Fehler beim Senden der Benachrichtigung: {response.status_code}")
BIN
src/lwb/__pycache__/format.cpython-310.pyc
Normal file
BIN
src/lwb/__pycache__/format.cpython-310.pyc
Normal file
Binary file not shown.
BIN
src/lwb/__pycache__/scrape_image.cpython-310.pyc
Normal file
BIN
src/lwb/__pycache__/scrape_image.cpython-310.pyc
Normal file
Binary file not shown.
BIN
src/lwb/__pycache__/scraper.cpython-310.pyc
Normal file
BIN
src/lwb/__pycache__/scraper.cpython-310.pyc
Normal file
Binary file not shown.
57
src/lwb/format.py
Normal file
57
src/lwb/format.py
Normal file
@ -0,0 +1,57 @@
|
|||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
def format_date(date):
    """Render an Easysquare date field such as "B20250101" as "01 January 2025".

    The field carries a one-letter type prefix ("B") followed by the date
    as YYYYMMDD digits. Month names follow the process locale (English
    under the default C locale).
    """
    # Skip the prefix, parse the digit run, and re-render it.
    parsed = datetime.strptime(date[1:], "%Y%m%d")
    return parsed.strftime("%d %B %Y")
|
||||||
|
def format_room(room):
    """Format an Easysquare room-count field as "<n> Zimmer".

    The count is encoded in the last five digits of the field, scaled by
    100 (e.g. "...00300" -> 3 rooms): after stripping the leading zeros,
    the first remaining digit is the whole-room count.
    """
    digits = room[-5:].lstrip("0")
    room_count = int(digits[:1])
    # German "Zimmer" has identical singular and plural forms, so no
    # branching is needed (the original if/else returned the same string
    # in both arms — dead code, removed).
    return f"{room_count} Zimmer"
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def format_money(money):
    """Format an Easysquare money field as a German price string.

    The field is a "B" prefix followed by the amount in cents, zero-padded
    (e.g. "B000000079900" -> "799,00 €").
    """
    # Drop the "B" prefix and the zero padding.
    amount = money[1:].lstrip("0")
    # Re-pad to at least three digits so amounts below 1 € still split
    # into a euro digit and two cent digits ("50" -> "0,50 €" instead of
    # the original's malformed ",50 €").
    amount = amount.zfill(3)
    euros = amount[:-2]
    cents = amount[-2:]
    return f"{euros},{cents} €"
|
|
||||||
|
def format_roomSize(room):
    """Format an Easysquare living-area field as square metres.

    The field is a "B" prefix followed by the area in 1/10000 m²
    (e.g. "B000000502800" -> "50,28 m²").
    """
    # Drop the "B" prefix and the zero padding.
    amount = room[1:].lstrip("0")
    # Whole square metres are everything above the last four digits.
    meters = amount[:-4]
    # Keep two decimal places instead of all four ("2800" -> "28") so the
    # output matches the documented example "B000000502800 -> 50,28 m²"
    # (the original emitted "50,2800 m²").
    decimals = amount[-4:-2]
    return f"{meters},{decimals} m²"
|
35
src/lwb/scrape_image.py
Normal file
35
src/lwb/scrape_image.py
Normal file
@ -0,0 +1,35 @@
|
|||||||
|
import requests
|
||||||
|
|
||||||
|
# Browser-like headers for requests against the Easysquare portal.
# NOTE(review): the Cookie pins a hard-coded SAP session id, which will
# expire — confirm how a fresh session is obtained before relying on this.
EASYSQUARE_HEADERS = {
    "DNT": "1",
    "Host": "portal1s.easysquare.com",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Cookie": "SAP_SESSIONID_PP0_581=zqFIhvNbEsOs_n3cgRTIO1V7ZaLQCxHvhYgKELG5Agg%3d; sap-usercontext=sap-language=D&sap-client=581; cookiesession1=678ADA67ADF0813997206FE9F4133118",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:134.0) Gecko/20100101 Firefox/134.0",
    "Accept-Encoding": "gzip, deflate, br, zstd",
    "Accept-Language": "de,en-US;q=0.7,en;q=0.3",
    "Upgrade-Insecure-Requests": "1"
}
# Query parameters selecting the Easysquare "boxlist" API action.
EASYSQUARE_PARAMS = {
    "application": "ESQ_IA_REOBJ",
    "sap-client": "581",
    "command": "action",
    "name": "boxlist",
    "api": "6.169",
    "head-oppc-version": "6.169.22",
    # Cache-busting timestamp captured from the original browser request.
    "_": "1736595414769"
}
|
||||||
|
|
||||||
|
|
||||||
|
def scrape_image(url):
    """Download an exposé image from the Easysquare portal.

    Returns the raw image bytes on success. On a non-200 answer it prints
    an error and returns an empty list — callers treat a falsy result as
    "no image".
    """
    resp = requests.Session().get(url, headers=EASYSQUARE_HEADERS, params=EASYSQUARE_PARAMS)

    if resp.status_code != 200:
        print(f"Fehler beim Abrufen von Easysquare: {resp.status_code}")
        return []

    return resp.content
116
src/lwb/scraper.py
Normal file
116
src/lwb/scraper.py
Normal file
@ -0,0 +1,116 @@
|
|||||||
|
import requests
|
||||||
|
import xml.etree.ElementTree as ET
|
||||||
|
import src.lwb.format as format
|
||||||
|
|
||||||
|
# Base endpoint of the Easysquare XML-forms API (LWB listing portal).
EASYSQUARE_URL = "https://portal1s.easysquare.com/prorex/xmlforms"
# Browser-like headers for the portal request.
# NOTE(review): the Cookie pins a hard-coded SAP session id, which will
# expire — confirm how a fresh session is obtained before relying on this.
EASYSQUARE_HEADERS = {
    "DNT": "1",
    "Host": "portal1s.easysquare.com",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Cookie": "SAP_SESSIONID_PP0_581=Vg3w4pn8whD76BldaU2wvP-YzyrRkRHvhWoKELG5Agg%3d; sap-usercontext=sap-language=D&sap-client=581; cookiesession1=678ADA67ADF0813997206FE9F4133118",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:134.0) Gecko/20100101 Firefox/134.0",
    "Accept-Encoding": "gzip, deflate, br, zstd",
    "Accept-Language": "de,en-US;q=0.7,en;q=0.3",
    "Upgrade-Insecure-Requests": "1"
}
# Query parameters selecting the "boxlist" action (the listing overview).
EASYSQUARE_PARAMS = {
    "application": "ESQ_IA_REOBJ",
    "sap-client": "581",
    "command": "action",
    "name": "boxlist",
    "api": "6.169",
    "head-oppc-version": "6.169.22",
    # Cache-busting timestamp captured from the original browser request.
    "_": "1736761256321"
}
|
||||||
|
|
||||||
|
# curl --location 'https://portal1s.easysquare.com/prorex/xmlforms?application=ESQ_IA_REOBJ&sap-client=581&command=action&name=boxlist&api=6.169&head-oppc-version=6.169.22&_=1736761255682' \
|
||||||
|
# --header 'DNT: 1' \
|
||||||
|
# --header 'UTC: 1736761256321' \
|
||||||
|
# --header 'Host: portal1s.easysquare.com' \
|
||||||
|
# --header 'host: portal1s.easysquare.com' \
|
||||||
|
# --header 'Accept: text/plain, */*; q=0.01' \
|
||||||
|
# --header 'Cookie: cookiesession1=678ADA67ADF0813997206FE9F4133118; sap-usercontext=sap-language=de&sap-client=581; SAP_SESSIONID_PP0_581=Vg3w4pn8whD76BldaU2wvP-YzyrRkRHvhWoKELG5Agg%3d' \
|
||||||
|
# --header 'Referer: https://portal1s.easysquare.com/meinelwb/index.html?deeplink=%2FESQ_IA_REOBJ%2FESQ_VM_REOBJ_ALL' \
|
||||||
|
# --header 'Sec-GPC: 1' \
|
||||||
|
# --header 'oppc-id: D9925A2D-4ED9-4911-8AD3-2626DA41FBB0' \
|
||||||
|
# --header 'Connection: keep-alive' \
|
||||||
|
# --header 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:134.0) Gecko/20100101 Firefox/134.0' \
|
||||||
|
# --header 'Content-Type: text/plain;charset=UTF-8' \
|
||||||
|
# --header 'Sec-Fetch-Dest: empty' \
|
||||||
|
# --header 'Sec-Fetch-Mode: cors' \
|
||||||
|
# --header 'Sec-Fetch-Site: same-origin' \
|
||||||
|
# --header 'Accept-Encoding: gzip, deflate, br, zstd' \
|
||||||
|
# --header 'Accept-Language: de,en-US;q=0.7,en;q=0.3' \
|
||||||
|
# --header 'X-Requested-With: XMLHttpRequest'
|
||||||
|
|
||||||
|
|
||||||
|
# Funktion: Scrape von Easysquare
|
||||||
|
def scrape_easysquare():
    """Fetch the LWB listing overview from Easysquare as a list of dicts.

    Each dict carries the keys the Discord webhook expects: id, title,
    subtitle, rooms, size, rent, link, abstract, warm_rent, availability
    and image_url. Returns [] when the portal answers with a non-200
    status.

    NOTE(review): the format.* calls assume every <head> provides all four
    criteria; an "N/A" fallback would crash format_date — confirm the
    portal always supplies them.
    """
    response = requests.Session().get(
        EASYSQUARE_URL, headers=EASYSQUARE_HEADERS, params=EASYSQUARE_PARAMS
    )
    if response.status_code != 200:
        print(f"Fehler beim Abrufen von Easysquare: {response.status_code}")
        return []

    # The payload is an OPPC XML form; every listing lives in a <head> node.
    root = ET.fromstring(response.content)
    ns = {"ns": "http://www.openpromos.com/OPPC/XMLForms"}

    listings = []
    for head in root.findall(".//ns:head", ns):
        listing_title = head.find("ns:title", ns).text
        subtitle = head.find("ns:subtitle", ns).text
        abstract = head.find("ns:abstract", ns).text.strip()

        # The address node only exposes coordinates, e.g.
        # <address city="" lat="51.346061" lon="12.3774656" postcode="" street=""/>
        address = head.find("ns:address", ns)
        lat = address.get("lat")
        lon = address.get("lon")

        image_resource_id = head.find("ns:image", ns).get("resourceId")
        listing_id = head.find("ns:id", ns).text

        # Collect the tabular criteria; anything missing stays "N/A".
        criteria = {
            "Zimmer": "N/A",
            "Fläche": "N/A",
            "Gesamtmiete": "N/A",
            "Verfügbar ab": "N/A",
        }
        for criterion in head.findall(".//ns:criterion", ns):
            key = criterion.get("title")
            if key in criteria:
                criteria[key] = criterion.text.strip() if criterion.text else "N/A"

        # No street address is exposed, so link to the coordinates instead.
        link = f"https://www.google.com/maps/search/?api=1&query={lat},{lon}"

        # e.g. https://portal1s.easysquare.com/prorex/xmlforms/image.jpg?application=ESQ_IA_REOBJ&command=action&id=1EC8D4E6-191A-A827-47FF-72D8C5379070&name=get
        image_url = f"https://portal1s.easysquare.com/prorex/xmlforms/image.jpg?application=ESQ_IA_REOBJ&command=action&id={image_resource_id}&name=get"

        listings.append({
            "id": listing_id,
            "title": listing_title,
            "subtitle": subtitle,
            "rooms": format.format_room(criteria["Zimmer"]),
            "size": format.format_roomSize(criteria["Fläche"]),
            "rent": format.format_money(criteria["Gesamtmiete"]),
            "link": link,
            "abstract": abstract,
            "warm_rent": "",
            "availability": format.format_date(criteria["Verfügbar ab"]),
            "image_url": image_url,
        })

    return listings
|
BIN
src/wogetra/__pycache__/scraper.cpython-310.pyc
Normal file
BIN
src/wogetra/__pycache__/scraper.cpython-310.pyc
Normal file
Binary file not shown.
50
src/wogetra/scraper.py
Normal file
50
src/wogetra/scraper.py
Normal file
@ -0,0 +1,50 @@
|
|||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
WOGETRA_URL = "https://www.wogetra.de/immobilien-vermarktungsart/miete/"
|
||||||
|
|
||||||
|
# Funktion: Scrape von Wogetra
|
||||||
|
def scrape_wogetra():
    """Scrape the Wogetra rental listings page into a list of property dicts.

    Each dict uses the same key set as the LWB scraper so the Discord
    webhook can consume either source.
    """
    page = requests.get(WOGETRA_URL)
    soup = BeautifulSoup(page.content, 'html.parser')

    listings = []
    for container in soup.find_all("div", class_="property-container"):
        title_node = container.find("h3", class_="property-title")
        subtitle_node = container.find("div", class_="property-subtitle")
        anchor = title_node.find("a") if title_node else None
        data = container.find("div", class_="property-data")

        def field(css_class):
            # Mirrors the original chained lookup: it assumes each data-*
            # cell present in the markup contains a "dd" child div.
            if not data:
                return "N/A"
            return data.find("div", class_=css_class).find("div", class_="dd").text.strip()

        listings.append({
            "id": container.get("id", ""),
            "title": title_node.text.strip() if title_node else "No Title",
            "subtitle": subtitle_node.text.strip() if subtitle_node else "No Subtitle",
            "rooms": field("data-anzahl_zimmer"),
            "size": field("data-wohnflaeche"),
            "rent": field("data-nettokaltmiete"),
            "link": anchor["href"] if anchor else "#",
            "abstract": "",
            "warm_rent": field("data-warmmiete"),
            "availability": field("data-verfuegbar_ab"),
            "image_url": "",
        })

    return listings
|
12
test.py
Normal file
12
test.py
Normal file
@ -0,0 +1,12 @@
|
|||||||
|
import src.lwb.scrape_image as scrape_image
|
||||||
|
|
||||||
|
# Manual smoke test: download one exposé image and write it to disk.
# B000000502800 -> 50,28 m²
room = "B000000502800"

# Fetch a known sample image from the Easysquare portal.
image_bytes = scrape_image.scrape_image(
    "https://portal1s.easysquare.com/prorex/xmlforms/image.jpg?application=ESQ_IA_REOBJ&command=action&id=1EC8D4E6-191A-A827-47FF-72D8C5379070&name=get"
)

# save image
with open(f"image_{room}.jpg", "wb") as file:
    file.write(image_bytes)
|
Reference in New Issue
Block a user