feat: update Docker setup, add environment variables, and enhance property scraping logic

Elmar Kresse
2025-02-10 10:02:53 +01:00
parent f16116040d
commit 92bb5b2e85
11 changed files with 66 additions and 74 deletions

.gitignore

@@ -1,3 +1,4 @@
 .venv
 *.pyc
 __pycache__
+.env

Dockerfile

@@ -7,6 +7,8 @@ WORKDIR /app
 RUN pip install -r requirements.txt
-COPY . /app
+COPY ./main.py /app
+COPY ./known_properties.json /app
+COPY ./src/. /app/src
 CMD ["python", "-u", "/app/main.py"]

docker-compose.yml

@@ -1,5 +1,9 @@
 services:
   python:
     image: gitlab.dit.htwk-leipzig.de:5050/ekresse/flatscraper:main
+    build:
+      context: .
+      dockerfile: Dockerfile
     container_name: flatscraper
+    env_file:
+      - .env
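
With the build section and env_file entry in place, the container can be rebuilt and started locally with docker compose up --build, reading its secrets from a local .env file (see sample.env below) instead of values baked into the image.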

known_properties.json

@@ -1 +1 @@
-[51968, "172DFAD2-7CDB-51B4-212E-E6F9C7F0601A", "FA45C0B6-813C-DE65-496A-EDD8DA3F2526", "9A0B42A2-6D9B-331A-DAA7-624A5FA49606", "F7992488-7C24-DFA9-F8B2-94DDC18E66A3", "C02892BE-F34F-5A8A-E174-4A79549DC9A9", "id-32-1-41", "21C20126-380D-9B0F-73F8-C4279897F189", "BAAEC20A-F667-FE22-6693-E4B4CA366889", "665243A8-FD34-86F8-322F-FE9B0B392083", -4432880139075606878, 6598001214168193439, 3637269161070974884, 51624, 3048296312984895144, "E7B71D28-C557-CFE4-805D-42C2793E9248", 51628, 3286076625991780524, 52087, "id-193-1-13", 51632, "id-105-12-78", "id-88-5-45", "57446DF5-CB9F-951C-A40A-4BA775DA7426", "id-19-3-19", "B4095706-A65C-F421-B02E-1D227B684B62", 9131742071665368621, "id-58-4-30", "892BD779-F186-9BD1-A97A-5783EFB6F56D", "id-154-2-71", -6815659680085605823, 51656, "id-158-2-18", 51660, 51665, 213337995196486226, "id-145-1-9", "7421A7DD-D9B4-72D3-0A91-5C3DA60C26E4", "id-148-1-30", "B57516F9-E364-7E54-A211-527ED54388E6", "3B73B720-13F2-62A4-8829-557676725A95", "BD41AC52-BADB-428F-AF4B-11BBC374F2F1", "803DF7B0-1125-2AA4-90AC-CF0CAEAC625A", "281A7F0B-FA5A-75AE-68DE-89AFD90EEF48", "id-202-1-11", "id-147-1-1", 6615894728235425776, "id-30-5-36", "A979EBAE-EF87-FB51-152A-5453CD7DC794", -9017525000097635974, 51963]
+["b36ab45e1c5899b4c9add5b1134e731e6c97858fb7de80373df925c2246c824d", "449c368b2b7bca515840659acd6702071fa62e59917ba7a29a180d49ca19ad82", "id-58-4-30", "8901.039", "2702.004", "id-148-1-30", "7e59929357819219851c5dd0f4addb2c6d472fe5e1001b1c6a0021597d6ec4cc", "fe1245cb4d60d639f47edbf5b4ccf44c2639595723ef412d059760b3c3b075e2", "3207.006", "8ee618d09d89849e48da7a66efafe046f52c833baa08babbd98d8574578a259f", "e804d28f1dea007b5a3ff762dfdd7cc29fb1f42104ac0f15bfac74b9254174bd", "10104.013", 52087, "dc552b2bfb958b166d98425b8ee43420130a8eaea7e494da003fb1f7734cb1fb", "1809676f5b586fb5ad89ae3ec6082c45dad170be9b67c18fa887cd97df0b375c", "id-158-2-18", "9101.046"]

main.py

@@ -27,14 +27,23 @@ def main():
     while True:
         current_time = time.strftime("%H:%M:%S", time.localtime())
         print("Scraping properties at " + current_time)
-        print("Scraping properties from Wogetra...")
-        properties = wogetra_scraper.scrape_wogetra()
-        print("Scraping properties from LWB...")
-        properties += lwb_scraper.scrape_easysquare()
-        print("Scraping properties from Lipsia...")
-        properties += lipsia_scraper.scrape_lipsia()
-        print("Scraping properties from BGL...")
-        properties = bgl_scraper.fetch_all_properties()
+        properties_wogetra = wogetra_scraper.scrape_wogetra()
+        print("Scraped " + str(len(properties_wogetra)) + " properties from Wogetra")
+        properties = properties_wogetra
+
+        properties_lwb = lwb_scraper.scrape_easysquare()
+        print("Scraped " + str(len(properties_lwb)) + " properties from LWB")
+        properties += properties_lwb
+
+        properties_lipsia = lipsia_scraper.scrape_lipsia()
+        print("Scraped " + str(len(properties_lipsia)) + " properties from Lipsia")
+        properties += properties_lipsia
+
+        properties_bgl = bgl_scraper.fetch_all_properties()
+        print("Scraped " + str(len(properties_bgl)) + " properties from BGL")
+        properties += properties_bgl

         for prop in properties:
             if prop["id"] not in known_properties:
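
The four scrape-log-merge blocks above follow one pattern, and the new version also fixes the old bug where the BGL call reassigned properties (dropping the other sources) instead of appending. A possible refactor with the same behavior (a sketch only, not part of this commit; all names are taken from main.py) would loop over the scrapers:

# sketch only: iterate over (name, scraper) pairs instead of repeating the block
properties = []
for name, scrape in [
    ("Wogetra", wogetra_scraper.scrape_wogetra),
    ("LWB", lwb_scraper.scrape_easysquare),
    ("Lipsia", lipsia_scraper.scrape_lipsia),
    ("BGL", bgl_scraper.fetch_all_properties),
]:
    found = scrape()
    print("Scraped " + str(len(found)) + " properties from " + name)
    properties += found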

requirements.txt

@@ -1,59 +1,9 @@
-aiohappyeyeballs==2.4.4
-aiohttp==3.11.11
-aiosignal==1.3.2
-asttokens==3.0.0
-async-timeout==5.0.1
-attrs==24.3.0
-backcall==0.2.0
-beautifulsoup4==4.12.3
-bleach==6.2.0
-certifi==2024.12.14
+beautifulsoup4==4.13.3
+certifi==2025.1.31
 charset-normalizer==3.4.1
-colorama==0.4.6
-decorator==5.1.1
-defusedxml==0.7.1
-discord.py==2.4.0
-docopt==0.6.2
-executing==2.1.0
-fastjsonschema==2.21.1
-frozenlist==1.5.0
 idna==3.10
-ipython==8.12.3
-jedi==0.19.2
-Jinja2==3.1.5
-jsonschema==4.23.0
-jsonschema-specifications==2024.10.1
-MarkupSafe==3.0.2
-matplotlib-inline==0.1.7
-mistune==3.1.0
-multidict==6.1.0
-nbclient==0.10.2
-nbconvert==7.16.5
-nbformat==5.10.4
-packaging==24.2
-pandocfilters==1.5.1
-parso==0.8.4
-pickleshare==0.7.5
-platformdirs==4.3.6
-prompt_toolkit==3.0.48
-propcache==0.2.1
-pure_eval==0.2.3
-Pygments==2.19.1
-python-dateutil==2.9.0.post0
-pyzmq==26.2.0
-referencing==0.35.1
+python-dotenv==1.0.1
 requests==2.32.3
-rpds-py==0.22.3
 schedule==1.2.2
-six==1.17.0
 soupsieve==2.6
-stack-data==0.6.3
-tinycss2==1.4.0
-tornado==6.4.2
-traitlets==5.14.3
-typing_extensions==4.12.2
 urllib3==2.3.0
-wcwidth==0.2.13
-webencodings==0.5.1
-yarg==0.1.9
-yarl==1.18.3

sample.env

@@ -0,0 +1,3 @@
+SAP_SESSIONID=UrN6nRbjuCBe4dkLw7vkJLcpV5zniRHvhkwAAKG5Agg%3d
+COOKIE_SESSION=678ADA67ADF0813997206FE9F4132819
+WEBHOOK_URL=https://discord.com/api/webhooks/1327600813367432462/goqeWDyYwi13-6F0yopUzFkHVaZs01bCe-2SI8bPJLj3WNMhxLOlIYBRIGyTpSzLKAaa
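
sample.env is a template: copy it to .env (which the updated .gitignore now excludes from version control) and fill in real values. The variables are then read at startup via python-dotenv, now pinned in requirements.txt; the loading pattern, as introduced in src/lwb/scraper.py further down:

# how the updated modules consume these values
import os
from dotenv import load_dotenv

load_dotenv()  # reads .env from the working directory
SAP_SESSIONID = os.getenv("SAP_SESSIONID")   # returns None if the key is missing
COOKIE_SESSION = os.getenv("COOKIE_SESSION")
WEBHOOK_URL = os.getenv("WEBHOOK_URL")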


@@ -52,7 +52,10 @@ def fetch_all_properties():
         prop_soup = BeautifulSoup(response.text, "html.parser")
         # get h3 with class adresse and extract the text
-        prop_title = prop_soup.find("h3", {"class": "adresse"}).text.strip()
+        try:
+            prop_title = prop_soup.find("h3", {"class": "adresse"}).text.strip()
+        except:
+            prop_title = "N/A"

         # create a value entries tuple list
         facts = []

@@ -91,15 +94,23 @@ def fetch_all_properties():
         # image is in img tag with class "img-responsive"
         image_url = prop_soup.find("img", {"class": "img-responsive"})["src"]

+        # from prop_soup get the iframe with id "gmap_canvas" and extract the src
+        google_maps_link = prop_soup.find("iframe", {"id": "gmap_canvas"})
+        google_maps_link = google_maps_link["data-original-src"]
+        # remove the query parameter output=embed
+        google_maps_link = google_maps_link.replace("&output=embed", "")
+        # remove width and height
+        google_maps_link = google_maps_link.replace("width=300&height=220&", "")
+
         properties.append({
             "id": obj_id,
             "title": "BGL - " + prop_title,
-            "subtitle": "",
+            "subtitle": google_maps_link,
             "rooms": room_count,
             "size": size,
             "rent": cold_rent,
             "link": bgl_url + prop_url,
-            "abstract": "",
+            "abstract": "Andere Kosten: " + other_costs + " Heizkosten: " + heating_costs + " Etage:" + level,
             "warm_rent": rent,
             "availability": availability,
             "image_url": image_url,


@@ -1,11 +1,12 @@
 import json
 import requests
+from dotenv import load_dotenv
+import os
 import src.lwb.scrape_image as scrape_image

+load_dotenv()
+
 # Webhook URL from Discord
-WEBHOOK_URL = "https://discord.com/api/webhooks/1327600813367099462/goqeWDyYwi13-6F0yopUzFkHVaZs01bCe-2SI8bPJLj3WNMhxLOlIYBRIGyTpSzGCSru"
+WEBHOOK_URL = os.getenv("WEBHOOK_URL")

 # Function: send a message to Discord
 def send_to_discord(property_data):
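
The body of send_to_discord is unchanged by this commit. For context, a Discord webhook is just an HTTP POST; a minimal sketch of such a call follows (the payload field is an assumption, the real function builds a richer message from property_data):

# sketch: minimal webhook call; Discord answers 204 No Content on success
def send_to_discord_minimal(property_data):
    payload = {"content": property_data["title"] + "\n" + property_data["link"]}
    response = requests.post(WEBHOOK_URL, json=payload)
    if response.status_code not in (200, 204):
        print("Fehler beim Senden an Discord: " + str(response.status_code))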

src/lwb/scrape_image.py

@@ -3,6 +3,7 @@ from src.lwb.scraper import EASYSQUARE_HEADERS, EASYSQUARE_PARAMS

 def scrape_image(url, owner):
     session = requests.Session()
+    response = None

     if owner == "BGL":
         response = session.get(url)

@@ -18,6 +19,12 @@ def scrape_image(url, owner):
         # return empty image
         return b''

     # get image from response
+    # Handle other owners or fallback
+    if response is None:
+        response = session.get(url)
+
+    if response.status_code != 200:
+        print(f"Fehler beim Abrufen der Standardquelle: {response.status_code}")
+        return b''
+
     return response.content
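
Call sites (for example the Discord webhook module above, via import src.lwb.scrape_image as scrape_image) use it as:

# returns raw image bytes, or b'' if the image could not be fetched
image_bytes = scrape_image.scrape_image(image_url, "BGL")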

src/lwb/scraper.py

@@ -2,11 +2,15 @@ import requests
 import xml.etree.ElementTree as ET
 import src.lwb.format as format
 import hashlib
+import os
+from dotenv import load_dotenv
+
+load_dotenv()

 SESSION_CREATE_URL = "https://portal1s.easysquare.com/meinelwb/index.html?deeplink=%2FESQ_IA_REOBJ%2FESQ_VM_REOBJ_ALL"
-SAP_SESSIONID = "iZ52JjFdvDRY0528vXt4y4tdOvzk1xHvhW4KELG5Agg%3d"
-COOKIE_SESSION = "678ADA670E24565B64423D923CC07C0B"
+SAP_SESSIONID = os.getenv("SAP_SESSIONID")
+COOKIE_SESSION = os.getenv("COOKIE_SESSION")

 EASYSQUARE_URL = "https://portal1s.easysquare.com/prorex/xmlforms"
 EASYSQUARE_HEADERS = {