mirror of
https://gitlab.dit.htwk-leipzig.de/fsr-im/tools/flatscraper.git
synced 2025-07-15 11:08:48 +02:00
feat: update Docker setup, add environment variables, and enhance property scraping logic
This commit is contained in:
3
.gitignore
vendored
3
.gitignore
vendored
@ -1,3 +1,4 @@
|
||||
.venv
|
||||
*.pyc
|
||||
__pycache__
|
||||
__pycache__
|
||||
.env
|
@ -7,6 +7,8 @@ WORKDIR /app
|
||||
|
||||
RUN pip install -r requirements.txt
|
||||
|
||||
COPY . /app
|
||||
COPY ./main.py /app
|
||||
COPY ./known_properties.json /app
|
||||
COPY ./src/. /app/src
|
||||
|
||||
CMD ["python", "-u", "/app/main.py"]
|
@ -1,5 +1,9 @@
|
||||
services:
|
||||
python:
|
||||
image: gitlab.dit.htwk-leipzig.de:5050/ekresse/flatscraper:main
|
||||
build:
|
||||
context: .
|
||||
dockerfile: Dockerfile
|
||||
container_name: flatscraper
|
||||
env_file:
|
||||
- .env
|
||||
|
@ -1 +1 @@
|
||||
[51968, "172DFAD2-7CDB-51B4-212E-E6F9C7F0601A", "FA45C0B6-813C-DE65-496A-EDD8DA3F2526", "9A0B42A2-6D9B-331A-DAA7-624A5FA49606", "F7992488-7C24-DFA9-F8B2-94DDC18E66A3", "C02892BE-F34F-5A8A-E174-4A79549DC9A9", "id-32-1-41", "21C20126-380D-9B0F-73F8-C4279897F189", "BAAEC20A-F667-FE22-6693-E4B4CA366889", "665243A8-FD34-86F8-322F-FE9B0B392083", -4432880139075606878, 6598001214168193439, 3637269161070974884, 51624, 3048296312984895144, "E7B71D28-C557-CFE4-805D-42C2793E9248", 51628, 3286076625991780524, 52087, "id-193-1-13", 51632, "id-105-12-78", "id-88-5-45", "57446DF5-CB9F-951C-A40A-4BA775DA7426", "id-19-3-19", "B4095706-A65C-F421-B02E-1D227B684B62", 9131742071665368621, "id-58-4-30", "892BD779-F186-9BD1-A97A-5783EFB6F56D", "id-154-2-71", -6815659680085605823, 51656, "id-158-2-18", 51660, 51665, 213337995196486226, "id-145-1-9", "7421A7DD-D9B4-72D3-0A91-5C3DA60C26E4", "id-148-1-30", "B57516F9-E364-7E54-A211-527ED54388E6", "3B73B720-13F2-62A4-8829-557676725A95", "BD41AC52-BADB-428F-AF4B-11BBC374F2F1", "803DF7B0-1125-2AA4-90AC-CF0CAEAC625A", "281A7F0B-FA5A-75AE-68DE-89AFD90EEF48", "id-202-1-11", "id-147-1-1", 6615894728235425776, "id-30-5-36", "A979EBAE-EF87-FB51-152A-5453CD7DC794", -9017525000097635974, 51963]
|
||||
["b36ab45e1c5899b4c9add5b1134e731e6c97858fb7de80373df925c2246c824d", "449c368b2b7bca515840659acd6702071fa62e59917ba7a29a180d49ca19ad82", "id-58-4-30", "8901.039", "2702.004", "id-148-1-30", "7e59929357819219851c5dd0f4addb2c6d472fe5e1001b1c6a0021597d6ec4cc", "fe1245cb4d60d639f47edbf5b4ccf44c2639595723ef412d059760b3c3b075e2", "3207.006", "8ee618d09d89849e48da7a66efafe046f52c833baa08babbd98d8574578a259f", "e804d28f1dea007b5a3ff762dfdd7cc29fb1f42104ac0f15bfac74b9254174bd", "10104.013", 52087, "dc552b2bfb958b166d98425b8ee43420130a8eaea7e494da003fb1f7734cb1fb", "1809676f5b586fb5ad89ae3ec6082c45dad170be9b67c18fa887cd97df0b375c", "id-158-2-18", "9101.046"]
|
25
main.py
25
main.py
@ -27,14 +27,23 @@ def main():
|
||||
while True:
|
||||
current_time = time.strftime("%H:%M:%S", time.localtime())
|
||||
print("Scraping properties at " + current_time)
|
||||
print("Scraping properties from Wogetra...")
|
||||
properties = wogetra_scraper.scrape_wogetra()
|
||||
print("Scraping properties from LWB...")
|
||||
properties += lwb_scraper.scrape_easysquare()
|
||||
print("Scraping properties from Lipsia...")
|
||||
properties += lipsia_scraper.scrape_lipsia()
|
||||
print("Scraping properties from BGL...")
|
||||
properties = bgl_scraper.fetch_all_properties()
|
||||
|
||||
properties_wogetra = wogetra_scraper.scrape_wogetra()
|
||||
print("Scraped " + str(len(properties_wogetra)) + " properties from Wogetra")
|
||||
properties = properties_wogetra
|
||||
|
||||
properties_lwb = lwb_scraper.scrape_easysquare()
|
||||
print("Scraped " + str(len(properties_lwb)) + " properties from LWB")
|
||||
properties += properties_lwb
|
||||
|
||||
properties_lipsia = lipsia_scraper.scrape_lipsia()
|
||||
print("Scraped " + str(len(properties_lipsia)) + " properties from Lipsia")
|
||||
properties += properties_lipsia
|
||||
|
||||
properties_bgl = bgl_scraper.fetch_all_properties()
|
||||
print("Scraped " + str(len(properties_bgl)) + " properties from BGL")
|
||||
properties += properties_bgl
|
||||
|
||||
|
||||
for prop in properties:
|
||||
if prop["id"] not in known_properties:
|
||||
|
@ -1,59 +1,9 @@
|
||||
aiohappyeyeballs==2.4.4
|
||||
aiohttp==3.11.11
|
||||
aiosignal==1.3.2
|
||||
asttokens==3.0.0
|
||||
async-timeout==5.0.1
|
||||
attrs==24.3.0
|
||||
backcall==0.2.0
|
||||
beautifulsoup4==4.12.3
|
||||
bleach==6.2.0
|
||||
certifi==2024.12.14
|
||||
beautifulsoup4==4.13.3
|
||||
certifi==2025.1.31
|
||||
charset-normalizer==3.4.1
|
||||
colorama==0.4.6
|
||||
decorator==5.1.1
|
||||
defusedxml==0.7.1
|
||||
discord.py==2.4.0
|
||||
docopt==0.6.2
|
||||
executing==2.1.0
|
||||
fastjsonschema==2.21.1
|
||||
frozenlist==1.5.0
|
||||
idna==3.10
|
||||
ipython==8.12.3
|
||||
jedi==0.19.2
|
||||
Jinja2==3.1.5
|
||||
jsonschema==4.23.0
|
||||
jsonschema-specifications==2024.10.1
|
||||
MarkupSafe==3.0.2
|
||||
matplotlib-inline==0.1.7
|
||||
mistune==3.1.0
|
||||
multidict==6.1.0
|
||||
nbclient==0.10.2
|
||||
nbconvert==7.16.5
|
||||
nbformat==5.10.4
|
||||
packaging==24.2
|
||||
pandocfilters==1.5.1
|
||||
parso==0.8.4
|
||||
pickleshare==0.7.5
|
||||
platformdirs==4.3.6
|
||||
prompt_toolkit==3.0.48
|
||||
propcache==0.2.1
|
||||
pure_eval==0.2.3
|
||||
Pygments==2.19.1
|
||||
python-dateutil==2.9.0.post0
|
||||
pyzmq==26.2.0
|
||||
referencing==0.35.1
|
||||
python-dotenv==1.0.1
|
||||
requests==2.32.3
|
||||
rpds-py==0.22.3
|
||||
schedule==1.2.2
|
||||
six==1.17.0
|
||||
soupsieve==2.6
|
||||
stack-data==0.6.3
|
||||
tinycss2==1.4.0
|
||||
tornado==6.4.2
|
||||
traitlets==5.14.3
|
||||
typing_extensions==4.12.2
|
||||
urllib3==2.3.0
|
||||
wcwidth==0.2.13
|
||||
webencodings==0.5.1
|
||||
yarg==0.1.9
|
||||
yarl==1.18.3
|
||||
|
3
sample.env
Normal file
3
sample.env
Normal file
@ -0,0 +1,3 @@
|
||||
SAP_SESSIONID=UrN6nRbjuCBe4dkLw7vkJLcpV5zniRHvhkwAAKG5Agg%3d
|
||||
COOKIE_SESSION=678ADA67ADF0813997206FE9F4132819
|
||||
WEBHOOK_URL=https://discord.com/api/webhooks/1327600813367432462/goqeWDyYwi13-6F0yopUzFkHVaZs01bCe-2SI8bPJLj3WNMhxLOlIYBRIGyTpSzLKAaa
|
@ -52,7 +52,10 @@ def fetch_all_properties():
|
||||
prop_soup = BeautifulSoup(response.text, "html.parser")
|
||||
|
||||
# get h3 with class adresse and extract the text
|
||||
prop_title = prop_soup.find("h3", {"class": "adresse"}).text.strip()
|
||||
try :
|
||||
prop_title = prop_soup.find("h3", {"class": "adresse"}).text.strip()
|
||||
except:
|
||||
prop_title = "N/A"
|
||||
|
||||
# create a list of (value, entry) tuples
|
||||
facts = []
|
||||
@ -91,15 +94,23 @@ def fetch_all_properties():
|
||||
# image is in img tag with class "img-responsive"
|
||||
image_url = prop_soup.find("img", {"class": "img-responsive"})["src"]
|
||||
|
||||
# from prop_soup get the iframe with id "gmap_canvas" and extract its data-original-src attribute
|
||||
google_maps_link = prop_soup.find("iframe", {"id": "gmap_canvas"})
|
||||
google_maps_link = google_maps_link["data-original-src"]
|
||||
# remove the query parameter output=embed
|
||||
google_maps_link = google_maps_link.replace("&output=embed", "")
|
||||
# remove width and height
|
||||
google_maps_link = google_maps_link.replace("width=300&height=220&", "")
|
||||
|
||||
properties.append({
|
||||
"id": obj_id,
|
||||
"title": "BGL - " + prop_title,
|
||||
"subtitle": "",
|
||||
"subtitle": google_maps_link,
|
||||
"rooms": room_count,
|
||||
"size": size,
|
||||
"rent": cold_rent,
|
||||
"link": bgl_url + prop_url,
|
||||
"abstract": "",
|
||||
"abstract": "Andere Kosten: " + other_costs + " Heizkosten: " + heating_costs + " Etage:" + level,
|
||||
"warm_rent": rent,
|
||||
"availability": availability,
|
||||
"image_url": image_url,
|
||||
|
@ -1,11 +1,12 @@
|
||||
import json
|
||||
import requests
|
||||
|
||||
from dotenv import load_dotenv
|
||||
import os
|
||||
import src.lwb.scrape_image as scrape_image
|
||||
load_dotenv()
|
||||
|
||||
# Webhook URL from Discord
|
||||
WEBHOOK_URL = "https://discord.com/api/webhooks/1327600813367099462/goqeWDyYwi13-6F0yopUzFkHVaZs01bCe-2SI8bPJLj3WNMhxLOlIYBRIGyTpSzGCSru"
|
||||
|
||||
WEBHOOK_URL = os.getenv("WEBHOOK_URL")
|
||||
|
||||
# Function: send a message to Discord
|
||||
def send_to_discord(property_data):
|
||||
|
@ -3,6 +3,7 @@ from src.lwb.scraper import EASYSQUARE_HEADERS, EASYSQUARE_PARAMS
|
||||
|
||||
def scrape_image(url, owner):
|
||||
session = requests.Session()
|
||||
response = None
|
||||
|
||||
if owner == "BGL":
|
||||
response = session.get(url)
|
||||
@ -18,6 +19,12 @@ def scrape_image(url, owner):
|
||||
# return empty image
|
||||
return b''
|
||||
|
||||
# get image from response
|
||||
# Handle other owners or fallback
|
||||
if response is None:
|
||||
response = session.get(url)
|
||||
if response.status_code != 200:
|
||||
print(f"Fehler beim Abrufen der Standardquelle: {response.status_code}")
|
||||
return b''
|
||||
|
||||
|
||||
return response.content
|
||||
|
@ -2,11 +2,15 @@ import requests
|
||||
import xml.etree.ElementTree as ET
|
||||
import src.lwb.format as format
|
||||
import hashlib
|
||||
import os
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv()
|
||||
|
||||
SESSION_CREATE_URL = "https://portal1s.easysquare.com/meinelwb/index.html?deeplink=%2FESQ_IA_REOBJ%2FESQ_VM_REOBJ_ALL"
|
||||
|
||||
SAP_SESSIONID = "iZ52JjFdvDRY0528vXt4y4tdOvzk1xHvhW4KELG5Agg%3d"
|
||||
COOKIE_SESSION = "678ADA670E24565B64423D923CC07C0B"
|
||||
SAP_SESSIONID = os.getenv("SAP_SESSIONID")
|
||||
COOKIE_SESSION = os.getenv("COOKIE_SESSION")
|
||||
|
||||
EASYSQUARE_URL = "https://portal1s.easysquare.com/prorex/xmlforms"
|
||||
EASYSQUARE_HEADERS = {
|
||||
|
Reference in New Issue
Block a user