mirror of
https://gitlab.dit.htwk-leipzig.de/fsr-im/tools/flatscraper.git
synced 2025-07-16 11:38:49 +02:00
feat: update Docker setup, add environment variables, and enhance property scraping logic
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@ -1,3 +1,4 @@
|
|||||||
.venv
|
.venv
|
||||||
*.pyc
|
*.pyc
|
||||||
__pycache__
|
__pycache__
|
||||||
|
.env
|
@ -7,6 +7,8 @@ WORKDIR /app
|
|||||||
|
|
||||||
RUN pip install -r requirements.txt
|
RUN pip install -r requirements.txt
|
||||||
|
|
||||||
COPY . /app
|
COPY ./main.py /app
|
||||||
|
COPY ./known_properties.json /app
|
||||||
|
COPY ./src/. /app/src
|
||||||
|
|
||||||
CMD ["python", "-u", "/app/main.py"]
|
CMD ["python", "-u", "/app/main.py"]
|
@ -1,5 +1,9 @@
|
|||||||
services:
|
services:
|
||||||
python:
|
python:
|
||||||
image: gitlab.dit.htwk-leipzig.de:5050/ekresse/flatscraper:main
|
build:
|
||||||
|
context: .
|
||||||
|
dockerfile: Dockerfile
|
||||||
container_name: flatscraper
|
container_name: flatscraper
|
||||||
|
env_file:
|
||||||
|
- .env
|
||||||
|
|
@ -1 +1 @@
|
|||||||
[51968, "172DFAD2-7CDB-51B4-212E-E6F9C7F0601A", "FA45C0B6-813C-DE65-496A-EDD8DA3F2526", "9A0B42A2-6D9B-331A-DAA7-624A5FA49606", "F7992488-7C24-DFA9-F8B2-94DDC18E66A3", "C02892BE-F34F-5A8A-E174-4A79549DC9A9", "id-32-1-41", "21C20126-380D-9B0F-73F8-C4279897F189", "BAAEC20A-F667-FE22-6693-E4B4CA366889", "665243A8-FD34-86F8-322F-FE9B0B392083", -4432880139075606878, 6598001214168193439, 3637269161070974884, 51624, 3048296312984895144, "E7B71D28-C557-CFE4-805D-42C2793E9248", 51628, 3286076625991780524, 52087, "id-193-1-13", 51632, "id-105-12-78", "id-88-5-45", "57446DF5-CB9F-951C-A40A-4BA775DA7426", "id-19-3-19", "B4095706-A65C-F421-B02E-1D227B684B62", 9131742071665368621, "id-58-4-30", "892BD779-F186-9BD1-A97A-5783EFB6F56D", "id-154-2-71", -6815659680085605823, 51656, "id-158-2-18", 51660, 51665, 213337995196486226, "id-145-1-9", "7421A7DD-D9B4-72D3-0A91-5C3DA60C26E4", "id-148-1-30", "B57516F9-E364-7E54-A211-527ED54388E6", "3B73B720-13F2-62A4-8829-557676725A95", "BD41AC52-BADB-428F-AF4B-11BBC374F2F1", "803DF7B0-1125-2AA4-90AC-CF0CAEAC625A", "281A7F0B-FA5A-75AE-68DE-89AFD90EEF48", "id-202-1-11", "id-147-1-1", 6615894728235425776, "id-30-5-36", "A979EBAE-EF87-FB51-152A-5453CD7DC794", -9017525000097635974, 51963]
|
["b36ab45e1c5899b4c9add5b1134e731e6c97858fb7de80373df925c2246c824d", "449c368b2b7bca515840659acd6702071fa62e59917ba7a29a180d49ca19ad82", "id-58-4-30", "8901.039", "2702.004", "id-148-1-30", "7e59929357819219851c5dd0f4addb2c6d472fe5e1001b1c6a0021597d6ec4cc", "fe1245cb4d60d639f47edbf5b4ccf44c2639595723ef412d059760b3c3b075e2", "3207.006", "8ee618d09d89849e48da7a66efafe046f52c833baa08babbd98d8574578a259f", "e804d28f1dea007b5a3ff762dfdd7cc29fb1f42104ac0f15bfac74b9254174bd", "10104.013", 52087, "dc552b2bfb958b166d98425b8ee43420130a8eaea7e494da003fb1f7734cb1fb", "1809676f5b586fb5ad89ae3ec6082c45dad170be9b67c18fa887cd97df0b375c", "id-158-2-18", "9101.046"]
|
25
main.py
25
main.py
@ -27,14 +27,23 @@ def main():
|
|||||||
while True:
|
while True:
|
||||||
current_time = time.strftime("%H:%M:%S", time.localtime())
|
current_time = time.strftime("%H:%M:%S", time.localtime())
|
||||||
print("Scraping properties at " + current_time)
|
print("Scraping properties at " + current_time)
|
||||||
print("Scraping properties from Wogetra...")
|
|
||||||
properties = wogetra_scraper.scrape_wogetra()
|
properties_wogetra = wogetra_scraper.scrape_wogetra()
|
||||||
print("Scraping properties from LWB...")
|
print("Scraped " + str(len(properties_wogetra)) + " properties from Wogetra")
|
||||||
properties += lwb_scraper.scrape_easysquare()
|
properties = properties_wogetra
|
||||||
print("Scraping properties from Lipsia...")
|
|
||||||
properties += lipsia_scraper.scrape_lipsia()
|
properties_lwb = lwb_scraper.scrape_easysquare()
|
||||||
print("Scraping properties from BGL...")
|
print("Scraped " + str(len(properties_lwb)) + " properties from LWB")
|
||||||
properties = bgl_scraper.fetch_all_properties()
|
properties += properties_lwb
|
||||||
|
|
||||||
|
properties_lipsia = lipsia_scraper.scrape_lipsia()
|
||||||
|
print("Scraped " + str(len(properties_lipsia)) + " properties from Lipsia")
|
||||||
|
properties += properties_lipsia
|
||||||
|
|
||||||
|
properties_bgl = bgl_scraper.fetch_all_properties()
|
||||||
|
print("Scraped " + str(len(properties_bgl)) + " properties from BGL")
|
||||||
|
properties += properties_bgl
|
||||||
|
|
||||||
|
|
||||||
for prop in properties:
|
for prop in properties:
|
||||||
if prop["id"] not in known_properties:
|
if prop["id"] not in known_properties:
|
||||||
|
@ -1,59 +1,9 @@
|
|||||||
aiohappyeyeballs==2.4.4
|
beautifulsoup4==4.13.3
|
||||||
aiohttp==3.11.11
|
certifi==2025.1.31
|
||||||
aiosignal==1.3.2
|
|
||||||
asttokens==3.0.0
|
|
||||||
async-timeout==5.0.1
|
|
||||||
attrs==24.3.0
|
|
||||||
backcall==0.2.0
|
|
||||||
beautifulsoup4==4.12.3
|
|
||||||
bleach==6.2.0
|
|
||||||
certifi==2024.12.14
|
|
||||||
charset-normalizer==3.4.1
|
charset-normalizer==3.4.1
|
||||||
colorama==0.4.6
|
|
||||||
decorator==5.1.1
|
|
||||||
defusedxml==0.7.1
|
|
||||||
discord.py==2.4.0
|
|
||||||
docopt==0.6.2
|
|
||||||
executing==2.1.0
|
|
||||||
fastjsonschema==2.21.1
|
|
||||||
frozenlist==1.5.0
|
|
||||||
idna==3.10
|
idna==3.10
|
||||||
ipython==8.12.3
|
python-dotenv==1.0.1
|
||||||
jedi==0.19.2
|
|
||||||
Jinja2==3.1.5
|
|
||||||
jsonschema==4.23.0
|
|
||||||
jsonschema-specifications==2024.10.1
|
|
||||||
MarkupSafe==3.0.2
|
|
||||||
matplotlib-inline==0.1.7
|
|
||||||
mistune==3.1.0
|
|
||||||
multidict==6.1.0
|
|
||||||
nbclient==0.10.2
|
|
||||||
nbconvert==7.16.5
|
|
||||||
nbformat==5.10.4
|
|
||||||
packaging==24.2
|
|
||||||
pandocfilters==1.5.1
|
|
||||||
parso==0.8.4
|
|
||||||
pickleshare==0.7.5
|
|
||||||
platformdirs==4.3.6
|
|
||||||
prompt_toolkit==3.0.48
|
|
||||||
propcache==0.2.1
|
|
||||||
pure_eval==0.2.3
|
|
||||||
Pygments==2.19.1
|
|
||||||
python-dateutil==2.9.0.post0
|
|
||||||
pyzmq==26.2.0
|
|
||||||
referencing==0.35.1
|
|
||||||
requests==2.32.3
|
requests==2.32.3
|
||||||
rpds-py==0.22.3
|
|
||||||
schedule==1.2.2
|
|
||||||
six==1.17.0
|
|
||||||
soupsieve==2.6
|
soupsieve==2.6
|
||||||
stack-data==0.6.3
|
|
||||||
tinycss2==1.4.0
|
|
||||||
tornado==6.4.2
|
|
||||||
traitlets==5.14.3
|
|
||||||
typing_extensions==4.12.2
|
typing_extensions==4.12.2
|
||||||
urllib3==2.3.0
|
urllib3==2.3.0
|
||||||
wcwidth==0.2.13
|
|
||||||
webencodings==0.5.1
|
|
||||||
yarg==0.1.9
|
|
||||||
yarl==1.18.3
|
|
||||||
|
3
sample.env
Normal file
3
sample.env
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
SAP_SESSIONID=UrN6nRbjuCBe4dkLw7vkJLcpV5zniRHvhkwAAKG5Agg%3d
|
||||||
|
COOKIE_SESSION=678ADA67ADF0813997206FE9F4132819
|
||||||
|
WEBHOOK_URL=https://discord.com/api/webhooks/1327600813367432462/goqeWDyYwi13-6F0yopUzFkHVaZs01bCe-2SI8bPJLj3WNMhxLOlIYBRIGyTpSzLKAaa
|
@ -52,7 +52,10 @@ def fetch_all_properties():
|
|||||||
prop_soup = BeautifulSoup(response.text, "html.parser")
|
prop_soup = BeautifulSoup(response.text, "html.parser")
|
||||||
|
|
||||||
# get h3 with class adresse and extract the text
|
# get h3 with class adresse and extract the text
|
||||||
|
try :
|
||||||
prop_title = prop_soup.find("h3", {"class": "adresse"}).text.strip()
|
prop_title = prop_soup.find("h3", {"class": "adresse"}).text.strip()
|
||||||
|
except:
|
||||||
|
prop_title = "N/A"
|
||||||
|
|
||||||
# create a list of value/entry tuples
|
# create a list of value/entry tuples
|
||||||
facts = []
|
facts = []
|
||||||
@ -91,15 +94,23 @@ def fetch_all_properties():
|
|||||||
# image is in img tag with class "img-responsive"
|
# image is in img tag with class "img-responsive"
|
||||||
image_url = prop_soup.find("img", {"class": "img-responsive"})["src"]
|
image_url = prop_soup.find("img", {"class": "img-responsive"})["src"]
|
||||||
|
|
||||||
|
# from prop_soup, get the iframe with id "gmap_canvas" and extract its src
|
||||||
|
google_maps_link = prop_soup.find("iframe", {"id": "gmap_canvas"})
|
||||||
|
google_maps_link = google_maps_link["data-original-src"]
|
||||||
|
# remove the query parameter output=embed
|
||||||
|
google_maps_link = google_maps_link.replace("&output=embed", "")
|
||||||
|
# remove width and height
|
||||||
|
google_maps_link = google_maps_link.replace("width=300&height=220&", "")
|
||||||
|
|
||||||
properties.append({
|
properties.append({
|
||||||
"id": obj_id,
|
"id": obj_id,
|
||||||
"title": "BGL - " + prop_title,
|
"title": "BGL - " + prop_title,
|
||||||
"subtitle": "",
|
"subtitle": google_maps_link,
|
||||||
"rooms": room_count,
|
"rooms": room_count,
|
||||||
"size": size,
|
"size": size,
|
||||||
"rent": cold_rent,
|
"rent": cold_rent,
|
||||||
"link": bgl_url + prop_url,
|
"link": bgl_url + prop_url,
|
||||||
"abstract": "",
|
"abstract": "Andere Kosten: " + other_costs + " Heizkosten: " + heating_costs + " Etage:" + level,
|
||||||
"warm_rent": rent,
|
"warm_rent": rent,
|
||||||
"availability": availability,
|
"availability": availability,
|
||||||
"image_url": image_url,
|
"image_url": image_url,
|
||||||
|
@ -1,11 +1,12 @@
|
|||||||
import json
|
import json
|
||||||
import requests
|
import requests
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
import os
|
||||||
import src.lwb.scrape_image as scrape_image
|
import src.lwb.scrape_image as scrape_image
|
||||||
|
load_dotenv()
|
||||||
|
|
||||||
# Webhook URL from Discord
|
# Webhook URL from Discord
|
||||||
WEBHOOK_URL = "https://discord.com/api/webhooks/1327600813367099462/goqeWDyYwi13-6F0yopUzFkHVaZs01bCe-2SI8bPJLj3WNMhxLOlIYBRIGyTpSzGCSru"
|
WEBHOOK_URL = os.getenv("WEBHOOK_URL")
|
||||||
|
|
||||||
|
|
||||||
# Funktion: Nachricht an Discord senden
|
# Funktion: Nachricht an Discord senden
|
||||||
def send_to_discord(property_data):
|
def send_to_discord(property_data):
|
||||||
|
@ -3,6 +3,7 @@ from src.lwb.scraper import EASYSQUARE_HEADERS, EASYSQUARE_PARAMS
|
|||||||
|
|
||||||
def scrape_image(url, owner):
|
def scrape_image(url, owner):
|
||||||
session = requests.Session()
|
session = requests.Session()
|
||||||
|
response = None
|
||||||
|
|
||||||
if owner == "BGL":
|
if owner == "BGL":
|
||||||
response = session.get(url)
|
response = session.get(url)
|
||||||
@ -18,6 +19,12 @@ def scrape_image(url, owner):
|
|||||||
# return empty image
|
# return empty image
|
||||||
return b''
|
return b''
|
||||||
|
|
||||||
# get image from response
|
# Handle other owners or fallback
|
||||||
|
if response is None:
|
||||||
|
response = session.get(url)
|
||||||
|
if response.status_code != 200:
|
||||||
|
print(f"Fehler beim Abrufen der Standardquelle: {response.status_code}")
|
||||||
|
return b''
|
||||||
|
|
||||||
|
|
||||||
return response.content
|
return response.content
|
||||||
|
@ -2,11 +2,15 @@ import requests
|
|||||||
import xml.etree.ElementTree as ET
|
import xml.etree.ElementTree as ET
|
||||||
import src.lwb.format as format
|
import src.lwb.format as format
|
||||||
import hashlib
|
import hashlib
|
||||||
|
import os
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
|
load_dotenv()
|
||||||
|
|
||||||
SESSION_CREATE_URL = "https://portal1s.easysquare.com/meinelwb/index.html?deeplink=%2FESQ_IA_REOBJ%2FESQ_VM_REOBJ_ALL"
|
SESSION_CREATE_URL = "https://portal1s.easysquare.com/meinelwb/index.html?deeplink=%2FESQ_IA_REOBJ%2FESQ_VM_REOBJ_ALL"
|
||||||
|
|
||||||
SAP_SESSIONID = "iZ52JjFdvDRY0528vXt4y4tdOvzk1xHvhW4KELG5Agg%3d"
|
SAP_SESSIONID = os.getenv("SAP_SESSIONID")
|
||||||
COOKIE_SESSION = "678ADA670E24565B64423D923CC07C0B"
|
COOKIE_SESSION = os.getenv("COOKIE_SESSION")
|
||||||
|
|
||||||
EASYSQUARE_URL = "https://portal1s.easysquare.com/prorex/xmlforms"
|
EASYSQUARE_URL = "https://portal1s.easysquare.com/prorex/xmlforms"
|
||||||
EASYSQUARE_HEADERS = {
|
EASYSQUARE_HEADERS = {
|
||||||
|
Reference in New Issue
Block a user