From b337b7c2f88b23166f6cf7c71f382ef6677c8852 Mon Sep 17 00:00:00 2001 From: Elmar Kresse Date: Mon, 13 Jan 2025 10:44:01 +0100 Subject: [PATCH] add initial scraping functionality and related utilities --- .gitignore | 1 + known_properties.json | 1 + main.py | 49 ++++++++ requirements.txt | 0 .../__pycache__/webhook.cpython-310.pyc | Bin 0 -> 1514 bytes src/discord/webhook.py | 55 +++++++++ src/lwb/__pycache__/format.cpython-310.pyc | Bin 0 -> 1058 bytes .../__pycache__/scrape_image.cpython-310.pyc | Bin 0 -> 1211 bytes src/lwb/__pycache__/scraper.cpython-310.pyc | Bin 0 -> 2614 bytes src/lwb/format.py | 57 +++++++++ src/lwb/scrape_image.py | 35 ++++++ src/lwb/scraper.py | 116 ++++++++++++++++++ .../__pycache__/scraper.cpython-310.pyc | Bin 0 -> 1405 bytes src/wogetra/scraper.py | 50 ++++++++ test.py | 12 ++ 15 files changed, 376 insertions(+) create mode 100644 .gitignore create mode 100644 known_properties.json create mode 100644 main.py create mode 100644 requirements.txt create mode 100644 src/discord/__pycache__/webhook.cpython-310.pyc create mode 100644 src/discord/webhook.py create mode 100644 src/lwb/__pycache__/format.cpython-310.pyc create mode 100644 src/lwb/__pycache__/scrape_image.cpython-310.pyc create mode 100644 src/lwb/__pycache__/scraper.cpython-310.pyc create mode 100644 src/lwb/format.py create mode 100644 src/lwb/scrape_image.py create mode 100644 src/lwb/scraper.py create mode 100644 src/wogetra/__pycache__/scraper.cpython-310.pyc create mode 100644 src/wogetra/scraper.py create mode 100644 test.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..1d17dae --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +.venv diff --git a/known_properties.json b/known_properties.json new file mode 100644 index 0000000..c865079 --- /dev/null +++ b/known_properties.json @@ -0,0 +1 @@ +["803DF7B0-1125-2AA4-90AC-CF0CAEAC625A", "id-193-1-13", "7421A7DD-D9B4-72D3-0A91-5C3DA60C26E4", "B4095706-A65C-F421-B02E-1D227B684B62", "BD41AC52-BADB-428F-AF4B-11BBC374F2F1", "id-154-2-71", "id-105-12-78", "id-88-5-45"] \ No newline at end of file diff --git a/main.py b/main.py new file mode 100644 index 0000000..a857929 --- /dev/null +++ b/main.py @@ -0,0 +1,49 @@ +from bs4 import BeautifulSoup +import json +import time +import src.wogetra.scraper as wogetra_scraper +import src.lwb.scraper as lwb_scraper +import src.discord.webhook as localwebhook + + +# URL of the website to scrape +TARGET_URL = "https://www.wogetra.de/immobilien-vermarktungsart/miete/" + +# Store known property IDs to avoid duplicate notifications +known_properties = set() + +# Main loop to periodically check for new listings +def main(): + global known_properties + + # Load known properties from file + try: + with open("known_properties.json", "r") as file: + known_properties = set(json.load(file)) + except FileNotFoundError: + print("No known properties file found. Starting fresh.") + + while True: + print("Scraping properties...") + print("Scraping properties from Wogetra...") + properties = wogetra_scraper.scrape_wogetra() + print("Scraping properties from LWB...") + properties += lwb_scraper.scrape_easysquare() + + for prop in properties: + if prop["id"] not in known_properties: + # Notify Discord and mark as known + localwebhook.send_to_discord(prop) + known_properties.add(prop["id"]) + + + # save known properties to file + with open("known_properties.json", "w") as file: + json.dump(list(known_properties), file) + + # Wait before checking again + print("Waiting for the next check...") + time.sleep(300) # Check every 5 minutes + +if __name__ == "__main__": + main() diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..e69de29 diff --git a/src/discord/__pycache__/webhook.cpython-310.pyc b/src/discord/__pycache__/webhook.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dfbbda8229db0f69a3e9b7718b93a7ee211845cf GIT binary patch literal 1514 zcmah}TW=dh6rS0gU2hWGF^N+MDP`LSBwHlT1*J`d5K71mLV}XEQLISTdS_y9vbW5P z>w2+KxxDfV5K=4glK;S4U-%9Cgv3+-0TfC(yX%US7e<;n-#Oo$>)I0(3Zn?tuU~%C zx-by>Em5v61(c6ri$ypnqS!_COyW9Db;VN^gpo(;1S6`{HL6nb9jfaTAEJu(4E)eu z)N4x_iq%?;I>HV(owEb4W`$0z%bM*VI2W~rrN!IJ^YeEWmX?-p&#$a}u)J7n1sCk- zliqRHSy-A~-kk3R;k{_{eEYst7xN3vbvC&daA%( zLf`6RkebA2id6erpU~;=)q#G5{1t-0O3t+2Ba8+)jH2sOr*HRjkWKj(!}wqGmAS`8 zUweZ43Z2O!9K)v@D*ey;J$6uonf_$B1L0kvmSdpEH|ev;-# zhX`16Xv=B>Tjl0nq*SZ1CLQUrNUc^!V?%VBnOYdFR_l)EF}_-@nz6=%z!RMb_(VV) z1#SP-efpE#W+_XA6R~Ilm@OtOPkIiMLxwY7My15Z;@-dZkuIo>gzS{FnD-e!efo2& zX>rqPrcv|O1Iux(rsF!Y7mY!TUB^Gq61sq*NieN3VYfMRnjOEDQijzOl3TWn^6P;w zA;aw5UdW=76^5>3Tha;q+L;Lac*OCn7Hf348bn*qngvtK}S-iP0IG_%Hh z3rcqEwscxhwb^2V`IO03Gb)&~CTAiHe8E7iT+McvTuRb#^S>b=;b@!}Hn&37NV|#+ z&MrDkND-4T5HcP++F0A(-~X&}?_f6`We;r@CX*0x=^(qf@yZlyA$NQklXOTL?RcSx z$Aq+GM>On!vWihJgggitm%TpujUhJ%{$d-BysBgJQdh{| zx=LQ?g#4vz= 200 and < 300 + if response.status_code >= 200 and response.status_code < 300: + print(f"Benachrichtigung gesendet: {property_data['title']} - response: {response.status_code}") + else: + print(f"Fehler beim Senden der Benachrichtigung: {response.status_code}") + diff --git a/src/lwb/__pycache__/format.cpython-310.pyc b/src/lwb/__pycache__/format.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..957f1c1fe9e21e9f3c5608c19527c0db43456ddd GIT binary patch literal 1058 zcmZ`&&1(}u6rZn6wwrAmsv_zo2O}(~^ej@O;x(6^x&%VHGbzz*HoTo6hRvnwUm=C| zSn%YZ;GeNaJ>~4JiuJuo$Vb72d0#X0=J$K=ZMMA~F#H;DcA9vM{Um4o6X?9cmG7g_ zO!EOtSc2!&@xkrLViz*nVmfPLai@P}6#i zD}REbzy~z@EFQ2ax9ox!T#HLF6@|FK=PvnF7828{){N3jD$V0&+SC)jsTw=kmToYf;MQl%mg@fOL>s0%i(IWs$XYZ4pp^fE}h)*9WR@D+n9rn-Lo=0*Fo zZ`-{=*X9XyM+VwCB<;cbllJ_^^U;U0HaFc(iPv%;MJPfUNF`tkoekmya!LFEofYv5 ztSk~`mSc=*NyYfcr&J30PQb&0e_>jkizBwOBCj?3eRFeD@_nG!C)GyWPg4VtC@TZ3 zMSoPv{?MYUAj{G?AlXT>@z6S!b3vGwOz6Ze26CE@Jy91v4|pJeVjEm=YL;An9ip@1 zLZ}yf2>*f&wiCH5+AIq}d9@2Ed9 z&oI#(tqcT?8m|jPdmp1Me2XT`&7EtQBjNVeveI~i>$(3j&GdL+4(P9P4G) Fy?>J)>umr4 literal 0 HcmV?d00001 diff --git a/src/lwb/__pycache__/scrape_image.cpython-310.pyc b/src/lwb/__pycache__/scrape_image.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3ab8e9de03596a2c539c67b0a7459d243cfb5ebe GIT binary patch literal 1211 zcmZ8hTXWk)6xPMJ+D@AkW&*>D9-xDRZ;J16T@bO{7*fZv5}59I#WIs1J(bI$HL->kQ_l@|E;E_h|ViV4E+ z5!f7eIQWcD`iL7OAkh(kfFvZN7>c6=N}?1;cV4G}h|(y7vS61tDx z`Bnfj5)C2u?i(IM7=)r4?D`&I&{1g-L;8YVEg-=~)0+pMGQ47?8JjzK==+XsLS}nz zX%#V_tp3l=o%U6yTx{j{N_*>7Gx){Oh7&_KjQ+`>-<=GHonc7m&3f3_SK>pldClg*)8` z5i}heyR5^bw-wX#E^SOPrTnyNr`~94U9H~Gy1jB!t+ZN={c^phw|Z5zQc=}rFnr<# zw&TE3tyoq*{@Qkt_naz&Ges>I%WZ`d^=eyLMTRU7)JnBj&MA+vdFhq*%W9ebs?xIw zzVKG1^=`0h1-7485Wa9A!+B*&@=8D%3U(3BW4CZ_tP9lO>-D0NO98n%I0LaGk1~+Z zOcVRuAFgi_q|f=~3!24m7~H$rE9kE2A=|ay634d%3En^VEdmiP^j(V01;K?8zFJ_) zD9Gx@>7=hsM*7L;UjUg`3nZW!H5DYM-pa8l3*z;nT5koZwJYxLgDtgDsn=Sys#IBa1(M~5{A*7?7+W?QN6Milfj%Zto2Z>&oHM>r?VYc{2n0h$((89rwq6X0i V+@u6amc-33@rRW85B@Wo`5PIYRnPzc literal 0 HcmV?d00001 diff --git a/src/lwb/__pycache__/scraper.cpython-310.pyc b/src/lwb/__pycache__/scraper.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e19655092405be80cc357907e34103a57ca36664 GIT binary patch literal 2614 zcmZ`*OK&5`5uO*x;qa};YS)io;>2>~h%afEdWCW56}78)S1XGvJ02(?hSROlNb^$9 zP?QEkAV8dBa?8nUKr(#Ef5^?3{DwXm$jLt-K;l!Xhmuw{V3Jc^UsX?cS9SH*^a=%q zgHP?-&&}y+j{7$QcRyJGzk??@1c5lj+nhFOc}Ea9o^wP^;wP4rlTIudEdv%olaYud zltO8gK{Cn$QeMeg7UfVL6;KhCP#IOw)T@l9plLMoIj7|i-{Tf$zk|8Bov3~khJLVK zulpVeb-NkVupV3lr$e3Kn&CNhpLhhH*UugM&?8Pj`DQft6Mdr(LVO;=h>pFe`@U@% zdT4oW{hV2T4I})}>ew5n?M7{F@%{SyiMASj-Rtc3d);1d`^C=oW`B3L(SNksY=2@d zj~@E&>gZ^5rS+kW^!=sL*>3f-=z{E>>^(g@`qcZV`|Qc1j%mKTj5gFj_p8GI6T@@a zirNg@x@!(~6Sp_#88mDQyI~tf-%t(DJF#$pg8)|BY`0ccJDZ)>N@sJcvD#c-TU%Lb zw6?lyTMwJd%gyF$wENtPEZf%Wk7^C|SD#ug@NykSY`wYs zu+~^mpJ3y}t1mU04fr+HEsNko@4TLPqd7CO{6!VvLt78=qBcp*2K-7msEP{1tCp$3zitupOa%-(bah^SKX737*IB1YhwS7jl>U6*uN$?pO%LPzqCs|EoCWWBxde zgcFgx8D@~k&==g7!k>AbGr1lY%9mNNDOdbhU`NQ}$D+xNrC7Mkp>!->37gy>ug0l3 zb(u#QhK~#5bb{n-lu1x_f-Z}&205`P%ref#*-HWCV0)6-}8$RzIo{<2>Hk4T~ETP9s4E>b<5A~yN7)K`bf4X$9&#G z?I=6Xj%VA%A>|fwRIm`KKmb&}$tFz^&x5-`3akiIi2(0V$+p}RD&ORoW=A@4`iU#A zpXrvZ4=mdXFK9MdRDVe9g;^?-YwDqy-h_{-$mT6}JH5|(AMbbey8ZpVXLRZge%kG9 zcK3R;a>usY+3P&-(OeMf;V|d}^$XiL zrcjAp9W=#y`M`QU6da;yb_wdCmYcMF7PnTq1A34FEq`rrJQrWw_C-ML3-@W}`l4a6 z2EGTJ3mNmgkY-ty179~VRiLzeOhUMosKm-xODz4QyvQshpS%G=OY2F|5=Mul%=RQT zso1imJ0^y7-_$LcVO{PqF~+;>&<$c$rV%c_d!=Q(4T5k|6NxD<|I@UEr1fNJX*=6D{P) z&!Ev};r~YshXmtV*TxQP_)E~WRASO{cWh*y^)Es^HHfD+*-M~=(STIJLw?T0J52n7 z2{uWRl{;buY*hY@7c7F1xhV({mVCYF$#Mlg{a+z>qq8>szr6Z?`AieF literal 0 HcmV?d00001 diff --git a/src/lwb/format.py b/src/lwb/format.py new file mode 100644 index 0000000..8542d96 --- /dev/null +++ b/src/lwb/format.py @@ -0,0 +1,57 @@ +from datetime import datetime + +def format_date(date): + # Extract the date part (skip the "B" prefix) + date_part = date[1:] + + # Convert to a datetime object + date_object = datetime.strptime(date_part, "%Y%m%d") + + # Format as "day month year" + formatted_date = date_object.strftime("%d %B %Y") + + return formatted_date + +def format_room(room): + + room = room[-5:].lstrip("0") + + room_count = int(room[:1]) + if room_count == 1: + return (f"{room_count} Zimmer") + else: + return (f"{room_count} Zimmer") + + + +def format_money(money): + # B000000079900 -> 799,00 € + + # Extract the amount part (skip the "B" prefix) + amount_part = money[1:] + + # remove leading zeros + amount_part = amount_part.lstrip("0") + + # Split the amount into euros and cents + euros = amount_part[:-2] + cents = amount_part[-2:] + + # Combine the parts with a comma + formatted_money = f"{euros},{cents} €" + + return formatted_money + +def format_roomSize(room): + + # Extract the amount part (skip the "B" prefix) + amount_part = room[1:] + # remove leading zeros + amount_part = amount_part.lstrip("0") + # Split the amount into meters and centimeters + meters = amount_part[:-4] + centimeters = amount_part[-4:] + # Combine the parts with a comma + formatted_room = f"{meters},{centimeters} m²" + + return formatted_room \ No newline at end of file diff --git a/src/lwb/scrape_image.py b/src/lwb/scrape_image.py new file mode 100644 index 0000000..c46b6c1 --- /dev/null +++ b/src/lwb/scrape_image.py @@ -0,0 +1,35 @@ +import requests + +EASYSQUARE_HEADERS = { + "DNT": "1", + "Host": "portal1s.easysquare.com", + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", + "Cookie": "SAP_SESSIONID_PP0_581=zqFIhvNbEsOs_n3cgRTIO1V7ZaLQCxHvhYgKELG5Agg%3d; sap-usercontext=sap-language=D&sap-client=581; cookiesession1=678ADA67ADF0813997206FE9F4133118", + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:134.0) Gecko/20100101 Firefox/134.0", + "Accept-Encoding": "gzip, deflate, br, zstd", + "Accept-Language": "de,en-US;q=0.7,en;q=0.3", + "Upgrade-Insecure-Requests": "1" +} +EASYSQUARE_PARAMS = { + "application": "ESQ_IA_REOBJ", + "sap-client": "581", + "command": "action", + "name": "boxlist", + "api": "6.169", + "head-oppc-version": "6.169.22", + "_": "1736595414769" +} + + +def scrape_image(url): + session = requests.Session() + response = session.get(url, headers=EASYSQUARE_HEADERS, params=EASYSQUARE_PARAMS) + + if response.status_code != 200: + print(f"Fehler beim Abrufen von Easysquare: {response.status_code}") + return [] + + # get image from response + + return response.content + diff --git a/src/lwb/scraper.py b/src/lwb/scraper.py new file mode 100644 index 0000000..0e9dd18 --- /dev/null +++ b/src/lwb/scraper.py @@ -0,0 +1,116 @@ +import requests +import xml.etree.ElementTree as ET +import src.lwb.format as format + +EASYSQUARE_URL = "https://portal1s.easysquare.com/prorex/xmlforms" +EASYSQUARE_HEADERS = { + "DNT": "1", + "Host": "portal1s.easysquare.com", + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", + "Cookie": "SAP_SESSIONID_PP0_581=Vg3w4pn8whD76BldaU2wvP-YzyrRkRHvhWoKELG5Agg%3d; sap-usercontext=sap-language=D&sap-client=581; cookiesession1=678ADA67ADF0813997206FE9F4133118", + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:134.0) Gecko/20100101 Firefox/134.0", + "Accept-Encoding": "gzip, deflate, br, zstd", + "Accept-Language": "de,en-US;q=0.7,en;q=0.3", + "Upgrade-Insecure-Requests": "1" +} +EASYSQUARE_PARAMS = { + "application": "ESQ_IA_REOBJ", + "sap-client": "581", + "command": "action", + "name": "boxlist", + "api": "6.169", + "head-oppc-version": "6.169.22", + "_": "1736761256321" +} + +# curl --location 'https://portal1s.easysquare.com/prorex/xmlforms?application=ESQ_IA_REOBJ&sap-client=581&command=action&name=boxlist&api=6.169&head-oppc-version=6.169.22&_=1736761255682' \ +# --header 'DNT: 1' \ +# --header 'UTC: 1736761256321' \ +# --header 'Host: portal1s.easysquare.com' \ +# --header 'host: portal1s.easysquare.com' \ +# --header 'Accept: text/plain, */*; q=0.01' \ +# --header 'Cookie: cookiesession1=678ADA67ADF0813997206FE9F4133118; sap-usercontext=sap-language=de&sap-client=581; SAP_SESSIONID_PP0_581=Vg3w4pn8whD76BldaU2wvP-YzyrRkRHvhWoKELG5Agg%3d' \ +# --header 'Referer: https://portal1s.easysquare.com/meinelwb/index.html?deeplink=%2FESQ_IA_REOBJ%2FESQ_VM_REOBJ_ALL' \ +# --header 'Sec-GPC: 1' \ +# --header 'oppc-id: D9925A2D-4ED9-4911-8AD3-2626DA41FBB0' \ +# --header 'Connection: keep-alive' \ +# --header 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:134.0) Gecko/20100101 Firefox/134.0' \ +# --header 'Content-Type: text/plain;charset=UTF-8' \ +# --header 'Sec-Fetch-Dest: empty' \ +# --header 'Sec-Fetch-Mode: cors' \ +# --header 'Sec-Fetch-Site: same-origin' \ +# --header 'Accept-Encoding: gzip, deflate, br, zstd' \ +# --header 'Accept-Language: de,en-US;q=0.7,en;q=0.3' \ +# --header 'X-Requested-With: XMLHttpRequest' + + +# Funktion: Scrape von Easysquare +def scrape_easysquare(): + session = requests.Session() + response = session.get(EASYSQUARE_URL, headers=EASYSQUARE_HEADERS, params=EASYSQUARE_PARAMS) + + if response.status_code != 200: + print(f"Fehler beim Abrufen von Easysquare: {response.status_code}") + return [] + + # XML-Daten parsen + root = ET.fromstring(response.content) + namespace = {"ns": "http://www.openpromos.com/OPPC/XMLForms"} + + properties = [] + for head in root.findall(".//ns:head", namespace): + prop_title = head.find("ns:title", namespace).text + subtitle = head.find("ns:subtitle", namespace).text + abstract = head.find("ns:abstract", namespace).text.strip() + + # get adress lat and long + #
+ + adress = head.find("ns:address", namespace) + lat = adress.get("lat") + lon = adress.get("lon") + + image = head.find("ns:image", namespace) + iamge_resourceId = image.get("resourceId") + + id = head.find("ns:id", namespace).text + + # Details extrahieren + rooms = "N/A" + size = "N/A" + rent = "N/A" + availability = "N/A" + + for criterion in head.findall(".//ns:criterion", namespace): + criterion_title = criterion.get("title") + value = criterion.text.strip() if criterion.text else "N/A" + if criterion_title == "Zimmer": + rooms = value + elif criterion_title == "Fläche": + size = value + elif criterion_title == "Gesamtmiete": + rent = value + elif criterion_title == "Verfügbar ab": + availability = value + + # link create google maps link with lat and long + link = f"https://www.google.com/maps/search/?api=1&query={lat},{lon}" + + # https://portal1s.easysquare.com/prorex/xmlforms/image.jpg?application=ESQ_IA_REOBJ&command=action&id=1EC8D4E6-191A-A827-47FF-72D8C5379070&name=get + image_url = f"https://portal1s.easysquare.com/prorex/xmlforms/image.jpg?application=ESQ_IA_REOBJ&command=action&id={iamge_resourceId}&name=get" + + properties.append({ + "id": id, + "title": prop_title, + "subtitle": subtitle, + "rooms": format.format_room(rooms), + "size": format.format_roomSize(size), + "rent": format.format_money(rent), + "link": link, + "abstract": abstract, + "warm_rent": "", + "availability": format.format_date(availability), + "image_url": image_url, + }) + + return properties \ No newline at end of file diff --git a/src/wogetra/__pycache__/scraper.cpython-310.pyc b/src/wogetra/__pycache__/scraper.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2c24036bbb601659f3e87dcd373c52e32deb0af2 GIT binary patch literal 1405 zcmZ`(y^q^A6elUkmMvRn`NOU&Dru!YIOO;HQ>3OkS zl;6yiTF6JE(Q36iT@^E~1U+SZlw?^kPSS+uCpTPVR9vfNK9f|aQI>GUM-%ICKL)@Y z!F&c&{)Q1!XpI%w;0h^6qbmsFS5je(uRKN=g!Yb9WQHn7lkd^be^#z`uNur_joRYX z5@F3+YG_ZKXm8zM{s!lmp{lvFw`#ku-8E)y7Hmk>0_-#GT{YMK1_8EHwRJD);*+0l~Ij&U()3STe zGODN{2{Wkl&x+43>YoDO&Uf(0oeMsFj~|o+i#?%vN#|)?Ld8K;gt5{&8$B(1wQ*I< z^Jz-?WX{XuT9b37iffvxx(v&33!@_2%3GgMvePA>jj4#~co>K#)TQyZ(fWtDJy8^y zG_FiaZd}20W!yB$uOpw1CG^2WnbrzC$F-?LZ)lQI+Z5_HY9$$+@pviHVcYnEzg}{w zq;a5uOz_hWFP>k#dK&-u>ZP!~YMM5?*1tbZau(AxHLm51tN0sbJlHK+mrgI=TtZk9x->c@J?1ITz>zfVUB(j*C$N0ftA0=~>dXB(%h~x9D49oBa 50,28 m² +room = "B000000502800" + + + +iamge = scrape_image.scrape_image("https://portal1s.easysquare.com/prorex/xmlforms/image.jpg?application=ESQ_IA_REOBJ&command=action&id=1EC8D4E6-191A-A827-47FF-72D8C5379070&name=get") + +# save image +with open(f"image_{room}.jpg", "wb") as file: + file.write(iamge)