From 31c27635d395f928659c839481e5b0afd8f1e97e Mon Sep 17 00:00:00 2001 From: masterElmar <18119527+masterElmar@users.noreply.github.com> Date: Tue, 12 Dec 2023 13:16:10 +0100 Subject: [PATCH] fix:#82 fixed fetching with main page fetch for sport course link list --- backend/sport/main.go | 119 ++++++++++++----------------- backend/sport/sportFetcherModel.go | 49 ++++++++++++ 2 files changed, 98 insertions(+), 70 deletions(-) create mode 100644 backend/sport/sportFetcherModel.go diff --git a/backend/sport/main.go b/backend/sport/main.go index 31164b2..99657ce 100644 --- a/backend/sport/main.go +++ b/backend/sport/main.go @@ -11,55 +11,80 @@ import ( "github.com/PuerkitoBio/goquery" ) -// just to test the code -// @TODO: remove this +// @TODO: reformat the extracted data to the event model that there are events with real start and end dates // @TODO: add tests // @TODO: add it to the service // @TODO: make it like a cron job to fetch the sport courses once a week func main() { - events := fetchAllHtwkSportCourses() + + var sportCourseLinks = fetchAllAvailableSportCourses() + events := fetchHTWKSportCourses(sportCourseLinks) + for _, event := range events { - print(event.Title) + println(event.Title) } } -// fetchAllHtwkSportCourses fetches all sport courses from the htwk sport website. -// It iterates over all ids from 0 to 9999 and tries to fetch the sport course. -// If the sport course does not exist, it will continue with the next id. -// If the sport course exists, it will be added to the events slice. -// Since the ids are not consecutive, it will take a while to fetch all sport courses. 
-// @TODO: find the highest id and iterate over all ids from 0 to highest id -func fetchAllHtwkSportCourses() []Event { +// fetch the main page where all sport courses are listed and extract all links to the sport courses +func fetchAllAvailableSportCourses() []string { + var url = "https://sport.htwk-leipzig.de/sportangebote" + + var doc, err = htmlRequest(url) + + if err != nil { + return nil + } + + // link list of all sport courses + var links []string + + // find all links to sport courses with regex https://sport.htwk-leipzig.de/sportangebote/detail/sport/ + [0-9]{1,4} + doc.Find("a[href]").Each(func(i int, s *goquery.Selection) { + link, _ := s.Attr("href") + if strings.HasPrefix(link, "/sportangebote/detail/sport/") { + links = append(links, link) + } + }) + + return links +} + +// fetchHTWKSportCourses fetches all sport courses from the given links. +// to speed up the process, it uses multithreading. + +func fetchHTWKSportCourses(links []string) []Event { //multithreaded webpage requests to speed up the process - var maxPageID = 9999 - var maxThreads = 300 - var htmlPageArray = make([]*goquery.Document, maxPageID) - var url = "https://sport.htwk-leipzig.de/sportangebote/detail/sport/" + var maxThreads = 10 + var htmlPageArray = make([]*goquery.Document, len(links)) + var hostUrl = "https://sport.htwk-leipzig.de" var wg sync.WaitGroup wg.Add(maxThreads) - for i := 0; i < maxThreads; i++ { go func(i int) { - defer wg.Done() - for j := i; j < maxPageID; j += maxThreads { - doc, err := htmlRequest(url + strconv.Itoa(j)) + for j := i; j < len(links); j += maxThreads { + doc, err := htmlRequest(hostUrl + links[j]) if err == nil { htmlPageArray[j] = doc } } + wg.Done() }(i) } - wg.Wait() - println("finished fetching all pages") - - //print count of all pages - var events []Event + + for _, doc := range htmlPageArray { + if doc != nil { + event, err := fetchHtwkSportCourse(doc) + if err == nil { + events = append(events, event...) 
+ } + } + } return events } @@ -172,49 +197,3 @@ func fetchHtwkSportCourse(doc *goquery.Document) ([]Event, error) { return events, nil } - -// MODELS - -// Event represents the overall event details. -type Event struct { - Title string - Details EventDetails - AdditionalNote string -} - -// EventDetails represents detailed information about the event. -type EventDetails struct { - DateRange DateRange - Cycle string - Gender string - CourseLead CourseLead - Location Location - Participants Participants - Cost string - Type string -} - -// DateRange represents a start and end date. -type DateRange struct { - Start time.Time - End time.Time -} - -// CourseLead represents a person with a name and a contact link. -type CourseLead struct { - Name string - Link string -} - -// Location represents the location of the event. -type Location struct { - Name string - Address string -} - -// Participants represents the participants' details. -type Participants struct { - Bookings int - TotalPlaces int - WaitList int -} diff --git a/backend/sport/sportFetcherModel.go b/backend/sport/sportFetcherModel.go new file mode 100644 index 0000000..785e582 --- /dev/null +++ b/backend/sport/sportFetcherModel.go @@ -0,0 +1,49 @@ +package main + +import "time" + +// MODELS + +// Event represents the overall event details. +type Event struct { + Title string + Details EventDetails + AdditionalNote string +} + +// EventDetails represents detailed information about the event. +type EventDetails struct { + DateRange DateRange + Cycle string + Gender string + CourseLead CourseLead + Location Location + Participants Participants + Cost string + Type string +} + +// DateRange represents a start and end date. +type DateRange struct { + Start time.Time + End time.Time +} + +// CourseLead represents a person with a name and a contact link. +type CourseLead struct { + Name string + Link string +} + +// Location represents the location of the event. 
+type Location struct { + Name string + Address string +} + +// Participants represents the participants' details. +type Participants struct { + Bookings int + TotalPlaces int + WaitList int +}