mirror of
https://gitlab.dit.htwk-leipzig.de/htwk-software/htwkalender.git
synced 2025-08-02 17:59:14 +02:00
fix:#82 fixed fetching with main page fetch for sport course link list
This commit is contained in:
@@ -11,55 +11,80 @@ import (
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
)
|
||||
|
||||
// just to test the code
|
||||
// @TODO: remove this
|
||||
// @TODO: reformat the extracted data into the event model so that events have real start and end dates
|
||||
// @TODO: add tests
|
||||
// @TODO: add it to the service
|
||||
// @TODO: run it as a cron job that fetches the sport courses once a week
|
||||
func main() {
|
||||
events := fetchAllHtwkSportCourses()
|
||||
|
||||
var sportCourseLinks = fetchAllAvailableSportCourses()
|
||||
events := fetchHTWKSportCourses(sportCourseLinks)
|
||||
|
||||
for _, event := range events {
|
||||
print(event.Title)
|
||||
println(event.Title)
|
||||
}
|
||||
}
|
||||
|
||||
// fetchAllHtwkSportCourses fetches all sport courses from the htwk sport website.
|
||||
// It iterates over all ids from 0 to 9999 and tries to fetch the sport course.
|
||||
// If the sport course does not exist, it will continue with the next id.
|
||||
// If the sport course exists, it will be added to the events slice.
|
||||
// Since the ids are not consecutive, it will take a while to fetch all sport courses.
|
||||
// @TODO: find the highest id and iterate over all ids from 0 to highest id
|
||||
func fetchAllHtwkSportCourses() []Event {
|
||||
// fetchAllAvailableSportCourses fetches the main page where all sport courses are listed and extracts all links to the sport courses; it returns nil if the page request fails.
|
||||
func fetchAllAvailableSportCourses() []string {
|
||||
var url = "https://sport.htwk-leipzig.de/sportangebote"
|
||||
|
||||
var doc, err = htmlRequest(url)
|
||||
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
// link list of all sport courses
|
||||
var links []string
|
||||
|
||||
// collect all links to sport courses: hrefs that start with the relative path /sportangebote/detail/sport/ (plain prefix match, no regex)
|
||||
doc.Find("a[href]").Each(func(i int, s *goquery.Selection) {
|
||||
link, _ := s.Attr("href")
|
||||
if strings.HasPrefix(link, "/sportangebote/detail/sport/") {
|
||||
links = append(links, link)
|
||||
}
|
||||
})
|
||||
|
||||
return links
|
||||
}
|
||||
|
||||
// fetchHTWKSportCourses fetches all sport courses from the given links.
|
||||
// To speed up the process, the pages are requested concurrently from several goroutines.
|
||||
|
||||
func fetchHTWKSportCourses(links []string) []Event {
|
||||
|
||||
// multithreaded webpage requests to speed up the process
|
||||
|
||||
var maxPageID = 9999
|
||||
var maxThreads = 300
|
||||
var htmlPageArray = make([]*goquery.Document, maxPageID)
|
||||
var url = "https://sport.htwk-leipzig.de/sportangebote/detail/sport/"
|
||||
var maxThreads = 10
|
||||
var htmlPageArray = make([]*goquery.Document, len(links))
|
||||
var hostUrl = "https://sport.htwk-leipzig.de"
|
||||
|
||||
var wg sync.WaitGroup
|
||||
wg.Add(maxThreads)
|
||||
|
||||
for i := 0; i < maxThreads; i++ {
|
||||
go func(i int) {
|
||||
defer wg.Done()
|
||||
for j := i; j < maxPageID; j += maxThreads {
|
||||
doc, err := htmlRequest(url + strconv.Itoa(j))
|
||||
for j := i; j < len(links); j += maxThreads {
|
||||
doc, err := htmlRequest(hostUrl + links[j])
|
||||
if err == nil {
|
||||
htmlPageArray[j] = doc
|
||||
}
|
||||
}
|
||||
wg.Done()
|
||||
}(i)
|
||||
}
|
||||
|
||||
wg.Wait()
|
||||
|
||||
println("finished fetching all pages")
|
||||
|
||||
// @TODO: print count of all fetched pages (not implemented yet)
|
||||
|
||||
var events []Event
|
||||
|
||||
for _, doc := range htmlPageArray {
|
||||
if doc != nil {
|
||||
event, err := fetchHtwkSportCourse(doc)
|
||||
if err == nil {
|
||||
events = append(events, event...)
|
||||
}
|
||||
}
|
||||
}
|
||||
return events
|
||||
}
|
||||
|
||||
@@ -172,49 +197,3 @@ func fetchHtwkSportCourse(doc *goquery.Document) ([]Event, error) {
|
||||
|
||||
return events, nil
|
||||
}
|
||||
|
||||
// MODELS
|
||||
|
||||
// Event represents the overall event details.
|
||||
type Event struct {
|
||||
Title string
|
||||
Details EventDetails
|
||||
AdditionalNote string
|
||||
}
|
||||
|
||||
// EventDetails represents detailed information about the event.
|
||||
type EventDetails struct {
|
||||
DateRange DateRange
|
||||
Cycle string
|
||||
Gender string
|
||||
CourseLead CourseLead
|
||||
Location Location
|
||||
Participants Participants
|
||||
Cost string
|
||||
Type string
|
||||
}
|
||||
|
||||
// DateRange represents a start and end date.
|
||||
type DateRange struct {
|
||||
Start time.Time
|
||||
End time.Time
|
||||
}
|
||||
|
||||
// CourseLead represents a person with a name and a contact link.
|
||||
type CourseLead struct {
|
||||
Name string
|
||||
Link string
|
||||
}
|
||||
|
||||
// Location represents the location of the event.
|
||||
type Location struct {
|
||||
Name string
|
||||
Address string
|
||||
}
|
||||
|
||||
// Participants represents the participants' details.
|
||||
type Participants struct {
|
||||
Bookings int
|
||||
TotalPlaces int
|
||||
WaitList int
|
||||
}
|
||||
|
49
backend/sport/sportFetcherModel.go
Normal file
49
backend/sport/sportFetcherModel.go
Normal file
@@ -0,0 +1,49 @@
|
||||
package main
|
||||
|
||||
import "time"
|
||||
|
||||
// MODELS
|
||||
|
||||
// Event represents the overall event details.
|
||||
type Event struct {
|
||||
Title string
|
||||
Details EventDetails
|
||||
AdditionalNote string
|
||||
}
|
||||
|
||||
// EventDetails represents detailed information about the event.
|
||||
type EventDetails struct {
|
||||
DateRange DateRange
|
||||
Cycle string
|
||||
Gender string
|
||||
CourseLead CourseLead
|
||||
Location Location
|
||||
Participants Participants
|
||||
Cost string
|
||||
Type string
|
||||
}
|
||||
|
||||
// DateRange represents a start and end date.
|
||||
type DateRange struct {
|
||||
Start time.Time
|
||||
End time.Time
|
||||
}
|
||||
|
||||
// CourseLead represents a person with a name and a contact link.
|
||||
type CourseLead struct {
|
||||
Name string
|
||||
Link string
|
||||
}
|
||||
|
||||
// Location represents the location of the event.
|
||||
type Location struct {
|
||||
Name string
|
||||
Address string
|
||||
}
|
||||
|
||||
// Participants represents the participants' details.
|
||||
type Participants struct {
|
||||
Bookings int
|
||||
TotalPlaces int
|
||||
WaitList int
|
||||
}
|
Reference in New Issue
Block a user