mirror of
https://gitlab.dit.htwk-leipzig.de/htwk-software/htwkalender.git
synced 2025-08-03 10:19:14 +02:00
fix:#82 fixed fetching with main page fetch for sport course link list
This commit is contained in:
@@ -11,55 +11,80 @@ import (
|
|||||||
"github.com/PuerkitoBio/goquery"
|
"github.com/PuerkitoBio/goquery"
|
||||||
)
|
)
|
||||||
|
|
||||||
// just to test the code
|
// @TODO: reformat the extracted data into the event model so that events have real start and end dates
|
||||||
// @TODO: remove this
|
|
||||||
// @TODO: add tests
|
// @TODO: add tests
|
||||||
// @TODO: add it to the service
|
// @TODO: add it to the service
|
||||||
// @TODO: make it like a cron job to fetch the sport courses once a week
|
// @TODO: make it like a cron job to fetch the sport courses once a week
|
||||||
func main() {
|
func main() {
|
||||||
events := fetchAllHtwkSportCourses()
|
|
||||||
|
var sportCourseLinks = fetchAllAvailableSportCourses()
|
||||||
|
events := fetchHTWKSportCourses(sportCourseLinks)
|
||||||
|
|
||||||
for _, event := range events {
|
for _, event := range events {
|
||||||
print(event.Title)
|
println(event.Title)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// fetchAllHtwkSportCourses fetches all sport courses from the htwk sport website.
|
// fetchAllAvailableSportCourses fetches the main page where all sport courses are listed and extracts all links to the sport courses.
|
||||||
// It iterates over all ids from 0 to 9999 and tries to fetch the sport course.
|
func fetchAllAvailableSportCourses() []string {
|
||||||
// If the sport course does not exist, it will continue with the next id.
|
var url = "https://sport.htwk-leipzig.de/sportangebote"
|
||||||
// If the sport course exists, it will be added to the events slice.
|
|
||||||
// Since the ids are not consecutive, it will take a while to fetch all sport courses.
|
var doc, err = htmlRequest(url)
|
||||||
// @TODO: find the highest id and iterate over all ids from 0 to highest id
|
|
||||||
func fetchAllHtwkSportCourses() []Event {
|
if err != nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// link list of all sport courses
|
||||||
|
var links []string
|
||||||
|
|
||||||
|
// collect all links to sport courses: relative hrefs that start with /sportangebote/detail/sport/ (matched by prefix, not by regex)
|
||||||
|
doc.Find("a[href]").Each(func(i int, s *goquery.Selection) {
|
||||||
|
link, _ := s.Attr("href")
|
||||||
|
if strings.HasPrefix(link, "/sportangebote/detail/sport/") {
|
||||||
|
links = append(links, link)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
return links
|
||||||
|
}
|
||||||
|
|
||||||
|
// fetchHTWKSportCourses fetches all sport courses from the given links.
|
||||||
|
// To speed up the process, it requests the pages concurrently from several goroutines.
|
||||||
|
|
||||||
|
func fetchHTWKSportCourses(links []string) []Event {
|
||||||
|
|
||||||
//multithreaded webpage requests to speed up the process
|
//multithreaded webpage requests to speed up the process
|
||||||
|
|
||||||
var maxPageID = 9999
|
var maxThreads = 10
|
||||||
var maxThreads = 300
|
var htmlPageArray = make([]*goquery.Document, len(links))
|
||||||
var htmlPageArray = make([]*goquery.Document, maxPageID)
|
var hostUrl = "https://sport.htwk-leipzig.de"
|
||||||
var url = "https://sport.htwk-leipzig.de/sportangebote/detail/sport/"
|
|
||||||
|
|
||||||
var wg sync.WaitGroup
|
var wg sync.WaitGroup
|
||||||
wg.Add(maxThreads)
|
wg.Add(maxThreads)
|
||||||
|
|
||||||
for i := 0; i < maxThreads; i++ {
|
for i := 0; i < maxThreads; i++ {
|
||||||
go func(i int) {
|
go func(i int) {
|
||||||
defer wg.Done()
|
for j := i; j < len(links); j += maxThreads {
|
||||||
for j := i; j < maxPageID; j += maxThreads {
|
doc, err := htmlRequest(hostUrl + links[j])
|
||||||
doc, err := htmlRequest(url + strconv.Itoa(j))
|
|
||||||
if err == nil {
|
if err == nil {
|
||||||
htmlPageArray[j] = doc
|
htmlPageArray[j] = doc
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
wg.Done()
|
||||||
}(i)
|
}(i)
|
||||||
}
|
}
|
||||||
|
|
||||||
wg.Wait()
|
wg.Wait()
|
||||||
|
|
||||||
println("finished fetching all pages")
|
|
||||||
|
|
||||||
//print count of all pages
|
|
||||||
|
|
||||||
var events []Event
|
var events []Event
|
||||||
|
|
||||||
|
for _, doc := range htmlPageArray {
|
||||||
|
if doc != nil {
|
||||||
|
event, err := fetchHtwkSportCourse(doc)
|
||||||
|
if err == nil {
|
||||||
|
events = append(events, event...)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
return events
|
return events
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -172,49 +197,3 @@ func fetchHtwkSportCourse(doc *goquery.Document) ([]Event, error) {
|
|||||||
|
|
||||||
return events, nil
|
return events, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// MODELS
|
|
||||||
|
|
||||||
// Event represents the overall event details.
|
|
||||||
type Event struct {
|
|
||||||
Title string
|
|
||||||
Details EventDetails
|
|
||||||
AdditionalNote string
|
|
||||||
}
|
|
||||||
|
|
||||||
// EventDetails represents detailed information about the event.
|
|
||||||
type EventDetails struct {
|
|
||||||
DateRange DateRange
|
|
||||||
Cycle string
|
|
||||||
Gender string
|
|
||||||
CourseLead CourseLead
|
|
||||||
Location Location
|
|
||||||
Participants Participants
|
|
||||||
Cost string
|
|
||||||
Type string
|
|
||||||
}
|
|
||||||
|
|
||||||
// DateRange represents a start and end date.
|
|
||||||
type DateRange struct {
|
|
||||||
Start time.Time
|
|
||||||
End time.Time
|
|
||||||
}
|
|
||||||
|
|
||||||
// CourseLead represents a person with a name and a contact link.
|
|
||||||
type CourseLead struct {
|
|
||||||
Name string
|
|
||||||
Link string
|
|
||||||
}
|
|
||||||
|
|
||||||
// Location represents the location of the event.
|
|
||||||
type Location struct {
|
|
||||||
Name string
|
|
||||||
Address string
|
|
||||||
}
|
|
||||||
|
|
||||||
// Participants represents the participants' details.
|
|
||||||
type Participants struct {
|
|
||||||
Bookings int
|
|
||||||
TotalPlaces int
|
|
||||||
WaitList int
|
|
||||||
}
|
|
||||||
|
49
backend/sport/sportFetcherModel.go
Normal file
49
backend/sport/sportFetcherModel.go
Normal file
@@ -0,0 +1,49 @@
|
|||||||
|
package main
|
||||||
|
|
||||||
|
import "time"
|
||||||
|
|
||||||
|
// MODELS
|
||||||
|
|
||||||
|
// Event represents the overall event details.
|
||||||
|
type Event struct {
|
||||||
|
Title string
|
||||||
|
Details EventDetails
|
||||||
|
AdditionalNote string
|
||||||
|
}
|
||||||
|
|
||||||
|
// EventDetails represents detailed information about the event.
|
||||||
|
type EventDetails struct {
|
||||||
|
DateRange DateRange
|
||||||
|
Cycle string
|
||||||
|
Gender string
|
||||||
|
CourseLead CourseLead
|
||||||
|
Location Location
|
||||||
|
Participants Participants
|
||||||
|
Cost string
|
||||||
|
Type string
|
||||||
|
}
|
||||||
|
|
||||||
|
// DateRange represents a start and end date.
|
||||||
|
type DateRange struct {
|
||||||
|
Start time.Time
|
||||||
|
End time.Time
|
||||||
|
}
|
||||||
|
|
||||||
|
// CourseLead represents a person with a name and a contact link.
|
||||||
|
type CourseLead struct {
|
||||||
|
Name string
|
||||||
|
Link string
|
||||||
|
}
|
||||||
|
|
||||||
|
// Location represents the location of the event.
|
||||||
|
type Location struct {
|
||||||
|
Name string
|
||||||
|
Address string
|
||||||
|
}
|
||||||
|
|
||||||
|
// Participants represents the participants' details.
|
||||||
|
type Participants struct {
|
||||||
|
Bookings int
|
||||||
|
TotalPlaces int
|
||||||
|
WaitList int
|
||||||
|
}
|
Reference in New Issue
Block a user