fix:#82 fixed fetching with main page fetch for sport course link list

This commit is contained in:
masterElmar
2023-12-12 13:16:10 +01:00
parent 184dc70be4
commit 31c27635d3
2 changed files with 98 additions and 70 deletions

View File

@@ -11,55 +11,80 @@ import (
"github.com/PuerkitoBio/goquery"
)
// just to test the code
// @TODO: remove this
// @TODO: convert the extracted data into the event model so that events have real start and end dates
// @TODO: add tests
// @TODO: add it to the service
// @TODO: make it like a cron job to fetch the sport courses once a week
// main is a temporary manual entry point: it obtains the sport course events
// and prints the title of each one.
// NOTE(review): this listing shows removed and added diff lines side by side
// (two `events` assignments, print and println), so it is not the final
// function body as written — confirm against the committed file.
func main() {
events := fetchAllHtwkSportCourses()
var sportCourseLinks = fetchAllAvailableSportCourses()
events := fetchHTWKSportCourses(sportCourseLinks)
for _, event := range events {
print(event.Title)
println(event.Title)
}
}
// fetchAllHtwkSportCourses fetches all sport courses from the htwk sport website.
// It iterates over all ids from 0 to 9999 and tries to fetch the sport course.
// If the sport course does not exist, it will continue with the next id.
// If the sport course exists, it will be added to the events slice.
// Since the ids are not consecutive, it will take a while to fetch all sport courses.
// @TODO: find the highest id and iterate over all ids from 0 to highest id
func fetchAllHtwkSportCourses() []Event {
// fetchAllAvailableSportCourses requests the sport offer overview page and
// returns the href of every anchor that points to a course detail page, i.e.
// every href starting with "/sportangebote/detail/sport/".
//
// The returned links are host-relative paths in document order. Each distinct
// href is returned only once, so a course that is referenced by several
// anchors on the overview page is not fetched and parsed repeatedly by callers.
// If the overview page cannot be requested, the error is reported and nil is
// returned.
func fetchAllAvailableSportCourses() []string {
	const overviewURL = "https://sport.htwk-leipzig.de/sportangebote"
	const detailPrefix = "/sportangebote/detail/sport/"

	doc, err := htmlRequest(overviewURL)
	if err != nil {
		// The signature has no error result, so leave at least a trace
		// instead of failing silently.
		println("fetching sport course overview failed:", err.Error())
		return nil
	}

	var links []string
	seen := make(map[string]struct{})
	doc.Find("a[href]").Each(func(_ int, s *goquery.Selection) {
		link, ok := s.Attr("href")
		if !ok || !strings.HasPrefix(link, detailPrefix) {
			return
		}
		if _, dup := seen[link]; dup {
			return
		}
		seen[link] = struct{}{}
		links = append(links, link)
	})
	return links
}
// fetchHTWKSportCourses requests the page behind every given link and parses
// the fetched pages into events.
// Each link is appended to the host URL, so links are expected to be
// host-relative paths.
// To speed up the process, the requests are spread over maxThreads goroutines.
// NOTE(review): this listing interleaves removed and added diff lines
// (maxThreads and htmlPageArray are declared twice, the inner for header
// appears twice, and both a deferred and a trailing wg.Done() are present);
// confirm the final body against the committed file.
func fetchHTWKSportCourses(links []string) []Event {
// concurrent page requests to speed up the process
var maxPageID = 9999
var maxThreads = 300
var htmlPageArray = make([]*goquery.Document, maxPageID)
var url = "https://sport.htwk-leipzig.de/sportangebote/detail/sport/"
var maxThreads = 10
var htmlPageArray = make([]*goquery.Document, len(links))
var hostUrl = "https://sport.htwk-leipzig.de"
var wg sync.WaitGroup
wg.Add(maxThreads)
for i := 0; i < maxThreads; i++ {
go func(i int) {
defer wg.Done()
// worker i handles the indices i, i+maxThreads, i+2*maxThreads, ...,
// so no two goroutines write to the same slot of htmlPageArray
for j := i; j < maxPageID; j += maxThreads {
doc, err := htmlRequest(url + strconv.Itoa(j))
for j := i; j < len(links); j += maxThreads {
doc, err := htmlRequest(hostUrl + links[j])
// pages that fail to load keep their nil slot and are skipped below
if err == nil {
htmlPageArray[j] = doc
}
}
wg.Done()
}(i)
}
// block until every worker has finished its share of the requests
wg.Wait()
println("finished fetching all pages")
// parse every successfully fetched page; pages whose parsing fails are skipped
var events []Event
for _, doc := range htmlPageArray {
if doc != nil {
event, err := fetchHtwkSportCourse(doc)
if err == nil {
events = append(events, event...)
}
}
}
return events
}
@@ -172,49 +197,3 @@ func fetchHtwkSportCourse(doc *goquery.Document) ([]Event, error) {
return events, nil
}
// MODELS
// Event is the top-level record of a single event: its title, the structured
// EventDetails and an additional note.
type Event struct {
	// Title is the title of the event.
	Title string
	// Details holds the structured details of the event.
	Details EventDetails
	// AdditionalNote is an extra note attached to the event.
	AdditionalNote string
}
// EventDetails groups the detailed information that belongs to an Event:
// date range, cycle, gender, course lead, location, participants, cost and
// type.
type EventDetails struct {
	DateRange     DateRange
	Cycle, Gender string
	CourseLead    CourseLead
	Location      Location
	Participants  Participants
	Cost, Type    string
}
// DateRange is a pair of points in time: where a range starts and where it
// ends.
type DateRange struct {
	Start, End time.Time
}
// CourseLead describes a person by name together with a contact link.
type CourseLead struct {
	Name, Link string
}
// Location describes where an event takes place, as a name plus an address.
type Location struct {
	Name, Address string
}
// Participants carries the participant figures of an event: bookings, total
// places and wait list, each as an integer.
type Participants struct {
	Bookings, TotalPlaces, WaitList int
}

View File

@@ -0,0 +1,49 @@
package main
import "time"
// MODELS
// Event is the top-level record of a single event: its title, the structured
// EventDetails and an additional note.
type Event struct {
	// Title is the title of the event.
	Title string
	// Details holds the structured details of the event.
	Details EventDetails
	// AdditionalNote is an extra note attached to the event.
	AdditionalNote string
}
// EventDetails groups the detailed information that belongs to an Event:
// date range, cycle, gender, course lead, location, participants, cost and
// type.
type EventDetails struct {
	DateRange     DateRange
	Cycle, Gender string
	CourseLead    CourseLead
	Location      Location
	Participants  Participants
	Cost, Type    string
}
// DateRange is a pair of points in time: where a range starts and where it
// ends.
type DateRange struct {
	Start, End time.Time
}
// CourseLead describes a person by name together with a contact link.
type CourseLead struct {
	Name, Link string
}
// Location describes where an event takes place, as a name plus an address.
type Location struct {
	Name, Address string
}
// Participants carries the participant figures of an event: bookings, total
// places and wait list, each as an integer.
type Participants struct {
	Bookings, TotalPlaces, WaitList int
}