fix:#82 fixed fetching with main page fetch for sport course link list

This commit is contained in:
masterElmar
2023-12-12 13:16:10 +01:00
parent 184dc70be4
commit 31c27635d3
2 changed files with 98 additions and 70 deletions

View File

@@ -11,55 +11,80 @@ import (
"github.com/PuerkitoBio/goquery" "github.com/PuerkitoBio/goquery"
) )
// just to test the code // @TODO: reformat the extracted data to the event model that there are events with real start and end dates
// @TODO: remove this
// @TODO: add tests // @TODO: add tests
// @TODO: add it to the service // @TODO: add it to the service
// @TODO: make it like a cron job to fetch the sport courses once a week // @TODO: make it like a cron job to fetch the sport courses once a week
func main() { func main() {
events := fetchAllHtwkSportCourses()
var sportCourseLinks = fetchAllAvailableSportCourses()
events := fetchHTWKSportCourses(sportCourseLinks)
for _, event := range events { for _, event := range events {
print(event.Title) println(event.Title)
} }
} }
// fetchAllHtwkSportCourses fetches all sport courses from the htwk sport website. // fetch the main page where all sport courses are listed and extract all links to the sport courses
// It iterates over all ids from 0 to 9999 and tries to fetch the sport course. func fetchAllAvailableSportCourses() []string {
// If the sport course does not exist, it will continue with the next id. var url = "https://sport.htwk-leipzig.de/sportangebote"
// If the sport course exists, it will be added to the events slice.
// Since the ids are not consecutive, it will take a while to fetch all sport courses. var doc, err = htmlRequest(url)
// @TODO: find the highest id and iterate over all ids from 0 to highest id
func fetchAllHtwkSportCourses() []Event { if err != nil {
return nil
}
// link list of all sport courses
var links []string
// collect every link whose href starts with /sportangebote/detail/sport/ (plain prefix match, no regex)
doc.Find("a[href]").Each(func(i int, s *goquery.Selection) {
link, _ := s.Attr("href")
if strings.HasPrefix(link, "/sportangebote/detail/sport/") {
links = append(links, link)
}
})
return links
}
// fetchHTWKSportCourses fetches all sport courses from the given links.
// to speed up the process, it uses multithreading.
func fetchHTWKSportCourses(links []string) []Event {
//multithreaded webpage requests to speed up the process //multithreaded webpage requests to speed up the process
var maxPageID = 9999 var maxThreads = 10
var maxThreads = 300 var htmlPageArray = make([]*goquery.Document, len(links))
var htmlPageArray = make([]*goquery.Document, maxPageID) var hostUrl = "https://sport.htwk-leipzig.de"
var url = "https://sport.htwk-leipzig.de/sportangebote/detail/sport/"
var wg sync.WaitGroup var wg sync.WaitGroup
wg.Add(maxThreads) wg.Add(maxThreads)
for i := 0; i < maxThreads; i++ { for i := 0; i < maxThreads; i++ {
go func(i int) { go func(i int) {
defer wg.Done() for j := i; j < len(links); j += maxThreads {
for j := i; j < maxPageID; j += maxThreads { doc, err := htmlRequest(hostUrl + links[j])
doc, err := htmlRequest(url + strconv.Itoa(j))
if err == nil { if err == nil {
htmlPageArray[j] = doc htmlPageArray[j] = doc
} }
} }
wg.Done()
}(i) }(i)
} }
wg.Wait() wg.Wait()
println("finished fetching all pages")
//print count of all pages
var events []Event var events []Event
for _, doc := range htmlPageArray {
if doc != nil {
event, err := fetchHtwkSportCourse(doc)
if err == nil {
events = append(events, event...)
}
}
}
return events return events
} }
@@ -172,49 +197,3 @@ func fetchHtwkSportCourse(doc *goquery.Document) ([]Event, error) {
return events, nil return events, nil
} }
// MODELS
// Event represents the overall event details.
type Event struct {
Title string
Details EventDetails
AdditionalNote string
}
// EventDetails represents detailed information about the event.
type EventDetails struct {
DateRange DateRange
Cycle string
Gender string
CourseLead CourseLead
Location Location
Participants Participants
Cost string
Type string
}
// DateRange represents a start and end date.
type DateRange struct {
Start time.Time
End time.Time
}
// CourseLead represents a person with a name and a contact link.
type CourseLead struct {
Name string
Link string
}
// Location represents the location of the event.
type Location struct {
Name string
Address string
}
// Participants represents the participants' details.
type Participants struct {
Bookings int
TotalPlaces int
WaitList int
}

View File

@@ -0,0 +1,49 @@
package main
import "time"
// MODELS
// Event represents the overall event details.
// It combines a title, the structured EventDetails and a free-text note.
type Event struct {
// Title is the name of the event.
Title string
// Details holds the structured attributes of the event (see EventDetails).
Details EventDetails
// AdditionalNote is free-form text attached to the event.
// NOTE(review): presumably extra remarks taken from the course page — confirm against the parser.
AdditionalNote string
}
// EventDetails bundles the structured attributes that describe one event.
//
// DateRange, CourseLead, Location and Participants are nested models;
// Cycle, Gender, Cost and Type are stored as plain strings without any
// validation in this type.
// NOTE(review): Cycle presumably describes how often the event recurs and
// Type the kind of offer — confirm against the code that fills these fields.
type EventDetails struct {
	DateRange     DateRange
	Cycle, Gender string
	CourseLead    CourseLead
	Location      Location
	Participants  Participants
	Cost, Type    string
}
// DateRange is a span of time delimited by two instants.
//
// Start is the beginning and End the end of the span. The type itself does
// not enforce that Start lies before End.
type DateRange struct {
	Start, End time.Time
}
// CourseLead identifies the person leading a course.
//
// Name is the person's name and Link a contact link; both are plain strings
// and are not validated by this type.
type CourseLead struct {
	Name, Link string
}
// Location describes where an event takes place.
//
// Name is the label of the place and Address its address; both are stored
// as plain strings.
type Location struct {
	Name, Address string
}
// Participants holds the participant counters of an event.
//
// All three fields are plain ints.
// NOTE(review): presumably Bookings is the number of places already taken,
// TotalPlaces the capacity and WaitList the length of the waiting list —
// confirm against the code that fills them.
type Participants struct {
	Bookings, TotalPlaces, WaitList int
}