Files
htwkalender/backend/service/fetch/sport/sportFetcher.go
2023-12-13 12:11:06 +01:00

474 lines
14 KiB
Go

package sport
import (
"errors"
"github.com/google/uuid"
"github.com/pocketbase/pocketbase"
"github.com/pocketbase/pocketbase/tools/types"
"htwkalender/model"
"htwkalender/service/db"
"htwkalender/service/functions"
"net/http"
"regexp"
"strconv"
"strings"
"sync"
"time"
"github.com/PuerkitoBio/goquery"
)
// FetchAndUpdateSportEvents scrapes the HTWK sport course pages, converts the
// courses into calendar events, drops every event that starts on the same
// calendar day as a holiday entry and replaces the stored "Sport" events of
// the current semester with the result.
//
// It returns the events as returned by db.SaveEvents, or nil if any database
// step fails.
//
// @TODO: add tests
// @TODO: make it like a cron job to fetch the sport courses once a week
func FetchAndUpdateSportEvents(app *pocketbase.PocketBase) []model.Event {
	sportCourseLinks := fetchAllAvailableSportCourses()
	sportEntries := fetchHTWKSportCourses(sportCourseLinks)
	events := formatEntriesToEvents(sportEntries)

	// Find the time span covered by the events. The bounds are seeded from the
	// first event: nothing is ever before the zero time.Time, so comparing
	// against an unseeded earliestDate would leave it at year 1.
	var earliestDate, latestDate time.Time
	for i, event := range events {
		if i == 0 || event.Start.Time().Before(earliestDate) {
			earliestDate = event.Start.Time()
		}
		if i == 0 || event.End.Time().After(latestDate) {
			latestDate = event.End.Time()
		}
	}

	// get all events from database where name = Feiertage und lehrveranstaltungsfreie Tage
	holidays, err := db.GetAllModulesByNameAndDateRange(app, "Feiertage und lehrveranstaltungsfreie Tage", earliestDate, latestDate)
	if err != nil {
		return nil
	}

	// Collect the calendar days on which a holiday starts.
	type calendarDay struct {
		year  int
		month time.Month
		day   int
	}
	holidayDays := make(map[calendarDay]struct{}, len(holidays))
	for _, holiday := range holidays {
		year, month, day := holiday.Start.Time().Date()
		holidayDays[calendarDay{year, month, day}] = struct{}{}
	}

	// Keep only events that do not start on a holiday. Filtering into a fresh
	// view of the slice avoids deleting elements while ranging over them,
	// which skips the element following every removal.
	kept := events[:0]
	for _, event := range events {
		year, month, day := event.Start.Time().Date()
		if _, isHoliday := holidayDays[calendarDay{year, month, day}]; !isHoliday {
			kept = append(kept, event)
		}
	}
	events = kept

	err = db.DeleteAllEventsForCourse(app, "Sport", functions.GetCurrentSemesterString())
	if err != nil {
		return nil
	}

	// save events to database
	savedEvents, err := db.SaveEvents(events, app)
	if err != nil {
		return nil
	}
	return savedEvents
}
// formatEntriesToEvents expands every sport entry into one model.Event per
// occurrence that getWeekEvents produces for the entry's date range and cycle.
// Start and end times are converted to UTC before they are stored.
func formatEntriesToEvents(entries []model.SportEntry) []model.Event {
	var events []model.Event
	for i, entry := range entries {
		eventStarts, eventEnds := getWeekEvents(entry.Details.DateRange.Start, entry.Details.DateRange.End, entry.Details.Cycle)

		// All occurrences of one entry share a UUID derived from the title,
		// the entry's index and its type, so it is computed once per entry.
		entryUUID := uuid.NewSHA1(uuid.NameSpaceDNS, []byte(entry.Title+strconv.FormatInt(int64(i), 10)+entry.Details.Type)).String()

		for j := range eventStarts {
			start, err := types.ParseDateTime(eventStarts[j].In(time.UTC))
			if err != nil {
				// Skip occurrences whose start cannot be represented.
				continue
			}
			end, err := types.ParseDateTime(eventEnds[j].In(time.UTC))
			if err != nil {
				continue
			}
			event := model.Event{
				UUID: entryUUID,
				// Use the weekday of this occurrence; the weekday of the
				// date-range start is unrelated to the day the course runs on.
				Day:       toGermanWeekdayString(eventStarts[j].Weekday()),
				Week:      strconv.Itoa(23),
				Start:     start,
				End:       end,
				Name:      entry.Title + " " + entry.Details.Type,
				EventType: entry.Details.Type,
				Prof:      entry.Details.CourseLead.Name,
				Rooms:     entry.Details.Location.Name,
				Notes:     entry.AdditionalNote,
				BookedAt:  "",
				Course:    "Sport",
				Semester:  checkSemester(entry.Details.DateRange.Start),
			}
			events = append(events, event)
		}
	}
	return events
}
// getDayInt maps a two-letter German weekday abbreviation ("Mo", "Di", ...)
// to its weekday number, where Sunday ("So") is 0 and Saturday ("Sa") is 6.
// Any other input yields 0.
func getDayInt(weekDay string) int {
	// The position of an abbreviation in this list is its weekday number.
	abbreviations := [...]string{"So", "Mo", "Di", "Mi", "Do", "Fr", "Sa"}
	for number, abbreviation := range abbreviations {
		if abbreviation == weekDay {
			return number
		}
	}
	return 0
}
func toGermanWeekdayString(weekday time.Weekday) string {
switch weekday {
case time.Monday:
return "Montag"
case time.Tuesday:
return "Dienstag"
case time.Wednesday:
return "Mittwoch"
case time.Thursday:
return "Donnerstag"
case time.Friday:
return "Freitag"
case time.Saturday:
return "Samstag"
case time.Sunday:
return "Sonntag"
default:
return ""
}
}
// cycleTimeRegExp matches a clock time written as HH:MM.
var cycleTimeRegExp = regexp.MustCompile("[0-9]{2}:[0-9]{2}")

// extractStartAndEndTime reads the first two HH:MM times from cycle and
// returns them as start hour, start minute, end hour and end minute.
// If cycle contains fewer than two times, all four values are 0.
func extractStartAndEndTime(cycle string) (int, int, int, int) {
	times := cycleTimeRegExp.FindAllString(cycle, 2)
	if len(times) < 2 {
		// Free-text cycles without a start and end time must not crash the fetch.
		return 0, 0, 0, 0
	}
	// The Atoi errors are ignored on purpose: the pattern guarantees that each
	// sliced field consists of exactly two ASCII digits.
	startHour, _ := strconv.Atoi(times[0][0:2])
	startMinute, _ := strconv.Atoi(times[0][3:5])
	endHour, _ := strconv.Atoi(times[1][0:2])
	endMinute, _ := strconv.Atoi(times[1][3:5])
	return startHour, startMinute, endHour, endMinute
}
func getWeekEvents(start time.Time, end time.Time, cycle string) ([]time.Time, []time.Time) {
var weekEvents []model.SportDayStartEnd
// split by regexp to get the cycle parts
var cycleParts []string
cycleParts = splitByCommaWithTime(cycle)
for _, cyclePart := range cycleParts {
//cut string at the first integer/number
cyclePartWithDaysOnly := cyclePart[0:strings.IndexFunc(cyclePart, func(r rune) bool { return r >= '0' && r <= '9' })]
// check if cycle has multiple days by checking if it has a plus sign
if strings.Contains(cyclePartWithDaysOnly, "+") {
// find all days in cycle part by regexp
dayRegExp, _ := regexp.Compile("[A-Z][a-z]")
days := dayRegExp.FindAllString(cyclePart, -1)
startHour, startMinute, endHour, endMinute := extractStartAndEndTime(cyclePart)
// creating a SportDayStartEnd for each day in the cycle
for _, day := range days {
weekEvents = append(weekEvents, model.SportDayStartEnd{
Start: time.Date(start.Year(), start.Month(), start.Day(), startHour, startMinute, 0, 0, start.Location()),
End: time.Date(end.Year(), end.Month(), end.Day(), endHour, endMinute, 0, 0, end.Location()),
Day: time.Weekday(getDayInt(day)),
})
}
}
// check if cycle has multiple days by checking if it has a minus sign
if strings.Contains(cyclePartWithDaysOnly, "-") {
// find all days in cycle part by regexp
dayRegExp, _ := regexp.Compile("[A-Z][a-z]")
days := dayRegExp.FindAllString(cyclePart, 2)
startHour, startMinute, endHour, endMinute := extractStartAndEndTime(cyclePart)
//create a int array with all days from start to end day
var daysBetween []int
for i := getDayInt(days[0]); i <= getDayInt(days[1]); i++ {
daysBetween = append(daysBetween, i)
}
// creating a SportDayStartEnd for each day in the cycle
for _, day := range daysBetween {
weekEvents = append(weekEvents, model.SportDayStartEnd{
Start: time.Date(start.Year(), start.Month(), start.Day(), startHour, startMinute, 0, 0, start.Location()),
End: time.Date(end.Year(), end.Month(), end.Day(), endHour, endMinute, 0, 0, end.Location()),
Day: time.Weekday(day),
})
}
}
// check if cycle has only one day
if !strings.Contains(cyclePartWithDaysOnly, "-") && !strings.Contains(cyclePartWithDaysOnly, "+") {
// find all days in cycle part by regexp
dayRegExp, _ := regexp.Compile("[A-Z][a-z]")
days := dayRegExp.FindAllString(cyclePart, -1)
startHour, startMinute, endHour, endMinute := extractStartAndEndTime(cyclePart)
// creating a SportDayStartEnd for each day in the cycle
for _, day := range days {
weekEvents = append(weekEvents, model.SportDayStartEnd{
Start: time.Date(start.Year(), start.Month(), start.Day(), startHour, startMinute, 0, 0, start.Location()),
End: time.Date(end.Year(), end.Month(), end.Day(), endHour, endMinute, 0, 0, end.Location()),
Day: time.Weekday(getDayInt(day)),
})
}
}
}
var startDatesList []time.Time
var endDatesList []time.Time
for _, weekEvent := range weekEvents {
startDates, endDates := createEventListFromStartToEndMatchingDay(weekEvent)
startDatesList = append(startDatesList, startDates...)
endDatesList = append(endDatesList, endDates...)
}
return startDatesList, endDatesList
}
// createEventListFromStartToEndMatchingDay walks day by day from
// weekEvent.Start up to (but not including) weekEvent.End and, for every day
// whose weekday equals weekEvent.Day, emits one occurrence. The occurrence
// starts at the time of day of weekEvent.Start and ends at the time of day of
// weekEvent.End. Both returned slices have the same length.
func createEventListFromStartToEndMatchingDay(weekEvent model.SportDayStartEnd) ([]time.Time, []time.Time) {
	var (
		startDates []time.Time
		endDates   []time.Time
	)

	startHour, startMinute := weekEvent.Start.Hour(), weekEvent.Start.Minute()
	endHour, endMinute := weekEvent.End.Hour(), weekEvent.End.Minute()

	current := weekEvent.Start
	for current.Before(weekEvent.End) {
		if current.Weekday() == weekEvent.Day {
			year, month, day := current.Date()
			location := current.Location()
			startDates = append(startDates, time.Date(year, month, day, startHour, startMinute, 0, 0, location))
			endDates = append(endDates, time.Date(year, month, day, endHour, endMinute, 0, 0, location))
		}
		current = current.AddDate(0, 0, 1)
	}
	return startDates, endDates
}
// splitByCommaWithTime splits input at ", " and regroups the pieces so that
// pieces without a "<days> HH:MM-HH:MM" time span are joined (again with ", ")
// to the next piece that has one. Pieces left over at the end without a
// following time span are returned as one final element.
func splitByCommaWithTime(input string) []string {
	// A day (or "+"-joined days) followed by a time span.
	dayWithTime := regexp.MustCompile(`([A-Za-z]{2,}(\+[A-Za-z]{2,})* \d{2}:\d{2}-\d{2}:\d{2})`)

	var grouped []string
	pending := "" // pieces collected so far that still wait for a time span
	for _, piece := range strings.Split(input, ", ") {
		candidate := piece
		if pending != "" {
			candidate = pending + ", " + piece
		}
		if dayWithTime.MatchString(piece) {
			// The time span completes the group.
			grouped = append(grouped, candidate)
			pending = ""
			continue
		}
		pending = candidate
	}

	// Flush whatever never received a time span.
	if pending != "" {
		grouped = append(grouped, pending)
	}
	return grouped
}
// checkSemester returns "ss" (summer semester) for dates in April through
// September and "ws" (winter semester) for every other month.
func checkSemester(date time.Time) string {
	switch date.Month() {
	case time.April, time.May, time.June, time.July, time.August, time.September:
		return "ss"
	default:
		return "ws"
	}
}
// fetchAllAvailableSportCourses downloads the sport overview page and returns
// the href of every anchor that points below /sportangebote/detail/sport/.
// The links are returned exactly as they appear in the page. If the page
// cannot be fetched, nil is returned.
func fetchAllAvailableSportCourses() []string {
	const overviewURL = "https://sport.htwk-leipzig.de/sportangebote"
	const detailPrefix = "/sportangebote/detail/sport/"

	page, err := htmlRequest(overviewURL)
	if err != nil {
		return nil
	}

	var courseLinks []string
	page.Find("a[href]").Each(func(_ int, anchor *goquery.Selection) {
		// The selector only yields anchors that carry an href attribute.
		if href, _ := anchor.Attr("href"); strings.HasPrefix(href, detailPrefix) {
			courseLinks = append(courseLinks, href)
		}
	})
	return courseLinks
}
// fetchHTWKSportCourses downloads the course pages behind the given links
// (paths relative to https://sport.htwk-leipzig.de) and parses them into sport
// entries. The downloads are spread over a fixed number of goroutines; pages
// that fail to download or to parse are skipped.
func fetchHTWKSportCourses(links []string) []model.SportEntry {
	const (
		workerCount = 10
		baseURL     = "https://sport.htwk-leipzig.de"
	)

	// One slot per link. Worker k handles the links k, k+workerCount, ... and
	// therefore only ever writes to its own slots.
	pages := make([]*goquery.Document, len(links))

	var workers sync.WaitGroup
	for worker := 0; worker < workerCount; worker++ {
		workers.Add(1)
		go func(offset int) {
			defer workers.Done()
			for idx := offset; idx < len(links); idx += workerCount {
				if page, err := htmlRequest(baseURL + links[idx]); err == nil {
					pages[idx] = page
				}
			}
		}(worker)
	}
	workers.Wait()

	var entries []model.SportEntry
	for _, page := range pages {
		if page == nil {
			continue
		}
		parsed, err := fetchHtwkSportCourse(page)
		if err != nil {
			continue
		}
		entries = append(entries, parsed...)
	}
	return entries
}
// sportHTTPClient is shared by all page requests. The timeout keeps a stalled
// server from blocking a fetch worker forever; http.Get uses a client without
// any timeout.
var sportHTTPClient = &http.Client{Timeout: 30 * time.Second}

// htmlRequest performs a GET request for url and parses the response body as
// an HTML document. It returns an error if the request fails, if the server
// answers with a status outside the 2xx range or if the body cannot be parsed.
func htmlRequest(url string) (*goquery.Document, error) {
	resp, err := sportHTTPClient.Get(url)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	// Do not parse error pages as if they were course pages.
	if resp.StatusCode < http.StatusOK || resp.StatusCode >= http.StatusMultipleChoices {
		return nil, errors.New("unexpected response status " + resp.Status + " for " + url)
	}

	doc, err := goquery.NewDocumentFromReader(resp.Body)
	if err != nil {
		return nil, err
	}
	return doc, nil
}
// fetchHtwkSportCourse extracts the sport entries from an already downloaded
// course page. Every element with class "eventHead" yields one entry; its
// details are read from the rows of the directly following
// "table.eventDetails", where the first cell of a row is the key and the last
// cell the value.
//
// If the first-level heading of the page reads "Aktuelle Sportangebote" the
// document is not treated as a course page and an error is returned.
//
// goquery is used to parse the html. The html structure is not very
// consistent, so it is hard to parse. May be improved in the future.
func fetchHtwkSportCourse(doc *goquery.Document) ([]model.SportEntry, error) {
	var events []model.SportEntry

	if doc.Find("h1").Text() == "Aktuelle Sportangebote" {
		return nil, errors.New("not a sport course page")
	}

	doc.Find(".eventHead").Each(func(_ int, head *goquery.Selection) {
		var event model.SportEntry
		var details model.EventDetails

		// The heading is split at "-": the first part is the title, and when
		// there are more than two parts the last one is the course type.
		fullTitle := strings.TrimSpace(head.Find("h3").Text())
		titleParts := strings.Split(fullTitle, "-")
		if len(titleParts) > 0 {
			event.Title = strings.TrimSpace(titleParts[0])
		}
		if len(titleParts) > 2 {
			details.Type = strings.TrimSpace(titleParts[len(titleParts)-1])
		}

		head.NextFiltered("table.eventDetails").Find("tr").Each(func(_ int, row *goquery.Selection) {
			key := strings.TrimSpace(row.Find("td").First().Text())
			value := strings.TrimSpace(row.Find("td").Last().Text())

			switch key {
			case "Zeitraum":
				dates := strings.Split(value, "-")
				if len(dates) == 2 {
					startDate, startErr := time.Parse("02.01.2006", strings.TrimSpace(dates[0]))
					endDate, endErr := time.Parse("02.01.2006", strings.TrimSpace(dates[1]))
					// Only accept a fully parsed range: a zero start combined
					// with a real end would span roughly two thousand years.
					if startErr == nil && endErr == nil {
						details.DateRange = model.DateRange{Start: startDate, End: endDate}
					}
				}
			case "Zyklus":
				details.Cycle = value
			case "Geschlecht":
				details.Gender = value
			case "Leiter":
				leaderName := strings.TrimSpace(row.Find("td a").Text())
				leadersSlice := strings.Split(leaderName, "\n")
				for idx, leader := range leadersSlice {
					leadersSlice[idx] = strings.TrimSpace(leader)
				}
				formattedLeaders := strings.Join(leadersSlice, ", ")
				// Attr reports the href of the first matched anchor; a missing
				// attribute simply leaves the link empty.
				leaderLink, _ := row.Find("td a").Attr("href")
				details.CourseLead = model.CourseLead{Name: formattedLeaders, Link: leaderLink}
			case "Ort":
				// Expected form is "Name (Address)". A value without
				// parentheses is kept as the name instead of being dropped;
				// with several parentheses the last group is the address.
				name, address := value, ""
				if open := strings.LastIndex(value, "("); open >= 0 {
					name = value[:open]
					address = strings.TrimRight(strings.TrimSpace(value[open+1:]), ")")
				}
				details.Location = model.Location{
					Name:    strings.TrimSpace(name),
					Address: address,
				}
			case "Teilnehmer":
				parts := strings.Split(value, "/")
				if len(parts) >= 3 {
					// Fields that are not numeric are stored as 0.
					bookings, _ := strconv.Atoi(strings.TrimSpace(parts[0]))
					totalPlaces, _ := strconv.Atoi(strings.TrimSpace(parts[1]))
					waitList, _ := strconv.Atoi(strings.TrimSpace(parts[2]))
					details.Participants = model.Participants{Bookings: bookings, TotalPlaces: totalPlaces, WaitList: waitList}
				}
			case "Kosten":
				details.Cost = value // makes no sense since you need to be logged in to see the price
			case "Hinweis":
				// Collect the advice headings and the bare text nodes of the
				// value cell, skipping everything else.
				var allNotes []string
				row.Find("td").Last().Contents().Each(func(_ int, node *goquery.Selection) {
					if node.Is("h4.eventAdvice") || goquery.NodeName(node) == "#text" {
						note := strings.TrimSpace(node.Text())
						if note != "" {
							allNotes = append(allNotes, note)
						}
					}
				})
				event.AdditionalNote = strings.Join(allNotes, " ")
			}
		})

		event.Details = details
		events = append(events, event)
	})

	return events, nil
}