package main import ( "errors" "github.com/google/uuid" "github.com/pocketbase/pocketbase/tools/types" "htwkalender/model" "net/http" "regexp" "strconv" "strings" "sync" "time" "github.com/PuerkitoBio/goquery" ) // @TODO: fix bug where cycle contains multiple days (e.g. "Mo + Mi 18:00-20:00") // @TODO: add tests // @TODO: add it to the service // @TODO: make it like a cron job to fetch the sport courses once a week func main() { var sportCourseLinks = fetchAllAvailableSportCourses() sportEntries := fetchHTWKSportCourses(sportCourseLinks) for _, event := range sportEntries { println(event.Title) } events := formatEntriesToEvents(sportEntries) for _, event := range events { println(event.Name) } } func formatEntriesToEvents(entries []SportEntry) []model.Event { var events []model.Event for i, entry := range entries { eventStarts, eventEnds := calculateEventStarts(entry.Details.DateRange.Start, entry.Details.DateRange.End, entry.Details.Cycle) for j := range eventStarts { start, _ := types.ParseDateTime(eventStarts[j].In(time.UTC)) end, _ := types.ParseDateTime(eventEnds[j].In(time.UTC)) var event = model.Event{ UUID: uuid.NewSHA1(uuid.NameSpaceDNS, []byte(entry.Title+strconv.FormatInt(int64(i), 10))).String(), Day: entry.Details.DateRange.Start.Weekday().String(), Week: strconv.Itoa(23), Start: start, End: end, Name: entry.Title, EventType: entry.Details.Type, Prof: entry.Details.CourseLead.Name, Rooms: entry.Details.Location.Name, Notes: entry.AdditionalNote, BookedAt: "", Course: "Sport", Semester: checkSemester(entry.Details.DateRange.Start), } events = append(events, event) } } return events } func calculateEventStarts(start time.Time, end time.Time, cycle string) ([]time.Time, []time.Time) { // start is the begin of the cycle e.g. 01.04.2020 // end is the end of the cycle e.g. 30.09.2020 // cycle is the day and timespan (e.g. "Mo 18:00-20:00") // check if start is before end if start.After(end) { return nil, nil } // check if cycle is valid if !checkCycle(cycle) { return nil, nil } var weekDay = cycle[0:2] // match weekday to time.Weekday (e.g. "Mo" -> time.Monday) var weekDayInt int switch weekDay { case "Mo": weekDayInt = 1 case "Di": weekDayInt = 2 case "Mi": weekDayInt = 3 case "Do": weekDayInt = 4 case "Fr": weekDayInt = 5 case "Sa": weekDayInt = 6 case "So": weekDayInt = 0 } // get every date matching the weekday in the cycle between start and end var eventDates []time.Time for d := start; d.Before(end); d = d.AddDate(0, 0, 1) { if d.Weekday() == time.Weekday(weekDayInt) { eventDates = append(eventDates, d) } } // add hours and minutes to the dates in eventDates // array of tuple of start and end times var eventStartsWithTime []time.Time var eventEndWithTime []time.Time for _, eventStart := range eventDates { timeRegExp, _ := regexp.Compile("[0-9]{2}:[0-9]{2}") times := timeRegExp.FindAllString(cycle, 2) startHour, _ := strconv.Atoi(times[0][0:2]) startMinute, _ := strconv.Atoi(times[0][3:5]) endHour, _ := strconv.Atoi(times[1][0:2]) endMinute, _ := strconv.Atoi(times[1][3:5]) eventStartsWithTime = append(eventStartsWithTime, time.Date(eventStart.Year(), eventStart.Month(), eventStart.Day(), startHour, startMinute, 0, 0, eventStart.Location())) eventEndWithTime = append(eventEndWithTime, time.Date(eventStart.Year(), eventStart.Month(), eventStart.Day(), endHour, endMinute, 0, 0, eventStart.Location())) } return eventStartsWithTime, eventEndWithTime } func checkCycle(cycle string) bool { // check if cycle is valid if len(cycle) < 12 { return false } // check if cycle has a weekday weekDay := cycle[0:2] if weekDay != "Mo" && weekDay != "Di" && weekDay != "Mi" && weekDay != "Do" && weekDay != "Fr" && weekDay != "Sa" && weekDay != "So" { return false } // check if cycle has a timespan timeSpan := cycle[3:12] if len(timeSpan) != 9 { return false } // check if timespan has a start and end time startTime := timeSpan[0:5] endTime := timeSpan[6:9] if len(startTime) != 5 || len(endTime) != 3 { return false } // check if start time is before end time if startTime > endTime { return false } return true } // check if ws or ss func checkSemester(date time.Time) string { if date.Month() >= 4 && date.Month() <= 9 { return "ss" } else { return "ws" } } // fetch the main page where all sport courses are listed and extract all links to the sport courses func fetchAllAvailableSportCourses() []string { var url = "https://sport.htwk-leipzig.de/sportangebote" var doc, err = htmlRequest(url) if err != nil { return nil } // link list of all sport courses var links []string // find all links to sport courses with regex https://sport.htwk-leipzig.de/sportangebote/detail/sport/ + [0-9]{1,4} doc.Find("a[href]").Each(func(i int, s *goquery.Selection) { link, _ := s.Attr("href") if strings.HasPrefix(link, "/sportangebote/detail/sport/") { links = append(links, link) } }) return links } // fetchAllHTWKSportCourses fetches all sport courses from the given links. // to speed up the process, it uses multithreading. func fetchHTWKSportCourses(links []string) []SportEntry { //multithreaded webpage requests to speed up the process var maxThreads = 10 var htmlPageArray = make([]*goquery.Document, len(links)) var hostUrl = "https://sport.htwk-leipzig.de" var wg sync.WaitGroup wg.Add(maxThreads) for i := 0; i < maxThreads; i++ { go func(i int) { for j := i; j < len(links); j += maxThreads { doc, err := htmlRequest(hostUrl + links[j]) if err == nil { htmlPageArray[j] = doc } } wg.Done() }(i) } wg.Wait() var events []SportEntry for _, doc := range htmlPageArray { if doc != nil { event, err := fetchHtwkSportCourse(doc) if err == nil { events = append(events, event...) } } } return events } func htmlRequest(url string) (*goquery.Document, error) { println("fetching " + url) resp, err := http.Get(url) if err != nil { return nil, err } defer resp.Body.Close() doc, err := goquery.NewDocumentFromReader(resp.Body) if err != nil { return nil, err } println("finished fetching " + url) return doc, nil } // fetchHtwkSportCourse fetches the sport course from the given url and id. // If the sport course does not exist, it will return an error. // If the sport course exists, it will return the sport course. // goquery is used to parse the html. The html structure is not very consistent, so it is hard to parse. // May be improved in the future. func fetchHtwkSportCourse(doc *goquery.Document) ([]SportEntry, error) { var events []SportEntry if doc.Find("h1").Text() == "Aktuelle Sportangebote" { return nil, errors.New("not a sport course page") } doc.Find(".eventHead").Each(func(i int, s *goquery.Selection) { var event SportEntry var details EventDetails fullTitle := strings.TrimSpace(s.Find("h3").Text()) titleParts := strings.Split(fullTitle, "-") if len(titleParts) > 0 { event.Title = strings.TrimSpace(titleParts[0]) } if len(titleParts) > 2 { details.Type = strings.TrimSpace(titleParts[len(titleParts)-1]) } s.NextFiltered("table.eventDetails").Find("tr").Each(func(i int, s *goquery.Selection) { key := strings.TrimSpace(s.Find("td").First().Text()) value := strings.TrimSpace(s.Find("td").Last().Text()) switch key { case "Zeitraum": dates := strings.Split(value, "-") if len(dates) == 2 { startDate, _ := time.Parse("02.01.2006", strings.TrimSpace(dates[0])) endDate, _ := time.Parse("02.01.2006", strings.TrimSpace(dates[1])) details.DateRange = DateRange{Start: startDate, End: endDate} } case "Zyklus": details.Cycle = value case "Geschlecht": details.Gender = value case "Leiter": leaderName := strings.TrimSpace(s.Find("td a").Text()) leadersSlice := strings.Split(leaderName, "\n") for i, leader := range leadersSlice { leadersSlice[i] = strings.TrimSpace(leader) } formattedLeaders := strings.Join(leadersSlice, ", ") leaderLink, _ := s.Find("td a").Attr("href") details.CourseLead = CourseLead{Name: formattedLeaders, Link: leaderLink} case "Ort": locationDetails := strings.Split(value, "(") if len(locationDetails) == 2 { details.Location = Location{ Name: strings.TrimSpace(locationDetails[0]), Address: strings.TrimRight(strings.TrimSpace(locationDetails[1]), ")"), } } case "Teilnehmer": parts := strings.Split(value, "/") if len(parts) >= 3 { bookings, _ := strconv.Atoi(strings.TrimSpace(parts[0])) totalPlaces, _ := strconv.Atoi(strings.TrimSpace(parts[1])) waitList, _ := strconv.Atoi(strings.TrimSpace(parts[2])) details.Participants = Participants{Bookings: bookings, TotalPlaces: totalPlaces, WaitList: waitList} } case "Kosten": details.Cost = value // makes no sense since you need to be logged in to see the price case "Hinweis": var allNotes []string s.Find("td").Last().Contents().Each(func(i int, s *goquery.Selection) { if s.Is("h4.eventAdvice") || goquery.NodeName(s) == "#text" { note := strings.TrimSpace(s.Text()) if note != "" { allNotes = append(allNotes, note) } } }) event.AdditionalNote = strings.Join(allNotes, " ") } }) event.Details = details events = append(events, event) }) return events, nil }