// Package rssbot implements a Service capable of reading Atom/RSS feeds. package rssbot import ( "errors" "fmt" "html" "net/http" "strconv" "strings" "time" "unicode" "github.com/die-net/lrucache" "github.com/gregjones/httpcache" "github.com/matrix-org/go-neb/database" "github.com/matrix-org/go-neb/polling" "github.com/matrix-org/go-neb/types" "github.com/mmcdole/gofeed" "github.com/prometheus/client_golang/prometheus" log "github.com/sirupsen/logrus" mevt "maunium.net/go/mautrix/event" "maunium.net/go/mautrix/id" ) // ServiceType of the RSS Bot service const ServiceType = "rssbot" var cachingClient *http.Client var ( pollCounter = prometheus.NewCounterVec(prometheus.CounterOpts{ Name: "goneb_rss_polls_total", Help: "The number of feed polls from RSS services", }, []string{"http_status"}) ) const minPollingIntervalSeconds = 60 * 5 // 5 min (News feeds can be genuinely spammy) // includeRules contains the rules for including or excluding a feed item. For the fields Author, Title // and Description in a feed item, there can be some words specified in the config that determine whether // the item will be displayed or not, depending on whether these words are included in that field. // // - If specified in the `must_include` field, the feed item must include at least one word for each field // that has been specified. This means that if some words have been specified for both Author and Title, // both the Author and Title must contain at least one of their respective words or the item will be skipped. // - If specified in the `must_not_include` field, the feed item fields must not contain any of the words // that were specified for each field. This means that if some words have been specified for both Author // and Title, if either of them includes at least one of their respective words, the item will be skipped, // even in the case that the item matched the `must_include` rules. // // In both cases, specifying an empty list for a field or not specifying anything causes the field to be ignored. // The field being checked each time will be split into words (any non-alphanumeric character starts a new word) // and they will be checked against the provided list. type includeRules struct { // Author is a case-sensitive list of words that the author name must contain or not contain. Author []string `json:"author"` // Title is a case-sensitive list of words that the author name must contain or not contain. Title []string `json:"title"` // Description is a case-sensitive list of words that the author name must contain or not contain. Description []string `json:"description"` } // Service contains the Config fields for this service. // // Example request: // { // feeds: { // "http://rss.cnn.com/rss/edition.rss": { // poll_interval_mins: 60, // rooms: ["!cBrPbzWazCtlkMNQSF:localhost"] // }, // "https://www.wired.com/feed/": { // rooms: ["!qmElAGdFYCHoCJuaNt:localhost"] // } // } // } type Service struct { types.DefaultService // Feeds is a map of feed URL to configuration options for this feed. Feeds map[string]struct { // Optional. The time to wait between polls. If this is less than minPollingIntervalSeconds, it is ignored. PollIntervalMins int `json:"poll_interval_mins"` // The list of rooms to send feed updates into. This cannot be empty. Rooms []id.RoomID `json:"rooms"` // True if rss bot is unable to poll this feed. This is populated by Go-NEB. Use /getService to // retrieve this value. IsFailing bool `json:"is_failing"` // The time of the last successful poll. This is populated by Go-NEB. Use /getService to retrieve // this value. FeedUpdatedTimestampSecs int64 `json:"last_updated_ts_secs"` // Specified fields must each include at least one of these words. MustInclude includeRules `json:"must_include"` // None of the specified fields must include any of these words. MustNotInclude includeRules `json:"must_not_include"` // Internal field. When we should poll again. NextPollTimestampSecs int64 // Internal field. The most recently seen GUIDs. Sized to the number of items in the feed. RecentGUIDs []string } `json:"feeds"` } // Register will check the liveness of each RSS feed given. If all feeds check out okay, no error is returned. func (s *Service) Register(oldService types.Service, client types.MatrixClient) error { if len(s.Feeds) == 0 { // this is an error UNLESS the old service had some feeds in which case they are deleting us :( var numOldFeeds int oldFeedService, ok := oldService.(*Service) if !ok { log.WithField("service", oldService).Error("Old service isn't an rssbot.Service") } else { numOldFeeds = len(oldFeedService.Feeds) } if numOldFeeds == 0 { return errors.New("An RSS feed must be specified") } return nil } // Make sure we can parse the feed for feedURL, feedInfo := range s.Feeds { if _, err := readFeed(feedURL); err != nil { return fmt.Errorf("Failed to read URL %s: %s", feedURL, err.Error()) } if len(feedInfo.Rooms) == 0 { return fmt.Errorf("Feed %s has no rooms to send updates to", feedURL) } } s.joinRooms(client) return nil } func (s *Service) joinRooms(client types.MatrixClient) { roomSet := make(map[id.RoomID]bool) for _, feedInfo := range s.Feeds { for _, roomID := range feedInfo.Rooms { roomSet[roomID] = true } } for roomID := range roomSet { if _, err := client.JoinRoom(roomID.String(), "", nil); err != nil { log.WithFields(log.Fields{ log.ErrorKey: err, "room_id": roomID, }).Error("Failed to join room") } } } // PostRegister deletes this service if there are no feeds remaining. func (s *Service) PostRegister(oldService types.Service) { if len(s.Feeds) == 0 { // bye-bye :( logger := log.WithFields(log.Fields{ "service_id": s.ServiceID(), "service_type": s.ServiceType(), }) logger.Info("Deleting service: No feeds remaining.") polling.StopPolling(s) if err := database.GetServiceDB().DeleteService(s.ServiceID()); err != nil { logger.WithError(err).Error("Failed to delete service") } } } // OnPoll rechecks RSS feeds which are due to be polled. // // In order for a feed to be polled, the current time must be greater than NextPollTimestampSecs. // In order for an item on a feed to be sent to Matrix, the item's GUID must not exist in RecentGUIDs. // The GUID for an item is created according to the following rules: // - If there is a GUID field, use it. // - Else if there is a Link field, use it as the GUID. // - Else if there is a Title field, use it as the GUID. // // Returns a timestamp representing when this Service should have OnPoll called again. func (s *Service) OnPoll(cli types.MatrixClient) time.Time { logger := log.WithFields(log.Fields{ "service_id": s.ServiceID(), "service_type": s.ServiceType(), }) now := time.Now().Unix() // Second resolution // Work out which feeds should be polled var pollFeeds []string for u, feedInfo := range s.Feeds { if feedInfo.NextPollTimestampSecs == 0 || now >= feedInfo.NextPollTimestampSecs { // re-query this feed pollFeeds = append(pollFeeds, u) } } if len(pollFeeds) == 0 { return s.nextTimestamp() } // Query each feed and send new items to subscribed rooms for _, u := range pollFeeds { feed, items, err := s.queryFeed(u) if err != nil { logger.WithField("feed_url", u).WithError(err).Error("Failed to query feed") incrementMetrics(u, err) continue } incrementMetrics(u, nil) logger.WithFields(log.Fields{ "feed_url": u, "feed_items": len(feed.Items), "new_items": len(items), }).Info("Sending new items") // Loop backwards since [0] is the most recent and we want to send in chronological order for i := len(items) - 1; i >= 0; i-- { item := items[i] if err := s.sendToRooms(cli, u, feed, item); err != nil { logger.WithFields(log.Fields{ "feed_url": u, log.ErrorKey: err, "item": item, }).Error("Failed to send item to room due to 429; aborting further sends") // no point continuing if we errored due to a 429 - we'll just hit the rate limit // again and again break } } } // Persist the service to save the next poll times if _, err := database.GetServiceDB().StoreService(s); err != nil { logger.WithError(err).Error("Failed to persist next poll times for service") } return s.nextTimestamp() } func incrementMetrics(urlStr string, err error) { if err != nil { herr, ok := err.(gofeed.HTTPError) statusCode := 0 // e.g. network timeout if ok { statusCode = herr.StatusCode } pollCounter.With(prometheus.Labels{"http_status": strconv.Itoa(statusCode)}).Inc() } else { pollCounter.With(prometheus.Labels{"http_status": "200"}).Inc() // technically 2xx but gofeed doesn't tell us which } } func (s *Service) nextTimestamp() time.Time { // return the earliest next poll ts var earliestNextTs int64 for _, feedInfo := range s.Feeds { if earliestNextTs == 0 || feedInfo.NextPollTimestampSecs < earliestNextTs { earliestNextTs = feedInfo.NextPollTimestampSecs } } // Don't allow times in the past. Set a min re-poll threshold of 60s to avoid // tight-looping on feeds which 500. now := time.Now().Unix() if earliestNextTs <= now { earliestNextTs = now + 60 } return time.Unix(earliestNextTs, 0) } // Query the given feed, update relevant timestamps and return NEW items func (s *Service) queryFeed(feedURL string) (*gofeed.Feed, []gofeed.Item, error) { log.WithField("feed_url", feedURL).Info("Querying feed") var items []gofeed.Item feed, err := readFeed(feedURL) // check for no items in addition to any returned errors as it appears some RSS feeds // do not consistently return items. if err == nil && len(feed.Items) == 0 { err = errors.New("feed has 0 items") } if err != nil { f := s.Feeds[feedURL] f.IsFailing = true s.Feeds[feedURL] = f return nil, items, err } // Patch up the item list: make sure each item has a GUID. ensureItemsHaveGUIDs(feed) // Work out which items are new, if any (based on the last updated TS we have) // If the TS is 0 then this is the first ever poll, so let's not send 10s of events // into the room and just do new ones from this point onwards. if s.Feeds[feedURL].NextPollTimestampSecs != 0 { items = s.newItems(feedURL, feed.Items) } now := time.Now().Unix() // Second resolution // Work out when to next poll this feed nextPollTsSec := now + minPollingIntervalSeconds if s.Feeds[feedURL].PollIntervalMins > int(minPollingIntervalSeconds/60) { nextPollTsSec = now + int64(s.Feeds[feedURL].PollIntervalMins*60) } // TODO: Handle the 'sy' Syndication extension to control update interval. // See http://www.feedforall.com/syndication.htm and http://web.resource.org/rss/1.0/modules/syndication/ // Work out which GUIDs to remember. We don't want to remember every GUID ever as that leads to completely // unbounded growth of data. f := s.Feeds[feedURL] // Some RSS feeds can return a very small number of items then bounce // back to their "normal" size, so we cannot just clobber the recent GUID list per request or else we'll // forget what we sent and resend it. Instead, we'll keep 2x the max number of items that we've ever // seen from this feed, up to a max of 10,000. maxGuids := 2 * len(feed.Items) if len(f.RecentGUIDs) > maxGuids { maxGuids = len(f.RecentGUIDs) // already 2x'd. } if maxGuids > 10000 { maxGuids = 10000 } lastSet := uniqueStrings(f.RecentGUIDs) // e.g. [4,5,6] thisSet := uniqueGuids(feed.Items) // e.g. [1,2,3] guids := append(thisSet, lastSet...) // e.g. [1,2,3,4,5,6] guids = uniqueStrings(guids) if len(guids) > maxGuids { // Critically this favours the NEWEST elements, which are the ones we're most likely to see again. guids = guids[0:maxGuids] } // Update the service config to persist the new times f.NextPollTimestampSecs = nextPollTsSec f.FeedUpdatedTimestampSecs = now f.RecentGUIDs = guids f.IsFailing = false s.Feeds[feedURL] = f return feed, items, nil } // containsAny takes a string and an array of words and returns whether any of the words // in the list are contained in the string. The words in the string are considered to be // separated by any non-alphanumeric character. func containsAny(item string, filterWords []string) bool { itemWords := strings.FieldsFunc(item, func(c rune) bool { return !unicode.IsLetter(c) && !unicode.IsNumber(c) }) for _, itemWord := range itemWords { for _, filterWord := range filterWords { if filterWord == itemWord { return true } } } return false } func itemFiltered(i *gofeed.Item, mustInclude, mustNotInclude *includeRules) bool { // At least one word for each field that has been specified must be included for an item to pass the filter. if (i.Author != nil && len(mustInclude.Author) > 0 && !containsAny(i.Author.Name, mustInclude.Author)) || (len(mustInclude.Title) > 0 && !containsAny(i.Title, mustInclude.Title)) || (len(mustInclude.Description) > 0 && !containsAny(i.Description, mustInclude.Description)) { return true } // If at least one word of any field that has been specified is included in the item, it doesn't pass the filter. if (i.Author != nil && containsAny(i.Author.Name, mustNotInclude.Author)) || containsAny(i.Title, mustNotInclude.Title) || containsAny(i.Description, mustNotInclude.Description) { return true } return false } func (s *Service) newItems(feedURL string, allItems []*gofeed.Item) (items []gofeed.Item) { mustInclude := s.Feeds[feedURL].MustInclude mustNotInclude := s.Feeds[feedURL].MustNotInclude for _, i := range allItems { if i == nil { continue } // if we've seen this guid before, we've sent it before seenBefore := false for _, guid := range s.Feeds[feedURL].RecentGUIDs { if guid == i.GUID { seenBefore = true break } } if seenBefore { continue } // Decode HTML for