diff --git a/src/github.com/matrix-org/go-neb/services/rssbot/rssbot.go b/src/github.com/matrix-org/go-neb/services/rssbot/rssbot.go index 7dc3312..ccc9885 100644 --- a/src/github.com/matrix-org/go-neb/services/rssbot/rssbot.go +++ b/src/github.com/matrix-org/go-neb/services/rssbot/rssbot.go @@ -256,17 +256,7 @@ func (s *Service) queryFeed(feedURL string) (*gofeed.Feed, []gofeed.Item, error) } // Patch up the item list: make sure each item has a GUID. - for idx := 0; idx < len(feed.Items); idx++ { - itm := feed.Items[idx] - if itm.GUID == "" { - if itm.Link != "" { - itm.GUID = itm.Link - } else if itm.Title != "" { - itm.GUID = itm.Title - } - feed.Items[idx] = itm - } - } + ensureItemsHaveGUIDs(feed) // Work out which items are new, if any (based on the last updated TS we have) // If the TS is 0 then this is the first ever poll, so let's not send 10s of events @@ -285,20 +275,28 @@ func (s *Service) queryFeed(feedURL string) (*gofeed.Feed, []gofeed.Item, error) // TODO: Handle the 'sy' Syndication extension to control update interval. // See http://www.feedforall.com/syndication.htm and http://web.resource.org/rss/1.0/modules/syndication/ - // map items to guid strings - var guids []string - for _, itm := range feed.Items { - guids = append(guids, itm.GUID) - } - + // Work out which GUIDs to remember. We don't want to remember every GUID ever as that leads to completely + // unbounded growth of data. f := s.Feeds[feedURL] + // Some RSS feeds can return a very small number of items then bounce + // back to their "normal" size, so we cannot just clobber the recent GUID list per request or else we'll + // forget what we sent and resend it. Instead, we'll keep 2x the max number of items that we've ever + // seen from this feed, up to a max of 1000. + maxGuids := 2 * len(feed.Items) + if len(f.RecentGUIDs) > maxGuids { + maxGuids = len(f.RecentGUIDs) // already 2x'd. + } + if maxGuids > 1000 { + maxGuids = 1000 + } - if len(guids) != len(f.RecentGUIDs) { - log.WithFields(log.Fields{ - "new_guids": guids, - "old_guids": f.RecentGUIDs, - "feed_url": feedURL, - }).Warn("GUID length mismatch") + lastSet := uniqueStrings(f.RecentGUIDs) // e.g. [4,5,6] + thisSet := uniqueGuids(feed.Items) // e.g. [1,2,3] + guids := append(thisSet, lastSet...) // e.g. [1,2,3,4,5,6] + guids = uniqueStrings(guids) + if len(guids) > maxGuids { + // Critically this favours the NEWEST elements, which are the ones we're most likely to see again. + guids = guids[0:maxGuids] } // Update the service config to persist the new times @@ -343,8 +341,12 @@ func (s *Service) newItems(feedURL string, allItems []*gofeed.Item) (items []gof } func (s *Service) sendToRooms(cli *matrix.Client, feedURL string, feed *gofeed.Feed, item gofeed.Item) error { - logger := log.WithField("feed_url", feedURL).WithField("title", item.Title) - logger.Info("New feed item") + logger := log.WithFields(log.Fields{ + "feed_url": feedURL, + "title": item.Title, + "guid": item.GUID, + }) + logger.Info("Sending new feed item") for _, roomID := range s.Feeds[feedURL].Rooms { if _, err := cli.SendMessageEvent(roomID, "m.room.message", itemToHTML(feed, item)); err != nil { logger.WithError(err).WithField("room_id", roomID).Error("Failed to send to room") @@ -361,6 +363,50 @@ func itemToHTML(feed *gofeed.Feed, item gofeed.Item) matrix.HTMLMessage { )) } +func ensureItemsHaveGUIDs(feed *gofeed.Feed) { + for idx := 0; idx < len(feed.Items); idx++ { + itm := feed.Items[idx] + if itm.GUID == "" { + if itm.Link != "" { + itm.GUID = itm.Link + } else if itm.Title != "" { + itm.GUID = itm.Title + } + feed.Items[idx] = itm + } + } +} + +// uniqueStrings returns a new slice of strings with duplicate elements removed. +// Order is otherwise preserved. +func uniqueStrings(a []string) []string { + ret := []string{} + seen := make(map[string]bool) + for _, str := range a { + if seen[str] { + continue + } + seen[str] = true + ret = append(ret, str) + } + return ret +} + +// uniqueGuids returns a new slice of GUID strings with duplicate elements removed. +// Order is otherwise preserved. +func uniqueGuids(a []*gofeed.Item) []string { + ret := []string{} + seen := make(map[string]bool) + for _, item := range a { + if seen[item.GUID] { + continue + } + seen[item.GUID] = true + ret = append(ret, item.GUID) + } + return ret +} + type userAgentRoundTripper struct { Transport http.RoundTripper }