From 332b451d7725d7c40b35e0a5852739b9eacac87e Mon Sep 17 00:00:00 2001 From: Kegan Dougal Date: Tue, 29 Nov 2016 13:45:55 +0000 Subject: [PATCH 1/2] Redo how recent GUIDs are calculated This is an attempt to fix #133. Previously, we just clobbered the recent GUIDs with the lastest response every single time, assuming that Atom/RSS feeds would consistently return the same items. This appears to not be the case. In the wild, the number of items returned on a single request can vary (sometimes even being 1 or 2 when usually it is 50!). This patch alters *how many* and *which* GUIDs we keep between requests, in an attempt to prevent sending old news for buggy RSS feeds. --- .../go-neb/services/rssbot/rssbot.go | 97 ++++++++++++++----- 1 file changed, 71 insertions(+), 26 deletions(-) diff --git a/src/github.com/matrix-org/go-neb/services/rssbot/rssbot.go b/src/github.com/matrix-org/go-neb/services/rssbot/rssbot.go index b9c32d4..7844748 100644 --- a/src/github.com/matrix-org/go-neb/services/rssbot/rssbot.go +++ b/src/github.com/matrix-org/go-neb/services/rssbot/rssbot.go @@ -260,17 +260,7 @@ func (s *Service) queryFeed(feedURL string) (*gofeed.Feed, []gofeed.Item, error) } // Patch up the item list: make sure each item has a GUID. - for idx := 0; idx < len(feed.Items); idx++ { - itm := feed.Items[idx] - if itm.GUID == "" { - if itm.Link != "" { - itm.GUID = itm.Link - } else if itm.Title != "" { - itm.GUID = itm.Title - } - feed.Items[idx] = itm - } - } + ensureItemsHaveGUIDs(feed) // Work out which items are new, if any (based on the last updated TS we have) // If the TS is 0 then this is the first ever poll, so let's not send 10s of events @@ -289,26 +279,33 @@ func (s *Service) queryFeed(feedURL string) (*gofeed.Feed, []gofeed.Item, error) // TODO: Handle the 'sy' Syndication extension to control update interval. // See http://www.feedforall.com/syndication.htm and http://web.resource.org/rss/1.0/modules/syndication/ - // map items to guid strings - var guids []string - for _, itm := range feed.Items { - guids = append(guids, itm.GUID) - } - + // Work out which GUIDs to remember. We don't want to remember every GUID ever as that leads to completely + // unbounded growth of data. f := s.Feeds[feedURL] + // Some RSS feeds can return a very small number of items then bounce + // back to their "normal" size, so we cannot just clobber the recent GUID list per request or else we'll + // forget what we sent and resend it. Instead, we'll keep 2x the max number of items that we've ever + // seen from this feed, up to a max of 1000. + maxGuids := 2 * len(feed.Items) + if len(f.RecentGUIDs) > maxGuids { + maxGuids = len(f.RecentGUIDs) // already 2x'd. + } + if maxGuids > 1000 { + maxGuids = 1000 + } - if len(guids) != len(f.RecentGUIDs) { - log.WithFields(log.Fields{ - "new_guids": guids, - "old_guids": f.RecentGUIDs, - "feed_url": feedURL, - }).Warn("GUID length mismatch") + lastSet := uniqueStrings(f.RecentGUIDs) // e.g. [4,5,6] + thisSet := uniqueGuids(feed.Items) // e.g. [1,2,3] + guids := append(thisSet, lastSet...) // e.g. [1,2,3,4,5,6] + if len(guids) > maxGuids { + // Critically this favours the NEWEST elements, which are the ones we're most likely to see again. + guids = guids[0:maxGuids] } // Update the service config to persist the new times f.NextPollTimestampSecs = nextPollTsSec f.FeedUpdatedTimestampSecs = now - f.RecentGUIDs = guids + f.RecentGUIDs = uniqueStrings(guids) f.IsFailing = false s.Feeds[feedURL] = f @@ -347,8 +344,12 @@ func (s *Service) newItems(feedURL string, allItems []*gofeed.Item) (items []gof } func (s *Service) sendToRooms(cli *matrix.Client, feedURL string, feed *gofeed.Feed, item gofeed.Item) error { - logger := log.WithField("feed_url", feedURL).WithField("title", item.Title) - logger.Info("New feed item") + logger := log.WithFields(log.Fields{ + "feed_url": feedURL, + "title": item.Title, + "guid": item.GUID, + }) + logger.Info("Sending new feed item") for _, roomID := range s.Feeds[feedURL].Rooms { if _, err := cli.SendMessageEvent(roomID, "m.room.message", itemToHTML(feed, item)); err != nil { logger.WithError(err).WithField("room_id", roomID).Error("Failed to send to room") @@ -365,6 +366,50 @@ func itemToHTML(feed *gofeed.Feed, item gofeed.Item) matrix.HTMLMessage { )) } +func ensureItemsHaveGUIDs(feed *gofeed.Feed) { + for idx := 0; idx < len(feed.Items); idx++ { + itm := feed.Items[idx] + if itm.GUID == "" { + if itm.Link != "" { + itm.GUID = itm.Link + } else if itm.Title != "" { + itm.GUID = itm.Title + } + feed.Items[idx] = itm + } + } +} + +// uniqueStrings returns a new slice of strings with duplicate elements removed. +// Order is otherwise preserved. +func uniqueStrings(a []string) []string { + ret := []string{} + seen := make(map[string]bool) + for _, str := range a { + if seen[str] { + continue + } + seen[str] = true + ret = append(ret, str) + } + return ret +} + +// uniqueGuids returns a new slice of GUID strings with duplicate elements removed. +// Order is otherwise preserved. +func uniqueGuids(a []*gofeed.Item) []string { + ret := []string{} + seen := make(map[string]bool) + for _, item := range a { + if seen[item.GUID] { + continue + } + seen[item.GUID] = true + ret = append(ret, item.GUID) + } + return ret +} + type userAgentRoundTripper struct { Transport http.RoundTripper } From c4e98238d8b72183a3d8f8b54be9cb811660f527 Mon Sep 17 00:00:00 2001 From: Kegan Dougal Date: Tue, 29 Nov 2016 14:41:14 +0000 Subject: [PATCH 2/2] uniq prior to length checks for accuracy --- src/github.com/matrix-org/go-neb/services/rssbot/rssbot.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/github.com/matrix-org/go-neb/services/rssbot/rssbot.go b/src/github.com/matrix-org/go-neb/services/rssbot/rssbot.go index 7844748..ae4fb3e 100644 --- a/src/github.com/matrix-org/go-neb/services/rssbot/rssbot.go +++ b/src/github.com/matrix-org/go-neb/services/rssbot/rssbot.go @@ -297,6 +297,7 @@ func (s *Service) queryFeed(feedURL string) (*gofeed.Feed, []gofeed.Item, error) lastSet := uniqueStrings(f.RecentGUIDs) // e.g. [4,5,6] thisSet := uniqueGuids(feed.Items) // e.g. [1,2,3] guids := append(thisSet, lastSet...) // e.g. [1,2,3,4,5,6] + guids = uniqueStrings(guids) if len(guids) > maxGuids { // Critically this favours the NEWEST elements, which are the ones we're most likely to see again. guids = guids[0:maxGuids] @@ -305,7 +306,7 @@ func (s *Service) queryFeed(feedURL string) (*gofeed.Feed, []gofeed.Item, error) // Update the service config to persist the new times f.NextPollTimestampSecs = nextPollTsSec f.FeedUpdatedTimestampSecs = now - f.RecentGUIDs = uniqueStrings(guids) + f.RecentGUIDs = guids f.IsFailing = false s.Feeds[feedURL] = f