Browse Source

Redo how recent GUIDs are calculated

This is an attempt to fix #133.

Previously, we just clobbered the recent GUIDs with the lastest response
every single time, assuming that Atom/RSS feeds would consistently return the
same items. This appears to not be the case. In the wild, the number of items
returned on a single request can vary (sometimes even being 1 or 2 when usually
it is 50!).

This patch alters *how many* and *which* GUIDs we keep between requests, in an
attempt to prevent sending old news for buggy RSS feeds.
pull/135/head
Kegan Dougal 8 years ago
parent
commit
332b451d77
  1. 97
      src/github.com/matrix-org/go-neb/services/rssbot/rssbot.go

97
src/github.com/matrix-org/go-neb/services/rssbot/rssbot.go

@ -260,17 +260,7 @@ func (s *Service) queryFeed(feedURL string) (*gofeed.Feed, []gofeed.Item, error)
} }
// Patch up the item list: make sure each item has a GUID. // Patch up the item list: make sure each item has a GUID.
for idx := 0; idx < len(feed.Items); idx++ {
itm := feed.Items[idx]
if itm.GUID == "" {
if itm.Link != "" {
itm.GUID = itm.Link
} else if itm.Title != "" {
itm.GUID = itm.Title
}
feed.Items[idx] = itm
}
}
ensureItemsHaveGUIDs(feed)
// Work out which items are new, if any (based on the last updated TS we have) // Work out which items are new, if any (based on the last updated TS we have)
// If the TS is 0 then this is the first ever poll, so let's not send 10s of events // If the TS is 0 then this is the first ever poll, so let's not send 10s of events
@ -289,26 +279,33 @@ func (s *Service) queryFeed(feedURL string) (*gofeed.Feed, []gofeed.Item, error)
// TODO: Handle the 'sy' Syndication extension to control update interval. // TODO: Handle the 'sy' Syndication extension to control update interval.
// See http://www.feedforall.com/syndication.htm and http://web.resource.org/rss/1.0/modules/syndication/ // See http://www.feedforall.com/syndication.htm and http://web.resource.org/rss/1.0/modules/syndication/
// map items to guid strings
var guids []string
for _, itm := range feed.Items {
guids = append(guids, itm.GUID)
}
// Work out which GUIDs to remember. We don't want to remember every GUID ever as that leads to completely
// unbounded growth of data.
f := s.Feeds[feedURL] f := s.Feeds[feedURL]
// Some RSS feeds can return a very small number of items then bounce
// back to their "normal" size, so we cannot just clobber the recent GUID list per request or else we'll
// forget what we sent and resend it. Instead, we'll keep 2x the max number of items that we've ever
// seen from this feed, up to a max of 1000.
maxGuids := 2 * len(feed.Items)
if len(f.RecentGUIDs) > maxGuids {
maxGuids = len(f.RecentGUIDs) // already 2x'd.
}
if maxGuids > 1000 {
maxGuids = 1000
}
if len(guids) != len(f.RecentGUIDs) {
log.WithFields(log.Fields{
"new_guids": guids,
"old_guids": f.RecentGUIDs,
"feed_url": feedURL,
}).Warn("GUID length mismatch")
lastSet := uniqueStrings(f.RecentGUIDs) // e.g. [4,5,6]
thisSet := uniqueGuids(feed.Items) // e.g. [1,2,3]
guids := append(thisSet, lastSet...) // e.g. [1,2,3,4,5,6]
if len(guids) > maxGuids {
// Critically this favours the NEWEST elements, which are the ones we're most likely to see again.
guids = guids[0:maxGuids]
} }
// Update the service config to persist the new times // Update the service config to persist the new times
f.NextPollTimestampSecs = nextPollTsSec f.NextPollTimestampSecs = nextPollTsSec
f.FeedUpdatedTimestampSecs = now f.FeedUpdatedTimestampSecs = now
f.RecentGUIDs = guids
f.RecentGUIDs = uniqueStrings(guids)
f.IsFailing = false f.IsFailing = false
s.Feeds[feedURL] = f s.Feeds[feedURL] = f
@ -347,8 +344,12 @@ func (s *Service) newItems(feedURL string, allItems []*gofeed.Item) (items []gof
} }
func (s *Service) sendToRooms(cli *matrix.Client, feedURL string, feed *gofeed.Feed, item gofeed.Item) error { func (s *Service) sendToRooms(cli *matrix.Client, feedURL string, feed *gofeed.Feed, item gofeed.Item) error {
logger := log.WithField("feed_url", feedURL).WithField("title", item.Title)
logger.Info("New feed item")
logger := log.WithFields(log.Fields{
"feed_url": feedURL,
"title": item.Title,
"guid": item.GUID,
})
logger.Info("Sending new feed item")
for _, roomID := range s.Feeds[feedURL].Rooms { for _, roomID := range s.Feeds[feedURL].Rooms {
if _, err := cli.SendMessageEvent(roomID, "m.room.message", itemToHTML(feed, item)); err != nil { if _, err := cli.SendMessageEvent(roomID, "m.room.message", itemToHTML(feed, item)); err != nil {
logger.WithError(err).WithField("room_id", roomID).Error("Failed to send to room") logger.WithError(err).WithField("room_id", roomID).Error("Failed to send to room")
@ -365,6 +366,50 @@ func itemToHTML(feed *gofeed.Feed, item gofeed.Item) matrix.HTMLMessage {
)) ))
} }
func ensureItemsHaveGUIDs(feed *gofeed.Feed) {
for idx := 0; idx < len(feed.Items); idx++ {
itm := feed.Items[idx]
if itm.GUID == "" {
if itm.Link != "" {
itm.GUID = itm.Link
} else if itm.Title != "" {
itm.GUID = itm.Title
}
feed.Items[idx] = itm
}
}
}
// uniqueStrings returns a new slice of strings with duplicate elements removed.
// Order is otherwise preserved.
func uniqueStrings(a []string) []string {
ret := []string{}
seen := make(map[string]bool)
for _, str := range a {
if seen[str] {
continue
}
seen[str] = true
ret = append(ret, str)
}
return ret
}
// uniqueGuids returns a new slice of GUID strings with duplicate elements removed.
// Order is otherwise preserved.
func uniqueGuids(a []*gofeed.Item) []string {
ret := []string{}
seen := make(map[string]bool)
for _, item := range a {
if seen[item.GUID] {
continue
}
seen[item.GUID] = true
ret = append(ret, item.GUID)
}
return ret
}
type userAgentRoundTripper struct { type userAgentRoundTripper struct {
Transport http.RoundTripper Transport http.RoundTripper
} }

Loading…
Cancel
Save