Browse Source

Merge pull request #135 from matrix-org/kegan/min-guid-length

Redo how recent GUIDs are calculated
kegan/rss-no-cache
Kegsay 8 years ago
committed by GitHub
parent
commit
2480163739
  1. 96
      src/github.com/matrix-org/go-neb/services/rssbot/rssbot.go

96
src/github.com/matrix-org/go-neb/services/rssbot/rssbot.go

@ -256,17 +256,7 @@ func (s *Service) queryFeed(feedURL string) (*gofeed.Feed, []gofeed.Item, error)
}
// Patch up the item list: make sure each item has a GUID.
for idx := 0; idx < len(feed.Items); idx++ {
itm := feed.Items[idx]
if itm.GUID == "" {
if itm.Link != "" {
itm.GUID = itm.Link
} else if itm.Title != "" {
itm.GUID = itm.Title
}
feed.Items[idx] = itm
}
}
ensureItemsHaveGUIDs(feed)
// Work out which items are new, if any (based on the last updated TS we have)
// If the TS is 0 then this is the first ever poll, so let's not send 10s of events
@ -285,20 +275,28 @@ func (s *Service) queryFeed(feedURL string) (*gofeed.Feed, []gofeed.Item, error)
// TODO: Handle the 'sy' Syndication extension to control update interval.
// See http://www.feedforall.com/syndication.htm and http://web.resource.org/rss/1.0/modules/syndication/
// map items to guid strings
var guids []string
for _, itm := range feed.Items {
guids = append(guids, itm.GUID)
}
// Work out which GUIDs to remember. We don't want to remember every GUID ever as that leads to completely
// unbounded growth of data.
f := s.Feeds[feedURL]
// Some RSS feeds can return a very small number of items then bounce
// back to their "normal" size, so we cannot just clobber the recent GUID list per request or else we'll
// forget what we sent and resend it. Instead, we'll keep 2x the max number of items that we've ever
// seen from this feed, up to a max of 1000.
maxGuids := 2 * len(feed.Items)
if len(f.RecentGUIDs) > maxGuids {
maxGuids = len(f.RecentGUIDs) // already 2x'd.
}
if maxGuids > 1000 {
maxGuids = 1000
}
if len(guids) != len(f.RecentGUIDs) {
log.WithFields(log.Fields{
"new_guids": guids,
"old_guids": f.RecentGUIDs,
"feed_url": feedURL,
}).Warn("GUID length mismatch")
lastSet := uniqueStrings(f.RecentGUIDs) // e.g. [4,5,6]
thisSet := uniqueGuids(feed.Items) // e.g. [1,2,3]
guids := append(thisSet, lastSet...) // e.g. [1,2,3,4,5,6]
guids = uniqueStrings(guids)
if len(guids) > maxGuids {
// Critically this favours the NEWEST elements, which are the ones we're most likely to see again.
guids = guids[0:maxGuids]
}
// Update the service config to persist the new times
@ -343,8 +341,12 @@ func (s *Service) newItems(feedURL string, allItems []*gofeed.Item) (items []gof
}
func (s *Service) sendToRooms(cli *matrix.Client, feedURL string, feed *gofeed.Feed, item gofeed.Item) error {
logger := log.WithField("feed_url", feedURL).WithField("title", item.Title)
logger.Info("New feed item")
logger := log.WithFields(log.Fields{
"feed_url": feedURL,
"title": item.Title,
"guid": item.GUID,
})
logger.Info("Sending new feed item")
for _, roomID := range s.Feeds[feedURL].Rooms {
if _, err := cli.SendMessageEvent(roomID, "m.room.message", itemToHTML(feed, item)); err != nil {
logger.WithError(err).WithField("room_id", roomID).Error("Failed to send to room")
@ -361,6 +363,50 @@ func itemToHTML(feed *gofeed.Feed, item gofeed.Item) matrix.HTMLMessage {
))
}
// ensureItemsHaveGUIDs walks feed.Items and fills in a GUID for every item
// that lacks one. The item's Link is used when present; otherwise its Title
// is used. An item with neither keeps its empty GUID.
func ensureItemsHaveGUIDs(feed *gofeed.Feed) {
	for pos, entry := range feed.Items {
		if entry.GUID != "" {
			// Already identifiable; nothing to patch.
			continue
		}
		switch {
		case entry.Link != "":
			entry.GUID = entry.Link
		case entry.Title != "":
			entry.GUID = entry.Title
		}
		// Store the (possibly updated) entry back into the list.
		feed.Items[pos] = entry
	}
}
// uniqueStrings copies a into a fresh slice, dropping every element that has
// already appeared earlier in a. The first occurrence of each string is kept,
// so relative order is preserved. The result is always non-nil, even when a
// is nil or empty.
func uniqueStrings(a []string) []string {
	out := make([]string, 0)
	known := map[string]struct{}{}
	for i := range a {
		s := a[i]
		if _, dup := known[s]; dup {
			continue
		}
		known[s] = struct{}{}
		out = append(out, s)
	}
	return out
}
// uniqueGuids collects the GUID of every item in a into a fresh slice,
// keeping only the first occurrence of each GUID so that the order in which
// GUIDs first appear is preserved. The result is always non-nil.
func uniqueGuids(a []*gofeed.Item) []string {
	guids := make([]string, 0)
	known := map[string]struct{}{}
	for i := range a {
		guid := a[i].GUID
		if _, dup := known[guid]; dup {
			continue
		}
		known[guid] = struct{}{}
		guids = append(guids, guid)
	}
	return guids
}
// userAgentRoundTripper holds an underlying http.RoundTripper in its
// Transport field.
// NOTE(review): presumably a RoundTrip method (not visible in this view) sets
// a User-Agent header and then delegates to Transport — confirm against the
// rest of the file.
type userAgentRoundTripper struct {
Transport http.RoundTripper
}

Loading…
Cancel
Save