Browse Source

Merge pull request #84 from matrix-org/kegan/rss-guids

Maintain a list of GUIDs per feed and use them to de-dupe
kegan/metrics
Kegsay 8 years ago
committed by GitHub
parent
commit
d9da3caac4
  1. 46
      src/github.com/matrix-org/go-neb/services/rssbot/rssbot.go

46
src/github.com/matrix-org/go-neb/services/rssbot/rssbot.go

@ -29,6 +29,7 @@ type rssBotService struct {
Rooms []string `json:"rooms"` Rooms []string `json:"rooms"`
NextPollTimestampSecs int64 // Internal: When we should poll again NextPollTimestampSecs int64 // Internal: When we should poll again
FeedUpdatedTimestampSecs int64 // Internal: The last time the feed was updated FeedUpdatedTimestampSecs int64 // Internal: The last time the feed was updated
RecentGUIDs []string // Internal: The most recently seen GUIDs. Sized to the number of items in the feed.
} `json:"feeds"` } `json:"feeds"`
} }
@ -175,14 +176,7 @@ func (s *rssBotService) queryFeed(feedURL string) (*gofeed.Feed, []gofeed.Item,
// If the TS is 0 then this is the first ever poll, so let's not send 10s of events // If the TS is 0 then this is the first ever poll, so let's not send 10s of events
// into the room and just do new ones from this point onwards. // into the room and just do new ones from this point onwards.
if s.Feeds[feedURL].FeedUpdatedTimestampSecs != 0 { if s.Feeds[feedURL].FeedUpdatedTimestampSecs != 0 {
for _, i := range feed.Items {
if i == nil || i.PublishedParsed == nil {
continue
}
if i.PublishedParsed.Unix() > s.Feeds[feedURL].FeedUpdatedTimestampSecs {
items = append(items, *i)
}
}
items = s.newItems(feedURL, feed.Items)
} }
now := time.Now().Unix() // Second resolution now := time.Now().Unix() // Second resolution
@ -206,11 +200,42 @@ func (s *rssBotService) queryFeed(feedURL string) (*gofeed.Feed, []gofeed.Item,
// TODO: Handle the 'sy' Syndication extension to control update interval. // TODO: Handle the 'sy' Syndication extension to control update interval.
// See http://www.feedforall.com/syndication.htm and http://web.resource.org/rss/1.0/modules/syndication/ // See http://www.feedforall.com/syndication.htm and http://web.resource.org/rss/1.0/modules/syndication/
s.updateFeedInfo(feedURL, nextPollTsSec, feedLastUpdatedTs)
s.updateFeedInfo(feedURL, feed.Items, nextPollTsSec, feedLastUpdatedTs)
return feed, items, nil return feed, items, nil
} }
func (s *rssBotService) updateFeedInfo(feedURL string, nextPollTs, feedUpdatedTs int64) {
func (s *rssBotService) newItems(feedURL string, allItems []*gofeed.Item) (items []gofeed.Item) {
for _, i := range allItems {
if i == nil || i.PublishedParsed == nil {
continue
}
if i.PublishedParsed.Unix() > s.Feeds[feedURL].FeedUpdatedTimestampSecs {
// if we've seen this guid before, we've sent it before (even if the timestamp is newer)
seenBefore := false
for _, guid := range s.Feeds[feedURL].RecentGUIDs {
if guid == i.GUID {
seenBefore = true
break
}
}
if seenBefore {
continue
}
items = append(items, *i)
}
}
return
}
func (s *rssBotService) updateFeedInfo(feedURL string, allFeedItems []*gofeed.Item, nextPollTs, feedUpdatedTs int64) {
// map items to guid strings
var guids []string
for _, i := range allFeedItems {
guids = append(guids, i.GUID)
}
for u := range s.Feeds { for u := range s.Feeds {
if u != feedURL { if u != feedURL {
continue continue
@ -218,6 +243,7 @@ func (s *rssBotService) updateFeedInfo(feedURL string, nextPollTs, feedUpdatedTs
f := s.Feeds[u] f := s.Feeds[u]
f.NextPollTimestampSecs = nextPollTs f.NextPollTimestampSecs = nextPollTs
f.FeedUpdatedTimestampSecs = feedUpdatedTs f.FeedUpdatedTimestampSecs = feedUpdatedTs
f.RecentGUIDs = guids
s.Feeds[u] = f s.Feeds[u] = f
} }
} }

Loading…
Cancel
Save