From 299164d525d860dc80c2cd0e5dba26e77497f86d Mon Sep 17 00:00:00 2001 From: Kegan Dougal Date: Tue, 18 Oct 2016 14:16:09 +0100 Subject: [PATCH] Fix RSS feeds which do not return GUIDs and do not have published dates We previously relied on the published date and GUIDs to determine which items were new. We now rely only on the GUID and not the published date, as a lot of RSS feeds don't have published dates. A lot of feeds don't have GUIDs either, so we now fall back to the item's `Link` field or, in the worst case, the item's `Title`. --- .../go-neb/services/rssbot/rssbot.go | 42 ++++++++++++------- 1 file changed, 27 insertions(+), 15 deletions(-) diff --git a/src/github.com/matrix-org/go-neb/services/rssbot/rssbot.go b/src/github.com/matrix-org/go-neb/services/rssbot/rssbot.go index f89919f..778c077 100644 --- a/src/github.com/matrix-org/go-neb/services/rssbot/rssbot.go +++ b/src/github.com/matrix-org/go-neb/services/rssbot/rssbot.go @@ -211,6 +211,19 @@ func (s *rssBotService) queryFeed(feedURL string) (*gofeed.Feed, []gofeed.Item, return nil, items, err } + // Patch up the item list: make sure each item has a GUID. + for idx := 0; idx < len(feed.Items); idx++ { + itm := feed.Items[idx] + if itm.GUID == "" { + if itm.Link != "" { + itm.GUID = itm.Link + } else if itm.Title != "" { + itm.GUID = itm.Title + } + feed.Items[idx] = itm + } + } + // Work out which items are new, if any (based on the last updated TS we have) // If the TS is 0 then this is the first ever poll, so let's not send 10s of events // into the room and just do new ones from this point onwards. 
@@ -228,6 +241,8 @@ func (s *rssBotService) queryFeed(feedURL string) (*gofeed.Feed, []gofeed.Item, i := feed.Items[0] if i != nil && i.PublishedParsed != nil { feedLastUpdatedTs = i.PublishedParsed.Unix() + } else { + feedLastUpdatedTs = time.Now().Unix() } } @@ -245,25 +260,22 @@ func (s *rssBotService) queryFeed(feedURL string) (*gofeed.Feed, []gofeed.Item, func (s *rssBotService) newItems(feedURL string, allItems []*gofeed.Item) (items []gofeed.Item) { for _, i := range allItems { - if i == nil || i.PublishedParsed == nil { + if i == nil { continue } - - if i.PublishedParsed.Unix() > s.Feeds[feedURL].FeedUpdatedTimestampSecs { - // if we've seen this guid before, we've sent it before (even if the timestamp is newer) - seenBefore := false - for _, guid := range s.Feeds[feedURL].RecentGUIDs { - if guid == i.GUID { - seenBefore = true - break - } + // if we've seen this guid before, we've sent it before + seenBefore := false + for _, guid := range s.Feeds[feedURL].RecentGUIDs { + if guid == i.GUID { + seenBefore = true + break } - if seenBefore { - continue - } - - items = append(items, *i) } + if seenBefore { + continue + } + + items = append(items, *i) } return }