Browse Source

Fix RSS feeds which do not return GUIDs and do not have published dates

We previously relied on the published date and GUIDs to determine which items
were new. We now only rely on the GUID and not the published date as a lot of
RSS feeds don't have published dates. A lot of feeds don't have GUIDs either,
so we now fall back to the HTTP `Link` field or, in the worst case, the item `Title`.
kegan/rss-notify-on-failures
Kegan Dougal 8 years ago
parent
commit
299164d525
  1. 42
      src/github.com/matrix-org/go-neb/services/rssbot/rssbot.go

42
src/github.com/matrix-org/go-neb/services/rssbot/rssbot.go

@ -211,6 +211,19 @@ func (s *rssBotService) queryFeed(feedURL string) (*gofeed.Feed, []gofeed.Item,
return nil, items, err return nil, items, err
} }
// Patch up the item list: make sure each item has a GUID.
for idx := 0; idx < len(feed.Items); idx++ {
itm := feed.Items[idx]
if itm.GUID == "" {
if itm.Link != "" {
itm.GUID = itm.Link
} else if itm.Title != "" {
itm.GUID = itm.Title
}
feed.Items[idx] = itm
}
}
// Work out which items are new, if any (based on the last updated TS we have) // Work out which items are new, if any (based on the last updated TS we have)
// If the TS is 0 then this is the first ever poll, so let's not send 10s of events // If the TS is 0 then this is the first ever poll, so let's not send 10s of events
// into the room and just do new ones from this point onwards. // into the room and just do new ones from this point onwards.
@ -228,6 +241,8 @@ func (s *rssBotService) queryFeed(feedURL string) (*gofeed.Feed, []gofeed.Item,
i := feed.Items[0] i := feed.Items[0]
if i != nil && i.PublishedParsed != nil { if i != nil && i.PublishedParsed != nil {
feedLastUpdatedTs = i.PublishedParsed.Unix() feedLastUpdatedTs = i.PublishedParsed.Unix()
} else {
feedLastUpdatedTs = time.Now().Unix()
} }
} }
@ -245,25 +260,22 @@ func (s *rssBotService) queryFeed(feedURL string) (*gofeed.Feed, []gofeed.Item,
func (s *rssBotService) newItems(feedURL string, allItems []*gofeed.Item) (items []gofeed.Item) { func (s *rssBotService) newItems(feedURL string, allItems []*gofeed.Item) (items []gofeed.Item) {
for _, i := range allItems { for _, i := range allItems {
if i == nil || i.PublishedParsed == nil {
if i == nil {
continue continue
} }
if i.PublishedParsed.Unix() > s.Feeds[feedURL].FeedUpdatedTimestampSecs {
// if we've seen this guid before, we've sent it before (even if the timestamp is newer)
seenBefore := false
for _, guid := range s.Feeds[feedURL].RecentGUIDs {
if guid == i.GUID {
seenBefore = true
break
}
// if we've seen this guid before, we've sent it before
seenBefore := false
for _, guid := range s.Feeds[feedURL].RecentGUIDs {
if guid == i.GUID {
seenBefore = true
break
} }
if seenBefore {
continue
}
items = append(items, *i)
} }
if seenBefore {
continue
}
items = append(items, *i)
} }
return return
} }

Loading…
Cancel
Save