From 69e0d1bc391aec9cb70580c6c31254e65cf65ed8 Mon Sep 17 00:00:00 2001 From: Nikos Filippakis Date: Thu, 9 Apr 2020 00:00:15 +0200 Subject: [PATCH] RSS show author & ability to filter items (#311) * Show author of RSS feed item if available closes #298 Signed-off-by: Nikos Filippakis * Filter RSS feed items by keywords closes #236, closes #296 Signed-off-by: Nikos Filippakis --- config.sample.yaml | 10 ++ .../go-neb/services/github/github.go | 4 +- .../go-neb/services/rssbot/rssbot.go | 107 ++++++++++++++++-- .../go-neb/services/rssbot/rssbot_test.go | 72 +++++++++++- 4 files changed, 177 insertions(+), 16 deletions(-) diff --git a/config.sample.yaml b/config.sample.yaml index 69a52d3..f447f54 100644 --- a/config.sample.yaml +++ b/config.sample.yaml @@ -100,6 +100,16 @@ services: feeds: "http://lorem-rss.herokuapp.com/feed?unit=second&interval=60": rooms: ["!qmElAGdFYCHoCJuaNt:localhost"] + must_include: + author: + - author1 + description: + - lorem + - ipsum + must_not_include: + title: + - Lorem + - Ipsum - ID: "github_cmd_service" Type: "github" diff --git a/src/github.com/matrix-org/go-neb/services/github/github.go b/src/github.com/matrix-org/go-neb/services/github/github.go index cb867fb..914f628 100644 --- a/src/github.com/matrix-org/go-neb/services/github/github.go +++ b/src/github.com/matrix-org/go-neb/services/github/github.go @@ -450,8 +450,8 @@ func (s *Service) expandCommit(roomID, userID, owner, repo, sha string) interfac if err != nil { log.WithError(err).WithFields(log.Fields{ "owner": owner, - "repo": repo, - "sha": sha, + "repo": repo, + "sha": sha, }).Print("Failed to fetch commit") return nil } diff --git a/src/github.com/matrix-org/go-neb/services/rssbot/rssbot.go b/src/github.com/matrix-org/go-neb/services/rssbot/rssbot.go index 53b09ba..2aa2934 100644 --- a/src/github.com/matrix-org/go-neb/services/rssbot/rssbot.go +++ b/src/github.com/matrix-org/go-neb/services/rssbot/rssbot.go @@ -7,7 +7,9 @@ import ( "html" "net/http" "strconv" + "strings" 
"time" + "unicode" log "github.com/Sirupsen/logrus" "github.com/die-net/lrucache" @@ -34,6 +36,30 @@ var ( const minPollingIntervalSeconds = 60 * 5 // 5 min (News feeds can be genuinely spammy) +// includeRules contains the rules for including or excluding a feed item. For the fields Author, Title +// and Description in a feed item, there can be some words specified in the config that determine whether +// the item will be displayed or not, depending on whether these words are included in that field. +// +// - If specified in the `must_include` field, the feed item must include at least one word for each field +// that has been specified. This means that if some words have been specified for both Author and Title, +// both the Author and Title must contain at least one of their respective words or the item will be skipped. +// - If specified in the `must_not_include` field, the feed item fields must not contain any of the words +// that were specified for each field. This means that if some words have been specified for both Author +// and Title, if either of them includes at least one of their respective words, the item will be skipped, +// even in the case that the item matched the `must_include` rules. +// +// In both cases, specifying an empty list for a field or not specifying anything causes the field to be ignored. +// The field being checked each time will be split into words (any non-alphanumeric character starts a new word) +// and they will be checked against the provided list. +type includeRules struct { + // Author is a case-sensitive list of words that the author name must contain or not contain. + Author []string `json:"author"` + // Title is a case-sensitive list of words that the title must contain or not contain. + Title []string `json:"title"` + // Description is a case-sensitive list of words that the description must contain or not contain. 
+ Description []string `json:"description"` +} + // Service contains the Config fields for this service. // // Example request: @@ -62,6 +88,10 @@ type Service struct { // The time of the last successful poll. This is populated by Go-NEB. Use /getService to retrieve // this value. FeedUpdatedTimestampSecs int64 `json:"last_updated_ts_secs"` + // Specified fields must each include at least one of these words. + MustInclude includeRules `json:"must_include"` + // None of the specified fields must include any of these words. + MustNotInclude includeRules `json:"must_not_include"` // Internal field. When we should poll again. NextPollTimestampSecs int64 // Internal field. The most recently seen GUIDs. Sized to the number of items in the feed. @@ -302,7 +332,44 @@ func (s *Service) queryFeed(feedURL string) (*gofeed.Feed, []gofeed.Item, error) return feed, items, nil } +// containsAny takes a string and an array of words and returns whether any of the words +// in the list are contained in the string. The words in the string are considered to be +// separated by any non-alphanumeric character. +func containsAny(item string, filterWords []string) bool { + itemWords := strings.FieldsFunc(item, func(c rune) bool { + return !unicode.IsLetter(c) && !unicode.IsNumber(c) + }) + for _, itemWord := range itemWords { + for _, filterWord := range filterWords { + if filterWord == itemWord { + return true + } + } + } + return false +} + +func itemFiltered(i *gofeed.Item, mustInclude, mustNotInclude *includeRules) bool { + // At least one word for each field that has been specified must be included for an item to pass the filter. 
+ if (i.Author != nil && len(mustInclude.Author) > 0 && !containsAny(i.Author.Name, mustInclude.Author)) || + (len(mustInclude.Title) > 0 && !containsAny(i.Title, mustInclude.Title)) || + (len(mustInclude.Description) > 0 && !containsAny(i.Description, mustInclude.Description)) { + return true + } + + // If at least one word of any field that has been specified is included in the item, it doesn't pass the filter. + if (i.Author != nil && containsAny(i.Author.Name, mustNotInclude.Author)) || + containsAny(i.Title, mustNotInclude.Title) || + containsAny(i.Description, mustNotInclude.Description) { + return true + } + return false +} + func (s *Service) newItems(feedURL string, allItems []*gofeed.Item) (items []gofeed.Item) { + mustInclude := s.Feeds[feedURL].MustInclude + mustNotInclude := s.Feeds[feedURL].MustNotInclude + for _, i := range allItems { if i == nil { continue @@ -327,8 +394,14 @@ func (s *Service) newItems(feedURL string, allItems []*gofeed.Item) (items []gof // This will inevitably break for some people, but that group of people are probably smaller, so *shrug*. i.Title = html.UnescapeString(i.Title) i.Description = html.UnescapeString(i.Description) + if i.Author != nil { + i.Author.Name = html.UnescapeString(i.Author.Name) + i.Author.Email = html.UnescapeString(i.Author.Email) + } - items = append(items, *i) + if !itemFiltered(i, &mustInclude, &mustNotInclude) { + items = append(items, *i) + } } return } @@ -355,18 +428,30 @@ func itemToHTML(feed *gofeed.Feed, item gofeed.Item) gomatrix.HTMLMessage { if itemTitle == "" { itemTitle = feed.Title } - + + fmtBody := fmt.Sprintf("<strong>%s</strong>:<br><a href=\"%s\"><strong>%s</strong></a>",
+ html.EscapeString(feed.Title), html.EscapeString(item.Link), html.EscapeString(itemTitle)) + if item.Author != nil { + if len(item.Author.Name) > 0 && len(item.Author.Email) > 0 { + fmtBody += fmt.Sprintf(" by <a href=\"mailto:%s\">%s</a>", html.EscapeString(item.Author.Email), + html.EscapeString(item.Author.Name)) + } else if len(item.Author.Name) > 0 { + fmtBody += fmt.Sprintf(" by %s", html.EscapeString(item.Author.Name)) + } else if len(item.Author.Email) > 0 { + fmtBody += fmt.Sprintf(" by <a href=\"mailto:%s\">%s</a>", html.EscapeString(item.Author.Email), + html.EscapeString(item.Author.Email)) + } + } return gomatrix.HTMLMessage{ Body: fmt.Sprintf("%s: %s ( %s )", - html.EscapeString(feed.Title), html.EscapeString(item.Title), html.EscapeString(item.Link)), - MsgType: "m.notice", - Format: "org.matrix.custom.html", - FormattedBody: fmt.Sprintf("<strong>%s</strong>:<br><a href=\"%s\"><strong>%s</strong></a>",
- html.EscapeString(feed.Title), html.EscapeString(item.Link), html.EscapeString(itemTitle)), - // <strong>FeedTitle</strong>: - // <br>
- // <a href="url-of-the-entry"><strong>Title of the Entry</strong></a> - } + html.EscapeString(feed.Title), html.EscapeString(itemTitle), html.EscapeString(item.Link)), + MsgType: "m.notice", + Format: "org.matrix.custom.html", + FormattedBody: fmtBody, + // <strong>FeedTitle</strong>: + // <br>
+ // <a href="url-of-the-entry"><strong>Title of the Entry</strong></a> + } } func ensureItemsHaveGUIDs(feed *gofeed.Feed) { diff --git a/src/github.com/matrix-org/go-neb/services/rssbot/rssbot_test.go b/src/github.com/matrix-org/go-neb/services/rssbot/rssbot_test.go index 5db87e6..b3253dd 100644 --- a/src/github.com/matrix-org/go-neb/services/rssbot/rssbot_test.go +++ b/src/github.com/matrix-org/go-neb/services/rssbot/rssbot_test.go @@ -32,13 +32,13 @@ const rssFeedXML = ` New Item: Majora’s Mask http://go.neb/rss/majoras-mask + The Skullkid! ` -func TestHTMLEntities(t *testing.T) { +func createRSSClient(t *testing.T, feedURL string) *Service { database.SetServiceDB(&database.NopStorage{}) - feedURL := "https://thehappymaskshop.hyrule" // Replace the cachingClient with a mock so we can intercept RSS requests rssTrans := testutils.NewRoundTripper(func(req *http.Request) (*http.Response, error) { if req.URL.String() != feedURL { @@ -55,9 +55,11 @@ func TestHTMLEntities(t *testing.T) { srv, err := types.CreateService("id", "rssbot", "@happy_mask_salesman:hyrule", []byte( `{"feeds": {"`+feedURL+`":{}}}`, // no config yet )) + if err != nil { - t.Fatal("Failed to create RSS bot: ", err) + t.Fatal(err) } + rssbot := srv.(*Service) // Configure the service to force OnPoll to query the RSS feed and attempt to send results @@ -67,6 +69,14 @@ func TestHTMLEntities(t *testing.T) { f.NextPollTimestampSecs = time.Now().Unix() rssbot.Feeds[feedURL] = f + return rssbot +} + +func TestHTMLEntities(t *testing.T) { + feedURL := "https://thehappymaskshop.hyrule" + + rssbot := createRSSClient(t, feedURL) + // Create the Matrix client which will send the notification wg := sync.WaitGroup{} wg.Add(1) @@ -103,3 +113,59 @@ func TestHTMLEntities(t *testing.T) { // Check that the Matrix client sent a message wg.Wait() } + +func TestFeedItemFiltering(t *testing.T) { + feedURL := "https://thehappymaskshop.hyrule" + + // Create rssbot client + rssbot := createRSSClient(t, feedURL) + + feed := rssbot.Feeds[feedURL] + feed.MustInclude.Title 
= []string{"Zelda"} + rssbot.Feeds[feedURL] = feed + + _, items, _ := rssbot.queryFeed(feedURL) + // Expect that we get no items if we filter for 'Zelda' in title + if len(items) != 0 { + t.Errorf("Expected 0 items, got %v", items) + } + + // Recreate rssbot client + rssbot = createRSSClient(t, feedURL) + + feed = rssbot.Feeds[feedURL] + feed.MustInclude.Title = []string{"Majora"} + rssbot.Feeds[feedURL] = feed + + _, items, _ = rssbot.queryFeed(feedURL) + // Expect one item if we filter for 'Majora' in title + if len(items) != 1 { + t.Errorf("Expected 1 item, got %d", len(items)) + } + + // Recreate rssbot client + rssbot = createRSSClient(t, feedURL) + + feed = rssbot.Feeds[feedURL] + feed.MustNotInclude.Author = []string{"kid"} + rssbot.Feeds[feedURL] = feed + + _, items, _ = rssbot.queryFeed(feedURL) + // 'kid' does not match an entire word in the author name, so it's not filtered + if len(items) != 1 { + t.Errorf("Expected 1 item, got %d", len(items)) + } + + // Recreate rssbot client + rssbot = createRSSClient(t, feedURL) + + feed = rssbot.Feeds[feedURL] + feed.MustNotInclude.Author = []string{"Skullkid"} + rssbot.Feeds[feedURL] = feed + + _, items, _ = rssbot.queryFeed(feedURL) + // Expect no items if we filter for 'Skullkid' not in author name + if len(items) != 0 { + t.Errorf("Expected 0 items, got %v", items) + } +}