Browse Source

RSS show author & ability to filter items (#311)

* Show author of RSS feed item if available

closes #298

Signed-off-by: Nikos Filippakis <me@nfil.dev>

* Filter RSS feed items by keywords

closes #236, closes #296

Signed-off-by: Nikos Filippakis <me@nfil.dev>
pull/318/head
Nikos Filippakis 5 years ago
committed by GitHub
parent
commit
69e0d1bc39
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
  1. 10
      config.sample.yaml
  2. 4
      src/github.com/matrix-org/go-neb/services/github/github.go
  3. 107
      src/github.com/matrix-org/go-neb/services/rssbot/rssbot.go
  4. 72
      src/github.com/matrix-org/go-neb/services/rssbot/rssbot_test.go

10
config.sample.yaml

@ -100,6 +100,16 @@ services:
feeds:
"http://lorem-rss.herokuapp.com/feed?unit=second&interval=60":
rooms: ["!qmElAGdFYCHoCJuaNt:localhost"]
must_include:
author:
- author1
description:
- lorem
- ipsum
must_not_include:
title:
- Lorem
- Ipsum
- ID: "github_cmd_service"
Type: "github"

4
src/github.com/matrix-org/go-neb/services/github/github.go

@ -450,8 +450,8 @@ func (s *Service) expandCommit(roomID, userID, owner, repo, sha string) interfac
if err != nil {
log.WithError(err).WithFields(log.Fields{
"owner": owner,
"repo": repo,
"sha": sha,
"repo": repo,
"sha": sha,
}).Print("Failed to fetch commit")
return nil
}

107
src/github.com/matrix-org/go-neb/services/rssbot/rssbot.go

@ -7,7 +7,9 @@ import (
"html"
"net/http"
"strconv"
"strings"
"time"
"unicode"
log "github.com/Sirupsen/logrus"
"github.com/die-net/lrucache"
@ -34,6 +36,30 @@ var (
const minPollingIntervalSeconds = 60 * 5 // 5 min (News feeds can be genuinely spammy)
// includeRules contains the rules for including or excluding a feed item. For the fields Author, Title
// and Description in a feed item, there can be some words specified in the config that determine whether
// the item will be displayed or not, depending on whether these words are included in that field.
//
// - If specified in the `must_include` field, the feed item must include at least one word for each field
// that has been specified. This means that if some words have been specified for both Author and Title,
// both the Author and Title must contain at least one of their respective words or the item will be skipped.
// - If specified in the `must_not_include` field, the feed item fields must not contain any of the words
// that were specified for each field. This means that if some words have been specified for both Author
// and Title, if either of them includes at least one of their respective words, the item will be skipped,
// even in the case that the item matched the `must_include` rules.
//
// In both cases, specifying an empty list for a field or not specifying anything causes the field to be ignored.
// The field being checked each time will be split into words (any non-alphanumeric character starts a new word)
// and they will be checked against the provided list.
type includeRules struct {
// Author is a case-sensitive list of words that the author name must contain or not contain.
Author []string `json:"author"`
// Title is a case-sensitive list of words that the item title must contain or not contain.
Title []string `json:"title"`
// Description is a case-sensitive list of words that the item description must contain or not contain.
Description []string `json:"description"`
}
// Service contains the Config fields for this service.
//
// Example request:
@ -62,6 +88,10 @@ type Service struct {
// The time of the last successful poll. This is populated by Go-NEB. Use /getService to retrieve
// this value.
FeedUpdatedTimestampSecs int64 `json:"last_updated_ts_secs"`
// Specified fields must each include at least one of these words.
MustInclude includeRules `json:"must_include"`
// None of the specified fields must include any of these words.
MustNotInclude includeRules `json:"must_not_include"`
// Internal field. When we should poll again.
NextPollTimestampSecs int64
// Internal field. The most recently seen GUIDs. Sized to the number of items in the feed.
@ -302,7 +332,44 @@ func (s *Service) queryFeed(feedURL string) (*gofeed.Feed, []gofeed.Item, error)
return feed, items, nil
}
// containsAny reports whether at least one entry of filterWords occurs as a whole
// word in item. The item is tokenised on every rune that is neither a letter nor a
// number, and the comparison against each filter word is exact (case-sensitive).
// An empty or nil filterWords list never matches.
func containsAny(item string, filterWords []string) bool {
	if len(filterWords) == 0 {
		return false
	}

	// Anything that is not a letter or a number separates two words.
	isSeparator := func(r rune) bool {
		return !(unicode.IsLetter(r) || unicode.IsNumber(r))
	}

	for _, word := range strings.FieldsFunc(item, isSeparator) {
		for idx := range filterWords {
			if word == filterWords[idx] {
				return true
			}
		}
	}
	return false
}
// itemFiltered reports whether the feed item i must be dropped according to the
// configured rules. It returns true when the item should be skipped and false when
// it may be delivered.
//
// mustInclude lists, per field, the words of which at least one has to be present;
// mustNotInclude lists, per field, the words of which none may be present. A field
// with an empty word list is ignored. Exclusion wins: an item that satisfies
// mustInclude is still dropped if it matches mustNotInclude.
func itemFiltered(i *gofeed.Item, mustInclude, mustNotInclude *includeRules) bool {
	// An item that carries no author information is treated as having an empty author
	// name. This way a configured must_include author rule cannot be bypassed simply
	// because the feed omitted the author, while must_not_include is unaffected (an
	// empty name never matches any word).
	var author string
	if i.Author != nil {
		author = i.Author.Name
	}

	// At least one word for each field that has been specified must be included for an item to pass the filter.
	if (len(mustInclude.Author) > 0 && !containsAny(author, mustInclude.Author)) ||
		(len(mustInclude.Title) > 0 && !containsAny(i.Title, mustInclude.Title)) ||
		(len(mustInclude.Description) > 0 && !containsAny(i.Description, mustInclude.Description)) {
		return true
	}

	// If at least one word of any field that has been specified is included in the item, it doesn't pass the filter.
	return containsAny(author, mustNotInclude.Author) ||
		containsAny(i.Title, mustNotInclude.Title) ||
		containsAny(i.Description, mustNotInclude.Description)
}
func (s *Service) newItems(feedURL string, allItems []*gofeed.Item) (items []gofeed.Item) {
mustInclude := s.Feeds[feedURL].MustInclude
mustNotInclude := s.Feeds[feedURL].MustNotInclude
for _, i := range allItems {
if i == nil {
continue
@ -327,8 +394,14 @@ func (s *Service) newItems(feedURL string, allItems []*gofeed.Item) (items []gof
// This will inevitably break for some people, but that group of people are probably smaller, so *shrug*.
i.Title = html.UnescapeString(i.Title)
i.Description = html.UnescapeString(i.Description)
if i.Author != nil {
i.Author.Name = html.UnescapeString(i.Author.Name)
i.Author.Email = html.UnescapeString(i.Author.Email)
}
items = append(items, *i)
if !itemFiltered(i, &mustInclude, &mustNotInclude) {
items = append(items, *i)
}
}
return
}
@ -355,18 +428,30 @@ func itemToHTML(feed *gofeed.Feed, item gofeed.Item) gomatrix.HTMLMessage {
if itemTitle == "" {
itemTitle = feed.Title
}
fmtBody := fmt.Sprintf("<strong>%s</strong>:<br><a href=\"%s\"><strong>%s</strong></a>",
html.EscapeString(feed.Title), html.EscapeString(item.Link), html.EscapeString(itemTitle))
if item.Author != nil {
if len(item.Author.Name) > 0 && len(item.Author.Email) > 0 {
fmtBody += fmt.Sprintf(" by <a href=\"mailto:%s\">%s</a>", html.EscapeString(item.Author.Email),
html.EscapeString(item.Author.Name))
} else if len(item.Author.Name) > 0 {
fmtBody += fmt.Sprintf(" by %s", html.EscapeString(item.Author.Name))
} else if len(item.Author.Email) > 0 {
fmtBody += fmt.Sprintf(" by <a href=\"mailto:%s\">%s</a>", html.EscapeString(item.Author.Email),
html.EscapeString(item.Author.Email))
}
}
return gomatrix.HTMLMessage{
Body: fmt.Sprintf("%s: %s ( %s )",
html.EscapeString(feed.Title), html.EscapeString(item.Title), html.EscapeString(item.Link)),
MsgType: "m.notice",
Format: "org.matrix.custom.html",
FormattedBody: fmt.Sprintf("<strong>%s</strong>:<br><a href=\"%s\"><strong>%s</strong></a>",
html.EscapeString(feed.Title), html.EscapeString(item.Link), html.EscapeString(itemTitle)),
// <strong>FeedTitle</strong>:
// <br>
// <a href="url-of-the-entry"><strong>Title of the Entry</strong></a>
}
html.EscapeString(feed.Title), html.EscapeString(itemTitle), html.EscapeString(item.Link)),
MsgType: "m.notice",
Format: "org.matrix.custom.html",
FormattedBody: fmtBody,
// <strong>FeedTitle</strong>:
// <br>
// <a href="url-of-the-entry"><strong>Title of the Entry</strong></a>
}
}
func ensureItemsHaveGUIDs(feed *gofeed.Feed) {

72
src/github.com/matrix-org/go-neb/services/rssbot/rssbot_test.go

@ -32,13 +32,13 @@ const rssFeedXML = `
<item>
<title>New Item: Majora&#8217;s Mask</title>
<link>http://go.neb/rss/majoras-mask</link>
<author>The Skullkid!</author>
</item>
</channel>
</rss>`
func TestHTMLEntities(t *testing.T) {
func createRSSClient(t *testing.T, feedURL string) *Service {
database.SetServiceDB(&database.NopStorage{})
feedURL := "https://thehappymaskshop.hyrule"
// Replace the cachingClient with a mock so we can intercept RSS requests
rssTrans := testutils.NewRoundTripper(func(req *http.Request) (*http.Response, error) {
if req.URL.String() != feedURL {
@ -55,9 +55,11 @@ func TestHTMLEntities(t *testing.T) {
srv, err := types.CreateService("id", "rssbot", "@happy_mask_salesman:hyrule", []byte(
`{"feeds": {"`+feedURL+`":{}}}`, // no config yet
))
if err != nil {
t.Fatal("Failed to create RSS bot: ", err)
t.Fatal(err)
}
rssbot := srv.(*Service)
// Configure the service to force OnPoll to query the RSS feed and attempt to send results
@ -67,6 +69,14 @@ func TestHTMLEntities(t *testing.T) {
f.NextPollTimestampSecs = time.Now().Unix()
rssbot.Feeds[feedURL] = f
return rssbot
}
func TestHTMLEntities(t *testing.T) {
feedURL := "https://thehappymaskshop.hyrule"
rssbot := createRSSClient(t, feedURL)
// Create the Matrix client which will send the notification
wg := sync.WaitGroup{}
wg.Add(1)
@ -103,3 +113,59 @@ func TestHTMLEntities(t *testing.T) {
// Check that the Matrix client sent a message
wg.Wait()
}
// TestFeedItemFiltering checks that the must_include / must_not_include rules of a
// feed decide which items queryFeed returns. Every case fails hard if the feed query
// itself errors, so an "expect 0 items" case cannot pass just because nothing was
// fetched.
func TestFeedItemFiltering(t *testing.T) {
	const feedURL = "https://thehappymaskshop.hyrule"

	testCases := []struct {
		name          string
		includeTitle  []string // assigned to MustInclude.Title
		excludeAuthor []string // assigned to MustNotInclude.Author
		wantItems     int
	}{
		{
			// Expect that we get no items if we filter for 'Zelda' in title
			name:         "must include title word that is absent",
			includeTitle: []string{"Zelda"},
			wantItems:    0,
		},
		{
			// Expect one item if we filter for 'Majora' in title
			name:         "must include title word that is present",
			includeTitle: []string{"Majora"},
			wantItems:    1,
		},
		{
			// 'kid' does not match an entire word in the author name, so it's not filtered
			name:          "must not include partial author word",
			excludeAuthor: []string{"kid"},
			wantItems:     1,
		},
		{
			// Expect no items if we filter for 'Skullkid' not in author name
			name:          "must not include full author word",
			excludeAuthor: []string{"Skullkid"},
			wantItems:     0,
		},
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			// Each case gets its own freshly created client, as the original test did
			// (presumably so state from an earlier query cannot hide items — TODO confirm).
			rssbot := createRSSClient(t, feedURL)
			feed := rssbot.Feeds[feedURL]
			feed.MustInclude.Title = tc.includeTitle
			feed.MustNotInclude.Author = tc.excludeAuthor
			rssbot.Feeds[feedURL] = feed

			_, items, err := rssbot.queryFeed(feedURL)
			if err != nil {
				t.Fatalf("queryFeed(%q) failed: %v", feedURL, err)
			}
			if len(items) != tc.wantItems {
				t.Errorf("Expected %d items, got %d: %v", tc.wantItems, len(items), items)
			}
		})
	}
}
Loading…
Cancel
Save