You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

549 lines
19 KiB

8 years ago
8 years ago
8 years ago
4 years ago
5 years ago
  1. // Package rssbot implements a Service capable of reading Atom/RSS feeds.
  2. package rssbot
  3. import (
  4. "errors"
  5. "fmt"
  6. "html"
  7. "net/http"
  8. "strconv"
  9. "strings"
  10. "time"
  11. "unicode"
  12. "github.com/die-net/lrucache"
  13. "github.com/gregjones/httpcache"
  14. "github.com/matrix-org/go-neb/database"
  15. "github.com/matrix-org/go-neb/polling"
  16. "github.com/matrix-org/go-neb/types"
  17. "github.com/mmcdole/gofeed"
  18. "github.com/prometheus/client_golang/prometheus"
  19. log "github.com/sirupsen/logrus"
  20. mevt "maunium.net/go/mautrix/event"
  21. "maunium.net/go/mautrix/id"
  22. )
// ServiceType of the RSS Bot service
const ServiceType = "rssbot"

// cachingClient is the HTTP client readFeed uses to download feeds. It is assigned in init.
var cachingClient *http.Client

var (
	// pollCounter counts feed polls, labelled with the HTTP status of each poll
	// (see incrementMetrics for how the label is chosen).
	pollCounter = prometheus.NewCounterVec(prometheus.CounterOpts{
		Name: "goneb_rss_polls_total",
		Help: "The number of feed polls from RSS services",
	}, []string{"http_status"})
)

// minPollingIntervalSeconds is the shortest gap, in seconds, that is scheduled between two
// polls of the same feed.
const minPollingIntervalSeconds = 60 * 5 // 5 min (News feeds can be genuinely spammy)
// includeRules contains the rules for including or excluding a feed item. For the fields Author, Title
// and Description in a feed item, there can be some words specified in the config that determine whether
// the item will be displayed or not, depending on whether these words are included in that field.
//
//   - If specified in the `must_include` field, the feed item must include at least one word for each field
//     that has been specified. This means that if some words have been specified for both Author and Title,
//     both the Author and Title must contain at least one of their respective words or the item will be skipped.
//   - If specified in the `must_not_include` field, the feed item fields must not contain any of the words
//     that were specified for each field. This means that if some words have been specified for both Author
//     and Title, if either of them includes at least one of their respective words, the item will be skipped,
//     even in the case that the item matched the `must_include` rules.
//
// In both cases, specifying an empty list for a field or not specifying anything causes the field to be ignored.
// The field being checked each time will be split into words (any non-alphanumeric character starts a new word)
// and they will be checked against the provided list.
type includeRules struct {
	// Author is a case-sensitive list of words that the author name must contain or not contain.
	Author []string `json:"author"`
	// Title is a case-sensitive list of words that the item title must contain or not contain.
	Title []string `json:"title"`
	// Description is a case-sensitive list of words that the item description must contain or not contain.
	Description []string `json:"description"`
}
// Service contains the Config fields for this service.
//
// Example request:
//
//	{
//	    feeds: {
//	        "http://rss.cnn.com/rss/edition.rss": {
//	            poll_interval_mins: 60,
//	            rooms: ["!cBrPbzWazCtlkMNQSF:localhost"]
//	        },
//	        "https://www.wired.com/feed/": {
//	            rooms: ["!qmElAGdFYCHoCJuaNt:localhost"]
//	        }
//	    }
//	}
type Service struct {
	types.DefaultService
	// Feeds is a map of feed URL to configuration options for this feed.
	Feeds map[string]struct {
		// Optional. The time to wait between polls, in minutes. Values that do not exceed the minimum
		// polling interval (minPollingIntervalSeconds, expressed in minutes) are ignored and the
		// minimum is used instead.
		PollIntervalMins int `json:"poll_interval_mins"`
		// The list of rooms to send feed updates into. This cannot be empty.
		Rooms []id.RoomID `json:"rooms"`
		// True if rss bot is unable to poll this feed. This is populated by Go-NEB. Use /getService to
		// retrieve this value.
		IsFailing bool `json:"is_failing"`
		// The time of the last successful poll. This is populated by Go-NEB. Use /getService to retrieve
		// this value.
		FeedUpdatedTimestampSecs int64 `json:"last_updated_ts_secs"`
		// Specified fields must each include at least one of these words.
		MustInclude includeRules `json:"must_include"`
		// None of the specified fields must include any of these words.
		MustNotInclude includeRules `json:"must_not_include"`
		// Internal field. When we should poll again. Zero means the feed has never been polled successfully.
		NextPollTimestampSecs int64
		// Internal field. The most recently seen GUIDs, newest first. queryFeed keeps up to twice the
		// largest number of items seen from the feed, capped at 10,000 entries.
		RecentGUIDs []string
	} `json:"feeds"`
}
  94. // Register will check the liveness of each RSS feed given. If all feeds check out okay, no error is returned.
  95. func (s *Service) Register(oldService types.Service, client types.MatrixClient) error {
  96. if len(s.Feeds) == 0 {
  97. // this is an error UNLESS the old service had some feeds in which case they are deleting us :(
  98. var numOldFeeds int
  99. oldFeedService, ok := oldService.(*Service)
  100. if !ok {
  101. log.WithField("service", oldService).Error("Old service isn't an rssbot.Service")
  102. } else {
  103. numOldFeeds = len(oldFeedService.Feeds)
  104. }
  105. if numOldFeeds == 0 {
  106. return errors.New("An RSS feed must be specified")
  107. }
  108. return nil
  109. }
  110. // Make sure we can parse the feed
  111. for feedURL, feedInfo := range s.Feeds {
  112. if _, err := readFeed(feedURL); err != nil {
  113. return fmt.Errorf("Failed to read URL %s: %s", feedURL, err.Error())
  114. }
  115. if len(feedInfo.Rooms) == 0 {
  116. return fmt.Errorf("Feed %s has no rooms to send updates to", feedURL)
  117. }
  118. }
  119. s.joinRooms(client)
  120. return nil
  121. }
  122. func (s *Service) joinRooms(client types.MatrixClient) {
  123. roomSet := make(map[id.RoomID]bool)
  124. for _, feedInfo := range s.Feeds {
  125. for _, roomID := range feedInfo.Rooms {
  126. roomSet[roomID] = true
  127. }
  128. }
  129. for roomID := range roomSet {
  130. if _, err := client.JoinRoom(roomID.String(), "", nil); err != nil {
  131. log.WithFields(log.Fields{
  132. log.ErrorKey: err,
  133. "room_id": roomID,
  134. }).Error("Failed to join room")
  135. }
  136. }
  137. }
  138. // PostRegister deletes this service if there are no feeds remaining.
  139. func (s *Service) PostRegister(oldService types.Service) {
  140. if len(s.Feeds) == 0 { // bye-bye :(
  141. logger := log.WithFields(log.Fields{
  142. "service_id": s.ServiceID(),
  143. "service_type": s.ServiceType(),
  144. })
  145. logger.Info("Deleting service: No feeds remaining.")
  146. polling.StopPolling(s)
  147. if err := database.GetServiceDB().DeleteService(s.ServiceID()); err != nil {
  148. logger.WithError(err).Error("Failed to delete service")
  149. }
  150. }
  151. }
  152. // OnPoll rechecks RSS feeds which are due to be polled.
  153. //
  154. // In order for a feed to be polled, the current time must be greater than NextPollTimestampSecs.
  155. // In order for an item on a feed to be sent to Matrix, the item's GUID must not exist in RecentGUIDs.
  156. // The GUID for an item is created according to the following rules:
  157. // - If there is a GUID field, use it.
  158. // - Else if there is a Link field, use it as the GUID.
  159. // - Else if there is a Title field, use it as the GUID.
  160. //
  161. // Returns a timestamp representing when this Service should have OnPoll called again.
  162. func (s *Service) OnPoll(cli types.MatrixClient) time.Time {
  163. logger := log.WithFields(log.Fields{
  164. "service_id": s.ServiceID(),
  165. "service_type": s.ServiceType(),
  166. })
  167. now := time.Now().Unix() // Second resolution
  168. // Work out which feeds should be polled
  169. var pollFeeds []string
  170. for u, feedInfo := range s.Feeds {
  171. if feedInfo.NextPollTimestampSecs == 0 || now >= feedInfo.NextPollTimestampSecs {
  172. // re-query this feed
  173. pollFeeds = append(pollFeeds, u)
  174. }
  175. }
  176. if len(pollFeeds) == 0 {
  177. return s.nextTimestamp()
  178. }
  179. // Query each feed and send new items to subscribed rooms
  180. for _, u := range pollFeeds {
  181. feed, items, err := s.queryFeed(u)
  182. if err != nil {
  183. logger.WithField("feed_url", u).WithError(err).Error("Failed to query feed")
  184. incrementMetrics(u, err)
  185. continue
  186. }
  187. incrementMetrics(u, nil)
  188. logger.WithFields(log.Fields{
  189. "feed_url": u,
  190. "feed_items": len(feed.Items),
  191. "new_items": len(items),
  192. }).Info("Sending new items")
  193. // Loop backwards since [0] is the most recent and we want to send in chronological order
  194. for i := len(items) - 1; i >= 0; i-- {
  195. item := items[i]
  196. if err := s.sendToRooms(cli, u, feed, item); err != nil {
  197. logger.WithFields(log.Fields{
  198. "feed_url": u,
  199. log.ErrorKey: err,
  200. "item": item,
  201. }).Error("Failed to send item to room due to 429; aborting further sends")
  202. // no point continuing if we errored due to a 429 - we'll just hit the rate limit
  203. // again and again
  204. break
  205. }
  206. }
  207. }
  208. // Persist the service to save the next poll times
  209. if _, err := database.GetServiceDB().StoreService(s); err != nil {
  210. logger.WithError(err).Error("Failed to persist next poll times for service")
  211. }
  212. return s.nextTimestamp()
  213. }
  214. func incrementMetrics(urlStr string, err error) {
  215. if err != nil {
  216. herr, ok := err.(gofeed.HTTPError)
  217. statusCode := 0 // e.g. network timeout
  218. if ok {
  219. statusCode = herr.StatusCode
  220. }
  221. pollCounter.With(prometheus.Labels{"http_status": strconv.Itoa(statusCode)}).Inc()
  222. } else {
  223. pollCounter.With(prometheus.Labels{"http_status": "200"}).Inc() // technically 2xx but gofeed doesn't tell us which
  224. }
  225. }
  226. func (s *Service) nextTimestamp() time.Time {
  227. // return the earliest next poll ts
  228. var earliestNextTs int64
  229. for _, feedInfo := range s.Feeds {
  230. if earliestNextTs == 0 || feedInfo.NextPollTimestampSecs < earliestNextTs {
  231. earliestNextTs = feedInfo.NextPollTimestampSecs
  232. }
  233. }
  234. // Don't allow times in the past. Set a min re-poll threshold of 60s to avoid
  235. // tight-looping on feeds which 500.
  236. now := time.Now().Unix()
  237. if earliestNextTs <= now {
  238. earliestNextTs = now + 60
  239. }
  240. return time.Unix(earliestNextTs, 0)
  241. }
  242. // Query the given feed, update relevant timestamps and return NEW items
  243. func (s *Service) queryFeed(feedURL string) (*gofeed.Feed, []gofeed.Item, error) {
  244. log.WithField("feed_url", feedURL).Info("Querying feed")
  245. var items []gofeed.Item
  246. feed, err := readFeed(feedURL)
  247. // check for no items in addition to any returned errors as it appears some RSS feeds
  248. // do not consistently return items.
  249. if err == nil && len(feed.Items) == 0 {
  250. err = errors.New("feed has 0 items")
  251. }
  252. if err != nil {
  253. f := s.Feeds[feedURL]
  254. f.IsFailing = true
  255. s.Feeds[feedURL] = f
  256. return nil, items, err
  257. }
  258. // Patch up the item list: make sure each item has a GUID.
  259. ensureItemsHaveGUIDs(feed)
  260. // Work out which items are new, if any (based on the last updated TS we have)
  261. // If the TS is 0 then this is the first ever poll, so let's not send 10s of events
  262. // into the room and just do new ones from this point onwards.
  263. if s.Feeds[feedURL].NextPollTimestampSecs != 0 {
  264. items = s.newItems(feedURL, feed.Items)
  265. }
  266. now := time.Now().Unix() // Second resolution
  267. // Work out when to next poll this feed
  268. nextPollTsSec := now + minPollingIntervalSeconds
  269. if s.Feeds[feedURL].PollIntervalMins > int(minPollingIntervalSeconds/60) {
  270. nextPollTsSec = now + int64(s.Feeds[feedURL].PollIntervalMins*60)
  271. }
  272. // TODO: Handle the 'sy' Syndication extension to control update interval.
  273. // See http://www.feedforall.com/syndication.htm and http://web.resource.org/rss/1.0/modules/syndication/
  274. // Work out which GUIDs to remember. We don't want to remember every GUID ever as that leads to completely
  275. // unbounded growth of data.
  276. f := s.Feeds[feedURL]
  277. // Some RSS feeds can return a very small number of items then bounce
  278. // back to their "normal" size, so we cannot just clobber the recent GUID list per request or else we'll
  279. // forget what we sent and resend it. Instead, we'll keep 2x the max number of items that we've ever
  280. // seen from this feed, up to a max of 10,000.
  281. maxGuids := 2 * len(feed.Items)
  282. if len(f.RecentGUIDs) > maxGuids {
  283. maxGuids = len(f.RecentGUIDs) // already 2x'd.
  284. }
  285. if maxGuids > 10000 {
  286. maxGuids = 10000
  287. }
  288. lastSet := uniqueStrings(f.RecentGUIDs) // e.g. [4,5,6]
  289. thisSet := uniqueGuids(feed.Items) // e.g. [1,2,3]
  290. guids := append(thisSet, lastSet...) // e.g. [1,2,3,4,5,6]
  291. guids = uniqueStrings(guids)
  292. if len(guids) > maxGuids {
  293. // Critically this favours the NEWEST elements, which are the ones we're most likely to see again.
  294. guids = guids[0:maxGuids]
  295. }
  296. // Update the service config to persist the new times
  297. f.NextPollTimestampSecs = nextPollTsSec
  298. f.FeedUpdatedTimestampSecs = now
  299. f.RecentGUIDs = guids
  300. f.IsFailing = false
  301. s.Feeds[feedURL] = f
  302. return feed, items, nil
  303. }
  304. // containsAny takes a string and an array of words and returns whether any of the words
  305. // in the list are contained in the string. The words in the string are considered to be
  306. // separated by any non-alphanumeric character.
  307. func containsAny(item string, filterWords []string) bool {
  308. itemWords := strings.FieldsFunc(item, func(c rune) bool {
  309. return !unicode.IsLetter(c) && !unicode.IsNumber(c)
  310. })
  311. for _, itemWord := range itemWords {
  312. for _, filterWord := range filterWords {
  313. if filterWord == itemWord {
  314. return true
  315. }
  316. }
  317. }
  318. return false
  319. }
  320. func itemFiltered(i *gofeed.Item, mustInclude, mustNotInclude *includeRules) bool {
  321. // At least one word for each field that has been specified must be included for an item to pass the filter.
  322. if (i.Author != nil && len(mustInclude.Author) > 0 && !containsAny(i.Author.Name, mustInclude.Author)) ||
  323. (len(mustInclude.Title) > 0 && !containsAny(i.Title, mustInclude.Title)) ||
  324. (len(mustInclude.Description) > 0 && !containsAny(i.Description, mustInclude.Description)) {
  325. return true
  326. }
  327. // If at least one word of any field that has been specified is included in the item, it doesn't pass the filter.
  328. if (i.Author != nil && containsAny(i.Author.Name, mustNotInclude.Author)) ||
  329. containsAny(i.Title, mustNotInclude.Title) ||
  330. containsAny(i.Description, mustNotInclude.Description) {
  331. return true
  332. }
  333. return false
  334. }
  335. func (s *Service) newItems(feedURL string, allItems []*gofeed.Item) (items []gofeed.Item) {
  336. mustInclude := s.Feeds[feedURL].MustInclude
  337. mustNotInclude := s.Feeds[feedURL].MustNotInclude
  338. for _, i := range allItems {
  339. if i == nil {
  340. continue
  341. }
  342. // if we've seen this guid before, we've sent it before
  343. seenBefore := false
  344. for _, guid := range s.Feeds[feedURL].RecentGUIDs {
  345. if guid == i.GUID {
  346. seenBefore = true
  347. break
  348. }
  349. }
  350. if seenBefore {
  351. continue
  352. }
  353. // Decode HTML for <title> and <description>:
  354. // The RSS 2.0 Spec http://cyber.harvard.edu/rss/rss.html#hrelementsOfLtitemgt supports a bunch
  355. // of weird ways to put HTML into <title> and <description> tags. Not all RSS feed producers run
  356. // these fields through entity encoders (some have ' unencoded, others have it as &#8217;). We'll
  357. // assume that all RSS fields are sending HTML for these fields and run them through a standard decoder.
  358. // This will inevitably break for some people, but that group of people are probably smaller, so *shrug*.
  359. i.Title = html.UnescapeString(i.Title)
  360. i.Description = html.UnescapeString(i.Description)
  361. if i.Author != nil {
  362. i.Author.Name = html.UnescapeString(i.Author.Name)
  363. i.Author.Email = html.UnescapeString(i.Author.Email)
  364. }
  365. if !itemFiltered(i, &mustInclude, &mustNotInclude) {
  366. items = append(items, *i)
  367. }
  368. }
  369. return
  370. }
  371. func (s *Service) sendToRooms(cli types.MatrixClient, feedURL string, feed *gofeed.Feed, item gofeed.Item) error {
  372. logger := log.WithFields(log.Fields{
  373. "feed_url": feedURL,
  374. "title": item.Title,
  375. "guid": item.GUID,
  376. })
  377. logger.Info("Sending new feed item")
  378. for _, roomID := range s.Feeds[feedURL].Rooms {
  379. if _, err := cli.SendMessageEvent(roomID, mevt.EventMessage, itemToHTML(feed, item)); err != nil {
  380. if httpErr, ok := err.(mautrix.HTTPError); ok && httpErr.IsStatus(429) {
  381. return err
  382. } else {
  383. logger.WithError(err).WithField("room_id", roomID).Error("Failed to send to room")
  384. }
  385. }
  386. }
  387. return nil
  388. }
  389. func itemToHTML(feed *gofeed.Feed, item gofeed.Item) mevt.MessageEventContent {
  390. // If an item does not have a title, try using the feed's title instead
  391. // Create a new variable instead of mutating that which is passed in
  392. itemTitle := item.Title
  393. if itemTitle == "" {
  394. itemTitle = feed.Title
  395. }
  396. fmtBody := fmt.Sprintf("<strong>%s</strong>:<br><a href=\"%s\"><strong>%s</strong></a>",
  397. html.EscapeString(feed.Title), html.EscapeString(item.Link), html.EscapeString(itemTitle))
  398. if item.Author != nil {
  399. if len(item.Author.Name) > 0 && len(item.Author.Email) > 0 {
  400. fmtBody += fmt.Sprintf(" by <a href=\"mailto:%s\">%s</a>", html.EscapeString(item.Author.Email),
  401. html.EscapeString(item.Author.Name))
  402. } else if len(item.Author.Name) > 0 {
  403. fmtBody += fmt.Sprintf(" by %s", html.EscapeString(item.Author.Name))
  404. } else if len(item.Author.Email) > 0 {
  405. fmtBody += fmt.Sprintf(" by <a href=\"mailto:%s\">%s</a>", html.EscapeString(item.Author.Email),
  406. html.EscapeString(item.Author.Email))
  407. }
  408. }
  409. return mevt.MessageEventContent{
  410. Body: fmt.Sprintf("%s: %s ( %s )",
  411. html.EscapeString(feed.Title), html.EscapeString(itemTitle), html.EscapeString(item.Link)),
  412. MsgType: "m.notice",
  413. Format: mevt.FormatHTML,
  414. FormattedBody: fmtBody,
  415. // <strong>FeedTitle</strong>:
  416. // <br>
  417. // <a href="url-of-the-entry"><strong>Title of the Entry</strong></a>
  418. }
  419. }
  420. func ensureItemsHaveGUIDs(feed *gofeed.Feed) {
  421. for idx := 0; idx < len(feed.Items); idx++ {
  422. itm := feed.Items[idx]
  423. if itm.GUID == "" {
  424. if itm.Link != "" {
  425. itm.GUID = itm.Link
  426. } else if itm.Title != "" {
  427. itm.GUID = itm.Title
  428. }
  429. feed.Items[idx] = itm
  430. }
  431. }
  432. }
  433. // uniqueStrings returns a new slice of strings with duplicate elements removed.
  434. // Order is otherwise preserved.
  435. func uniqueStrings(a []string) []string {
  436. ret := []string{}
  437. seen := make(map[string]bool)
  438. for _, str := range a {
  439. if seen[str] {
  440. continue
  441. }
  442. seen[str] = true
  443. ret = append(ret, str)
  444. }
  445. return ret
  446. }
  447. // uniqueGuids returns a new slice of GUID strings with duplicate elements removed.
  448. // Order is otherwise preserved.
  449. func uniqueGuids(a []*gofeed.Item) []string {
  450. ret := []string{}
  451. seen := make(map[string]bool)
  452. for _, item := range a {
  453. if seen[item.GUID] {
  454. continue
  455. }
  456. seen[item.GUID] = true
  457. ret = append(ret, item.GUID)
  458. }
  459. return ret
  460. }
  461. type userAgentRoundTripper struct {
  462. Transport http.RoundTripper
  463. }
  464. func (rt userAgentRoundTripper) RoundTrip(req *http.Request) (*http.Response, error) {
  465. req.Header.Set("User-Agent", "Go-NEB")
  466. return rt.Transport.RoundTrip(req)
  467. }
  468. func readFeed(feedURL string) (*gofeed.Feed, error) {
  469. // Don't use fp.ParseURL because it leaks on non-2xx responses as of 2016/11/29 (cac19c6c27)
  470. fp := gofeed.NewParser()
  471. resp, err := cachingClient.Get(feedURL)
  472. if resp != nil {
  473. defer resp.Body.Close()
  474. }
  475. if err != nil {
  476. return nil, err
  477. }
  478. if resp.StatusCode < 200 || resp.StatusCode >= 300 {
  479. return nil, gofeed.HTTPError{
  480. StatusCode: resp.StatusCode,
  481. Status: resp.Status,
  482. }
  483. }
  484. return fp.Parse(resp.Body)
  485. }
  486. func init() {
  487. lruCache := lrucache.New(1024*1024*20, 0) // 20 MB cache, no max-age
  488. cachingClient = &http.Client{
  489. Transport: userAgentRoundTripper{httpcache.NewTransport(lruCache)},
  490. }
  491. types.RegisterService(func(serviceID string, serviceUserID id.UserID, webhookEndpointURL string) types.Service {
  492. r := &Service{
  493. DefaultService: types.NewDefaultService(serviceID, serviceUserID, ServiceType),
  494. }
  495. return r
  496. })
  497. prometheus.MustRegister(pollCounter)
  498. }