You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

542 lines
18 KiB

8 years ago
8 years ago
8 years ago
5 years ago
  1. // Package rssbot implements a Service capable of reading Atom/RSS feeds.
  2. package rssbot
  3. import (
  4. "errors"
  5. "fmt"
  6. "html"
  7. "net/http"
  8. "strconv"
  9. "strings"
  10. "time"
  11. "unicode"
  12. "github.com/die-net/lrucache"
  13. "github.com/gregjones/httpcache"
  14. "github.com/matrix-org/go-neb/database"
  15. "github.com/matrix-org/go-neb/polling"
  16. "github.com/matrix-org/go-neb/types"
  17. "github.com/mmcdole/gofeed"
  18. "github.com/prometheus/client_golang/prometheus"
  19. log "github.com/sirupsen/logrus"
  20. mevt "maunium.net/go/mautrix/event"
  21. "maunium.net/go/mautrix/id"
  22. )
// ServiceType of the RSS Bot service
const ServiceType = "rssbot"

// cachingClient is the shared HTTP client that readFeed uses to fetch feeds.
// It is assigned in init.
var cachingClient *http.Client

var (
	// pollCounter counts feed polls, labelled with the HTTP status of each poll.
	// incrementMetrics records "200" for successful polls and "0" for failures
	// that carry no HTTP status.
	pollCounter = prometheus.NewCounterVec(prometheus.CounterOpts{
		Name: "goneb_rss_polls_total",
		Help: "The number of feed polls from RSS services",
	}, []string{"http_status"})
)

// minPollingIntervalSeconds is the shortest interval, in seconds, between two polls of the same feed.
const minPollingIntervalSeconds = 60 * 5 // 5 min (News feeds can be genuinely spammy)
// includeRules contains the rules for including or excluding a feed item. For the fields Author, Title
// and Description in a feed item, there can be some words specified in the config that determine whether
// the item will be displayed or not, depending on whether these words are included in that field.
//
// - If specified in the `must_include` field, the feed item must include at least one word for each field
// that has been specified. This means that if some words have been specified for both Author and Title,
// both the Author and Title must contain at least one of their respective words or the item will be skipped.
// - If specified in the `must_not_include` field, the feed item fields must not contain any of the words
// that were specified for each field. This means that if some words have been specified for both Author
// and Title, if either of them includes at least one of their respective words, the item will be skipped,
// even in the case that the item matched the `must_include` rules.
//
// In both cases, specifying an empty list for a field or not specifying anything causes the field to be ignored.
// The field being checked each time will be split into words (any non-alphanumeric character starts a new word)
// and they will be checked against the provided list.
type includeRules struct {
	// Author is a case-sensitive list of words that the author name must contain or not contain.
	Author []string `json:"author"`
	// Title is a case-sensitive list of words that the item title must contain or not contain.
	Title []string `json:"title"`
	// Description is a case-sensitive list of words that the item description must contain or not contain.
	Description []string `json:"description"`
}
// Service contains the Config fields for this service.
//
// Example request:
//
//	{
//	    feeds: {
//	        "http://rss.cnn.com/rss/edition.rss": {
//	            poll_interval_mins: 60,
//	            rooms: ["!cBrPbzWazCtlkMNQSF:localhost"]
//	        },
//	        "https://www.wired.com/feed/": {
//	            rooms: ["!qmElAGdFYCHoCJuaNt:localhost"]
//	        }
//	    }
//	}
type Service struct {
	types.DefaultService
	// Feeds is a map of feed URL to configuration options for this feed.
	Feeds map[string]struct {
		// Optional. The number of minutes to wait between polls. Values that do not exceed the minimum
		// (minPollingIntervalSeconds expressed in minutes) are ignored and the minimum is used instead.
		PollIntervalMins int `json:"poll_interval_mins"`
		// The list of rooms to send feed updates into. This cannot be empty.
		Rooms []id.RoomID `json:"rooms"`
		// True if rss bot is unable to poll this feed. This is populated by Go-NEB. Use /getService to
		// retrieve this value.
		IsFailing bool `json:"is_failing"`
		// The time of the last successful poll. This is populated by Go-NEB. Use /getService to retrieve
		// this value.
		FeedUpdatedTimestampSecs int64 `json:"last_updated_ts_secs"`
		// Specified fields must each include at least one of these words.
		MustInclude includeRules `json:"must_include"`
		// None of the specified fields must include any of these words.
		MustNotInclude includeRules `json:"must_not_include"`
		// Internal field. When we should poll again, in seconds since the Unix epoch.
		// Zero means the feed has not been successfully polled yet.
		NextPollTimestampSecs int64
		// Internal field. The most recently seen GUIDs, newest first. queryFeed keeps up to twice the
		// number of items seen in the feed, capped at 10,000 entries.
		RecentGUIDs []string
	} `json:"feeds"`
}
  94. // Register will check the liveness of each RSS feed given. If all feeds check out okay, no error is returned.
  95. func (s *Service) Register(oldService types.Service, client types.MatrixClient) error {
  96. if len(s.Feeds) == 0 {
  97. // this is an error UNLESS the old service had some feeds in which case they are deleting us :(
  98. var numOldFeeds int
  99. oldFeedService, ok := oldService.(*Service)
  100. if !ok {
  101. log.WithField("service", oldService).Error("Old service isn't an rssbot.Service")
  102. } else {
  103. numOldFeeds = len(oldFeedService.Feeds)
  104. }
  105. if numOldFeeds == 0 {
  106. return errors.New("An RSS feed must be specified")
  107. }
  108. return nil
  109. }
  110. // Make sure we can parse the feed
  111. for feedURL, feedInfo := range s.Feeds {
  112. if _, err := readFeed(feedURL); err != nil {
  113. return fmt.Errorf("Failed to read URL %s: %s", feedURL, err.Error())
  114. }
  115. if len(feedInfo.Rooms) == 0 {
  116. return fmt.Errorf("Feed %s has no rooms to send updates to", feedURL)
  117. }
  118. }
  119. s.joinRooms(client)
  120. return nil
  121. }
  122. func (s *Service) joinRooms(client types.MatrixClient) {
  123. roomSet := make(map[id.RoomID]bool)
  124. for _, feedInfo := range s.Feeds {
  125. for _, roomID := range feedInfo.Rooms {
  126. roomSet[roomID] = true
  127. }
  128. }
  129. for roomID := range roomSet {
  130. if _, err := client.JoinRoom(roomID.String(), "", nil); err != nil {
  131. log.WithFields(log.Fields{
  132. log.ErrorKey: err,
  133. "room_id": roomID,
  134. }).Error("Failed to join room")
  135. }
  136. }
  137. }
  138. // PostRegister deletes this service if there are no feeds remaining.
  139. func (s *Service) PostRegister(oldService types.Service) {
  140. if len(s.Feeds) == 0 { // bye-bye :(
  141. logger := log.WithFields(log.Fields{
  142. "service_id": s.ServiceID(),
  143. "service_type": s.ServiceType(),
  144. })
  145. logger.Info("Deleting service: No feeds remaining.")
  146. polling.StopPolling(s)
  147. if err := database.GetServiceDB().DeleteService(s.ServiceID()); err != nil {
  148. logger.WithError(err).Error("Failed to delete service")
  149. }
  150. }
  151. }
  152. // OnPoll rechecks RSS feeds which are due to be polled.
  153. //
  154. // In order for a feed to be polled, the current time must be greater than NextPollTimestampSecs.
  155. // In order for an item on a feed to be sent to Matrix, the item's GUID must not exist in RecentGUIDs.
  156. // The GUID for an item is created according to the following rules:
  157. // - If there is a GUID field, use it.
  158. // - Else if there is a Link field, use it as the GUID.
  159. // - Else if there is a Title field, use it as the GUID.
  160. //
  161. // Returns a timestamp representing when this Service should have OnPoll called again.
  162. func (s *Service) OnPoll(cli types.MatrixClient) time.Time {
  163. logger := log.WithFields(log.Fields{
  164. "service_id": s.ServiceID(),
  165. "service_type": s.ServiceType(),
  166. })
  167. now := time.Now().Unix() // Second resolution
  168. // Work out which feeds should be polled
  169. var pollFeeds []string
  170. for u, feedInfo := range s.Feeds {
  171. if feedInfo.NextPollTimestampSecs == 0 || now >= feedInfo.NextPollTimestampSecs {
  172. // re-query this feed
  173. pollFeeds = append(pollFeeds, u)
  174. }
  175. }
  176. if len(pollFeeds) == 0 {
  177. return s.nextTimestamp()
  178. }
  179. // Query each feed and send new items to subscribed rooms
  180. for _, u := range pollFeeds {
  181. feed, items, err := s.queryFeed(u)
  182. if err != nil {
  183. logger.WithField("feed_url", u).WithError(err).Error("Failed to query feed")
  184. incrementMetrics(u, err)
  185. continue
  186. }
  187. incrementMetrics(u, nil)
  188. logger.WithFields(log.Fields{
  189. "feed_url": u,
  190. "feed_items": len(feed.Items),
  191. "new_items": len(items),
  192. }).Info("Sending new items")
  193. // Loop backwards since [0] is the most recent and we want to send in chronological order
  194. for i := len(items) - 1; i >= 0; i-- {
  195. item := items[i]
  196. if err := s.sendToRooms(cli, u, feed, item); err != nil {
  197. logger.WithFields(log.Fields{
  198. "feed_url": u,
  199. log.ErrorKey: err,
  200. "item": item,
  201. }).Error("Failed to send item to room")
  202. }
  203. }
  204. }
  205. // Persist the service to save the next poll times
  206. if _, err := database.GetServiceDB().StoreService(s); err != nil {
  207. logger.WithError(err).Error("Failed to persist next poll times for service")
  208. }
  209. return s.nextTimestamp()
  210. }
  211. func incrementMetrics(urlStr string, err error) {
  212. if err != nil {
  213. herr, ok := err.(gofeed.HTTPError)
  214. statusCode := 0 // e.g. network timeout
  215. if ok {
  216. statusCode = herr.StatusCode
  217. }
  218. pollCounter.With(prometheus.Labels{"http_status": strconv.Itoa(statusCode)}).Inc()
  219. } else {
  220. pollCounter.With(prometheus.Labels{"http_status": "200"}).Inc() // technically 2xx but gofeed doesn't tell us which
  221. }
  222. }
  223. func (s *Service) nextTimestamp() time.Time {
  224. // return the earliest next poll ts
  225. var earliestNextTs int64
  226. for _, feedInfo := range s.Feeds {
  227. if earliestNextTs == 0 || feedInfo.NextPollTimestampSecs < earliestNextTs {
  228. earliestNextTs = feedInfo.NextPollTimestampSecs
  229. }
  230. }
  231. // Don't allow times in the past. Set a min re-poll threshold of 60s to avoid
  232. // tight-looping on feeds which 500.
  233. now := time.Now().Unix()
  234. if earliestNextTs <= now {
  235. earliestNextTs = now + 60
  236. }
  237. return time.Unix(earliestNextTs, 0)
  238. }
  239. // Query the given feed, update relevant timestamps and return NEW items
  240. func (s *Service) queryFeed(feedURL string) (*gofeed.Feed, []gofeed.Item, error) {
  241. log.WithField("feed_url", feedURL).Info("Querying feed")
  242. var items []gofeed.Item
  243. feed, err := readFeed(feedURL)
  244. // check for no items in addition to any returned errors as it appears some RSS feeds
  245. // do not consistently return items.
  246. if err == nil && len(feed.Items) == 0 {
  247. err = errors.New("feed has 0 items")
  248. }
  249. if err != nil {
  250. f := s.Feeds[feedURL]
  251. f.IsFailing = true
  252. s.Feeds[feedURL] = f
  253. return nil, items, err
  254. }
  255. // Patch up the item list: make sure each item has a GUID.
  256. ensureItemsHaveGUIDs(feed)
  257. // Work out which items are new, if any (based on the last updated TS we have)
  258. // If the TS is 0 then this is the first ever poll, so let's not send 10s of events
  259. // into the room and just do new ones from this point onwards.
  260. if s.Feeds[feedURL].NextPollTimestampSecs != 0 {
  261. items = s.newItems(feedURL, feed.Items)
  262. }
  263. now := time.Now().Unix() // Second resolution
  264. // Work out when to next poll this feed
  265. nextPollTsSec := now + minPollingIntervalSeconds
  266. if s.Feeds[feedURL].PollIntervalMins > int(minPollingIntervalSeconds/60) {
  267. nextPollTsSec = now + int64(s.Feeds[feedURL].PollIntervalMins*60)
  268. }
  269. // TODO: Handle the 'sy' Syndication extension to control update interval.
  270. // See http://www.feedforall.com/syndication.htm and http://web.resource.org/rss/1.0/modules/syndication/
  271. // Work out which GUIDs to remember. We don't want to remember every GUID ever as that leads to completely
  272. // unbounded growth of data.
  273. f := s.Feeds[feedURL]
  274. // Some RSS feeds can return a very small number of items then bounce
  275. // back to their "normal" size, so we cannot just clobber the recent GUID list per request or else we'll
  276. // forget what we sent and resend it. Instead, we'll keep 2x the max number of items that we've ever
  277. // seen from this feed, up to a max of 10,000.
  278. maxGuids := 2 * len(feed.Items)
  279. if len(f.RecentGUIDs) > maxGuids {
  280. maxGuids = len(f.RecentGUIDs) // already 2x'd.
  281. }
  282. if maxGuids > 10000 {
  283. maxGuids = 10000
  284. }
  285. lastSet := uniqueStrings(f.RecentGUIDs) // e.g. [4,5,6]
  286. thisSet := uniqueGuids(feed.Items) // e.g. [1,2,3]
  287. guids := append(thisSet, lastSet...) // e.g. [1,2,3,4,5,6]
  288. guids = uniqueStrings(guids)
  289. if len(guids) > maxGuids {
  290. // Critically this favours the NEWEST elements, which are the ones we're most likely to see again.
  291. guids = guids[0:maxGuids]
  292. }
  293. // Update the service config to persist the new times
  294. f.NextPollTimestampSecs = nextPollTsSec
  295. f.FeedUpdatedTimestampSecs = now
  296. f.RecentGUIDs = guids
  297. f.IsFailing = false
  298. s.Feeds[feedURL] = f
  299. return feed, items, nil
  300. }
  301. // containsAny takes a string and an array of words and returns whether any of the words
  302. // in the list are contained in the string. The words in the string are considered to be
  303. // separated by any non-alphanumeric character.
  304. func containsAny(item string, filterWords []string) bool {
  305. itemWords := strings.FieldsFunc(item, func(c rune) bool {
  306. return !unicode.IsLetter(c) && !unicode.IsNumber(c)
  307. })
  308. for _, itemWord := range itemWords {
  309. for _, filterWord := range filterWords {
  310. if filterWord == itemWord {
  311. return true
  312. }
  313. }
  314. }
  315. return false
  316. }
  317. func itemFiltered(i *gofeed.Item, mustInclude, mustNotInclude *includeRules) bool {
  318. // At least one word for each field that has been specified must be included for an item to pass the filter.
  319. if (i.Author != nil && len(mustInclude.Author) > 0 && !containsAny(i.Author.Name, mustInclude.Author)) ||
  320. (len(mustInclude.Title) > 0 && !containsAny(i.Title, mustInclude.Title)) ||
  321. (len(mustInclude.Description) > 0 && !containsAny(i.Description, mustInclude.Description)) {
  322. return true
  323. }
  324. // If at least one word of any field that has been specified is included in the item, it doesn't pass the filter.
  325. if (i.Author != nil && containsAny(i.Author.Name, mustNotInclude.Author)) ||
  326. containsAny(i.Title, mustNotInclude.Title) ||
  327. containsAny(i.Description, mustNotInclude.Description) {
  328. return true
  329. }
  330. return false
  331. }
  332. func (s *Service) newItems(feedURL string, allItems []*gofeed.Item) (items []gofeed.Item) {
  333. mustInclude := s.Feeds[feedURL].MustInclude
  334. mustNotInclude := s.Feeds[feedURL].MustNotInclude
  335. for _, i := range allItems {
  336. if i == nil {
  337. continue
  338. }
  339. // if we've seen this guid before, we've sent it before
  340. seenBefore := false
  341. for _, guid := range s.Feeds[feedURL].RecentGUIDs {
  342. if guid == i.GUID {
  343. seenBefore = true
  344. break
  345. }
  346. }
  347. if seenBefore {
  348. continue
  349. }
  350. // Decode HTML for <title> and <description>:
  351. // The RSS 2.0 Spec http://cyber.harvard.edu/rss/rss.html#hrelementsOfLtitemgt supports a bunch
  352. // of weird ways to put HTML into <title> and <description> tags. Not all RSS feed producers run
  353. // these fields through entity encoders (some have ' unencoded, others have it as &#8217;). We'll
  354. // assume that all RSS fields are sending HTML for these fields and run them through a standard decoder.
  355. // This will inevitably break for some people, but that group of people are probably smaller, so *shrug*.
  356. i.Title = html.UnescapeString(i.Title)
  357. i.Description = html.UnescapeString(i.Description)
  358. if i.Author != nil {
  359. i.Author.Name = html.UnescapeString(i.Author.Name)
  360. i.Author.Email = html.UnescapeString(i.Author.Email)
  361. }
  362. if !itemFiltered(i, &mustInclude, &mustNotInclude) {
  363. items = append(items, *i)
  364. }
  365. }
  366. return
  367. }
  368. func (s *Service) sendToRooms(cli types.MatrixClient, feedURL string, feed *gofeed.Feed, item gofeed.Item) error {
  369. logger := log.WithFields(log.Fields{
  370. "feed_url": feedURL,
  371. "title": item.Title,
  372. "guid": item.GUID,
  373. })
  374. logger.Info("Sending new feed item")
  375. for _, roomID := range s.Feeds[feedURL].Rooms {
  376. if _, err := cli.SendMessageEvent(roomID, mevt.EventMessage, itemToHTML(feed, item)); err != nil {
  377. logger.WithError(err).WithField("room_id", roomID).Error("Failed to send to room")
  378. }
  379. }
  380. return nil
  381. }
  382. func itemToHTML(feed *gofeed.Feed, item gofeed.Item) mevt.MessageEventContent {
  383. // If an item does not have a title, try using the feed's title instead
  384. // Create a new variable instead of mutating that which is passed in
  385. itemTitle := item.Title
  386. if itemTitle == "" {
  387. itemTitle = feed.Title
  388. }
  389. fmtBody := fmt.Sprintf("<strong>%s</strong>:<br><a href=\"%s\"><strong>%s</strong></a>",
  390. html.EscapeString(feed.Title), html.EscapeString(item.Link), html.EscapeString(itemTitle))
  391. if item.Author != nil {
  392. if len(item.Author.Name) > 0 && len(item.Author.Email) > 0 {
  393. fmtBody += fmt.Sprintf(" by <a href=\"mailto:%s\">%s</a>", html.EscapeString(item.Author.Email),
  394. html.EscapeString(item.Author.Name))
  395. } else if len(item.Author.Name) > 0 {
  396. fmtBody += fmt.Sprintf(" by %s", html.EscapeString(item.Author.Name))
  397. } else if len(item.Author.Email) > 0 {
  398. fmtBody += fmt.Sprintf(" by <a href=\"mailto:%s\">%s</a>", html.EscapeString(item.Author.Email),
  399. html.EscapeString(item.Author.Email))
  400. }
  401. }
  402. return mevt.MessageEventContent{
  403. Body: fmt.Sprintf("%s: %s ( %s )",
  404. html.EscapeString(feed.Title), html.EscapeString(itemTitle), html.EscapeString(item.Link)),
  405. MsgType: "m.notice",
  406. Format: mevt.FormatHTML,
  407. FormattedBody: fmtBody,
  408. // <strong>FeedTitle</strong>:
  409. // <br>
  410. // <a href="url-of-the-entry"><strong>Title of the Entry</strong></a>
  411. }
  412. }
  413. func ensureItemsHaveGUIDs(feed *gofeed.Feed) {
  414. for idx := 0; idx < len(feed.Items); idx++ {
  415. itm := feed.Items[idx]
  416. if itm.GUID == "" {
  417. if itm.Link != "" {
  418. itm.GUID = itm.Link
  419. } else if itm.Title != "" {
  420. itm.GUID = itm.Title
  421. }
  422. feed.Items[idx] = itm
  423. }
  424. }
  425. }
  426. // uniqueStrings returns a new slice of strings with duplicate elements removed.
  427. // Order is otherwise preserved.
  428. func uniqueStrings(a []string) []string {
  429. ret := []string{}
  430. seen := make(map[string]bool)
  431. for _, str := range a {
  432. if seen[str] {
  433. continue
  434. }
  435. seen[str] = true
  436. ret = append(ret, str)
  437. }
  438. return ret
  439. }
  440. // uniqueGuids returns a new slice of GUID strings with duplicate elements removed.
  441. // Order is otherwise preserved.
  442. func uniqueGuids(a []*gofeed.Item) []string {
  443. ret := []string{}
  444. seen := make(map[string]bool)
  445. for _, item := range a {
  446. if seen[item.GUID] {
  447. continue
  448. }
  449. seen[item.GUID] = true
  450. ret = append(ret, item.GUID)
  451. }
  452. return ret
  453. }
  454. type userAgentRoundTripper struct {
  455. Transport http.RoundTripper
  456. }
  457. func (rt userAgentRoundTripper) RoundTrip(req *http.Request) (*http.Response, error) {
  458. req.Header.Set("User-Agent", "Go-NEB")
  459. return rt.Transport.RoundTrip(req)
  460. }
  461. func readFeed(feedURL string) (*gofeed.Feed, error) {
  462. // Don't use fp.ParseURL because it leaks on non-2xx responses as of 2016/11/29 (cac19c6c27)
  463. fp := gofeed.NewParser()
  464. resp, err := cachingClient.Get(feedURL)
  465. if resp != nil {
  466. defer resp.Body.Close()
  467. }
  468. if err != nil {
  469. return nil, err
  470. }
  471. if resp.StatusCode < 200 || resp.StatusCode >= 300 {
  472. return nil, gofeed.HTTPError{
  473. StatusCode: resp.StatusCode,
  474. Status: resp.Status,
  475. }
  476. }
  477. return fp.Parse(resp.Body)
  478. }
  479. func init() {
  480. lruCache := lrucache.New(1024*1024*20, 0) // 20 MB cache, no max-age
  481. cachingClient = &http.Client{
  482. Transport: userAgentRoundTripper{httpcache.NewTransport(lruCache)},
  483. }
  484. types.RegisterService(func(serviceID string, serviceUserID id.UserID, webhookEndpointURL string) types.Service {
  485. r := &Service{
  486. DefaultService: types.NewDefaultService(serviceID, serviceUserID, ServiceType),
  487. }
  488. return r
  489. })
  490. prometheus.MustRegister(pollCounter)
  491. }