/** Responds with the keywords whose most recent usage is most unusual, with their usage counts.
  *
  * Loads every keyword usage ingested in the last `inLastHours` hours, scores each keyword
  * that was used more than ten times with [[Trending.trendingScore]] over one-hour buckets,
  * and returns a JSON array of [[KeywordCount]] for the ten highest-scoring keywords.
  * The array is ordered by score, highest first; each count is the number of usages of that
  * keyword inside the look-back window.
  *
  * @param inLastHours
  *   how many hours back from now to look for keyword usages
  */
def trendingKeywords(
    inLastHours: Int = 150
): Action[AnyContent] = AuthAction {
  val now = ZonedDateTime.now()
  val windowStart = now.minusHours(inLastHours)

  // keyword -> every timestamp at which it was used inside the window
  val timestampsByKeyword = FingerpostWireEntry
    .getKeywordsWithTimestamps(inLastHours)
    .groupMap(_.keyword)(_.timestamp)

  // Score and count in a single pass, so the usages are not scanned a second time
  // to work out the counts of the winners.
  val scoredKeywords = timestampsByKeyword.toList.collect {
    // keywords used ten times or fewer are not considered
    case (keyword, timestamps) if timestamps.size > 10 =>
      val score = Trending.trendingScore(timestamps, 1.hour, windowStart, now)
      KeywordCount(keyword, timestamps.size) -> score
  }

  // Build the response from the ranked list itself: going back through a Map here
  // would discard the ordering that was just computed.
  val topKeywordUsageCounts = scoredKeywords
    .sortBy { case (_, score) => -score }
    .take(10)
    .map { case (keywordCount, _) => keywordCount }

  Ok(Json.toJson(topKeywordUsageCounts))
}
/** Scores how unusual the latest activity in a series of timestamps is, compared with the
  * earlier activity in the same series.
  */
object Trending {

  /** Z-score of the most recent bucket of `pop` against all earlier buckets.
    *
    * The timestamps are counted into consecutive `window`-sized buckets (see
    * [[bucketByTimePeriod]]). The newest bucket is the observation and the remaining, older
    * buckets form the population it is compared with.
    *
    * @param pop
    *   timestamps of the events to score, in any order
    * @param window
    *   width of each bucket; must be positive
    * @param timePeriodStart
    *   start of the first bucket
    * @param timePeriodEnd
    *   latest instant at which a bucket may end
    * @return
    *   the z-score of the newest bucket, or 0.0 when there are fewer than two buckets and
    *   therefore no history to compare against
    */
  def trendingScore(
      pop: List[ZonedDateTime],
      window: FiniteDuration,
      timePeriodStart: ZonedDateTime,
      timePeriodEnd: ZonedDateTime
  ): Double =
    // bucketByTimePeriod returns the newest bucket first
    bucketByTimePeriod(pop, window, timePeriodStart, timePeriodEnd) match {
      case latest :: history if history.nonEmpty => zScore(history, latest)
      case _                                     => 0.0
    }

  /** Standard score of `observation` relative to `pop`: (observation - mean) / standard
    * deviation, using the population (divide-by-n) standard deviation.
    *
    * See
    * https://stackoverflow.com/questions/787496/what-is-the-best-way-to-compute-trending-topics-or-tags
    *
    * @return
    *   0.0 when `pop` is empty or when `observation` equals the mean (both would otherwise be
    *   NaN for a zero-variance population); positive or negative infinity when the
    *   population has zero variance and the observation differs from the mean
    */
  def zScore(pop: List[Int], observation: Int): Double =
    if (pop.isEmpty) 0.0
    else {
      val deviation = observation - mean(pop)
      // Guard the 0.0 / 0.0 case, which is NaN and cannot be ranked meaningfully.
      if (deviation == 0.0) 0.0 else deviation / stdDev(pop)
    }

  /** Counts the timestamps that fall into each consecutive `window`-sized bucket between
    * `timePeriodStart` and `timePeriodEnd`.
    *
    * A timestamp belongs to the first bucket whose end is strictly after it, so timestamps
    * earlier than `timePeriodStart` are counted in the first bucket. Only buckets that end at
    * or before `timePeriodEnd` are produced; timestamps at or after the end of the last such
    * bucket are not counted.
    *
    * @param pop
    *   timestamps to count, in any order
    * @param window
    *   width of each bucket; must be positive
    * @return
    *   one count per bucket, newest bucket first
    * @throws IllegalArgumentException
    *   if `window` is not positive (the bucket end would never advance)
    */
  def bucketByTimePeriod(
      pop: List[ZonedDateTime],
      window: FiniteDuration,
      timePeriodStart: ZonedDateTime,
      timePeriodEnd: ZonedDateTime
  ): List[Int] = {
    // Nanosecond resolution, so that sub-millisecond windows do not truncate to a zero step.
    val windowNanos = window.toNanos
    require(windowNanos > 0, s"window must be positive, was $window")

    @tailrec
    def buckets(
        bucketsSoFar: List[Int],
        bucketEndTime: ZonedDateTime,
        remainingPopulation: List[ZonedDateTime]
    ): List[Int] =
      if (bucketEndTime isAfter timePeriodEnd) bucketsSoFar
      else {
        val (bucket, remaining) =
          remainingPopulation.span(_ isBefore bucketEndTime)
        buckets(
          // prepending is what makes the result newest-first
          bucket.size :: bucketsSoFar,
          bucketEndTime.plus(windowNanos, ChronoUnit.NANOS),
          remaining
        )
      }

    buckets(
      Nil,
      timePeriodStart.plus(windowNanos, ChronoUnit.NANOS),
      // sorted by instant, the same comparison the bucketing uses
      pop.sortWith(_ isBefore _)
    )
  }

  /** Arithmetic mean; callers must pass a non-empty list. */
  private def mean(xs: List[Int]): Double =
    xs.sum.toDouble / xs.size

  /** Population variance (divides by n, not n - 1). */
  private def variance(xs: List[Int]): Double = {
    val avg = mean(xs)
    xs.map(x => math.pow(x - avg, 2)).sum / xs.size
  }

  /** Population standard deviation. */
  private def stdDev(xs: List[Int]): Double =
    math.sqrt(variance(xs))

}
"sha512-XBx9AXhXktjUqnepgTiE5flcKIYWi/rme0Eaj+5Y0lftuGBq+jyRu/md4WnuxqgP1ubdpNCsYEYPxrzVHD8d6g==", + "license": "MIT", + "funding": { + "url": "https://github.com/sponsors/colinhacks" + } + }, "node_modules/zwitch": { "version": "1.0.5", "resolved": "https://registry.npmjs.org/zwitch/-/zwitch-1.0.5.tgz", diff --git a/newswires/client/package.json b/newswires/client/package.json index 79da4a51..bf855838 100644 --- a/newswires/client/package.json +++ b/newswires/client/package.json @@ -21,7 +21,8 @@ "moment": "2.30.1", "react": "^18.3.1", "react-dom": "^18.3.1", - "sanitize-html": "^2.13.0" + "sanitize-html": "^2.13.0", + "zod": "3.23.8" }, "devDependencies": { "@eslint/js": "^9.9.0", diff --git a/newswires/client/src/App.tsx b/newswires/client/src/App.tsx index feedb12c..10e437ab 100644 --- a/newswires/client/src/App.tsx +++ b/newswires/client/src/App.tsx @@ -1,5 +1,4 @@ import { - EuiEmptyPrompt, EuiHeader, EuiHeaderSectionItem, EuiPageTemplate, @@ -8,6 +7,7 @@ import { } from '@elastic/eui'; import '@elastic/eui/dist/eui_theme_light.css'; import { Feed } from './Feed'; +import { Home } from './Home'; import { SearchBox } from './SearchBox'; import { useHistory } from './urlState'; @@ -24,7 +24,9 @@ export function App() { -

Newswires

+ +

Newswires

+
{currentState.location !== '' && ( @@ -40,17 +42,7 @@ export function App() { {currentState.location === 'feed' && ( )} - {currentState.location === '' && ( - Search wires} - body={ - - } - /> - )} + {currentState.location === '' && } ); diff --git a/newswires/client/src/Home.tsx b/newswires/client/src/Home.tsx new file mode 100644 index 00000000..f773b1a8 --- /dev/null +++ b/newswires/client/src/Home.tsx @@ -0,0 +1,111 @@ +import { + EuiBadge, + EuiButton, + EuiEmptyPrompt, + EuiFlexGroup, + EuiFlexItem, + EuiListGroup, +} from '@elastic/eui'; +import { Fragment, useEffect, useMemo, useState } from 'react'; +import { SearchBox } from './SearchBox'; +import type { KeywordCounts } from './sharedTypes'; +import { KeywordCountsSchema } from './sharedTypes'; +import { useHistory } from './urlState'; + +export function Home({ + updateQuery, +}: { + updateQuery: (newQuery: string) => void; +}) { + const { currentState } = useHistory(); + const [keywords, setKeywords] = useState([]); + const [trendingKeywords, setTrendingKeywords] = useState([]); + + useEffect(() => { + fetch('api/keywords?limit=5') + .then((response) => { + if (response.ok) { + return response.json(); + } + }) + .then((data) => { + const maybeKeywords = KeywordCountsSchema.safeParse(data); + if (maybeKeywords.success) { + setKeywords(maybeKeywords.data); + } else { + console.error('Error parsing keywords:', maybeKeywords.error); + } + }) + .catch((error) => { + console.error('Error fetching keywords:', error); + }); + fetch('api/keywords/trending?limit=5') + .then((response) => { + if (response.ok) { + return response.json(); + } + }) + .then((data) => { + const maybeKeywords = KeywordCountsSchema.safeParse(data); + if (maybeKeywords.success) { + setTrendingKeywords(maybeKeywords.data); + } else { + console.error( + 'Error parsing trending keywords:', + maybeKeywords.error, + ); + } + }) + .catch((error) => { + console.error('Error fetching trending keywords:', error); + }); + }); + + const body = ( + + 
+ + {keywords.length > 0 && ( + +

Top keywords

+ +
+ )} + {trendingKeywords.length > 0 && ( + +

Trending keywords

+ +
+ )} +
+
+ ); + + return Search wires} body={body} />; +} + +const KeywordsList = ({ keywords }: { keywords: KeywordCounts }) => { + const sortedKeywords = useMemo(() => { + return keywords.sort((a, b) => b.count - a.count); + }, [keywords]); + + return ( + + + {sortedKeywords.map(({ keyword, count }) => ( + // TODO: fix query when keyword support added to search: + // - specify as keyword rather than 'q' + // - make sure keyword text is properly encoded + // - handle quote marks if still present in response + + {keyword.replaceAll('"', '')}{' '} + {count} + + ))} + + + ); +}; diff --git a/newswires/client/src/WiresCards.tsx b/newswires/client/src/WiresCards.tsx index a71569b7..1af6082a 100644 --- a/newswires/client/src/WiresCards.tsx +++ b/newswires/client/src/WiresCards.tsx @@ -84,7 +84,7 @@ export const WireCardList = ({ wires }: { wires: WireData[] }) => { flyout = ( setIsFlyoutVisible(false)} aria-labelledby={pushedFlyoutTitleId} > diff --git a/newswires/client/src/sharedTypes.ts b/newswires/client/src/sharedTypes.ts index 9cdadde6..a960807f 100644 --- a/newswires/client/src/sharedTypes.ts +++ b/newswires/client/src/sharedTypes.ts @@ -1,20 +1,35 @@ -export type WireData = { - id: number; - externalId: string; - ingestedAt: string; - content: Partial<{ - uri: string; - usn: string; - version: string; - firstVersion: string; // date - versionCreated: string; // date - dateTimeSent: string; //date - headline: string; - subhead: string; - byline: string; - keywords: string[]; - usage: string; - location: string; - body_text: string; - }>; -}; +import { z } from 'zod'; + +const FingerpostContentSchema = z.object({ + uri: z.string(), + usn: z.string(), + version: z.string(), + firstVersion: z.string(), + versionCreated: z.string(), + dateTimeSent: z.string(), + headline: z.string(), + subhead: z.string(), + byline: z.string(), + keywords: z.array(z.string()), + usage: z.string(), + location: z.string(), + body_text: z.string(), +}); + +export const WireDataSchema = 
/** Unit tests for [[Trending.zScore]] and [[Trending.bucketByTimePeriod]].
  *
  * Floating point results are compared with `+-`, whose right-hand side is the tolerance
  * allowed for floating point error.
  */
class TrendingSpec extends AnyFlatSpec with Matchers {

  private def time(iso: String): ZonedDateTime = ZonedDateTime.parse(iso)

  /** Four timestamps, deliberately out of order: two at :00, one at :01, one at :03. */
  private val unsortedTimes = List(
    time("2021-01-01T00:00:01Z"),
    time("2021-01-01T00:00:00Z"),
    time("2021-01-01T00:00:03Z"),
    time("2021-01-01T00:00:00Z")
  )

  "Trending" should "calculate the z-score" in {
    Trending.zScore(List(1, 2, 3, 4, 5), 3) shouldBe 0.0

    Trending.zScore(
      List(21, 22, 19, 18, 17, 22, 20, 20),
      20
    ) shouldBe 0.0739221270955 +- 0.0000000000001

    Trending.zScore(
      List(21, 22, 19, 18, 17, 22, 20, 20, 1, 2, 3, 1, 2, 1, 0, 1),
      2
    ) shouldBe -0.922793112954 +- 0.000000000001
  }

  "bucketByTimePeriod" should "return a list of the numbers of entries in each bucket" in {
    val times = List(
      time("2021-01-01T00:00:00Z"),
      time("2021-01-01T00:00:00Z"),
      time("2021-01-01T00:00:01Z"),
      time("2021-01-01T00:00:03Z")
    )

    // counts are listed newest bucket first
    Trending.bucketByTimePeriod(
      times,
      window = 1.second,
      timePeriodStart = time("2021-01-01T00:00:00Z"),
      timePeriodEnd = time("2021-01-01T00:00:04Z")
    ) shouldBe List(1, 0, 1, 2)
  }

  it should "be able to handle unsorted input" in {
    Trending.bucketByTimePeriod(
      unsortedTimes,
      window = 1.second,
      timePeriodStart = time("2021-01-01T00:00:00Z"),
      timePeriodEnd = time("2021-01-01T00:00:04Z")
    ) shouldBe List(1, 0, 1, 2)
  }

  it should "allow for empty buckets at either end as well as in the middle" in {
    Trending.bucketByTimePeriod(
      unsortedTimes,
      window = 1.second,
      timePeriodStart = time("2020-12-31T23:59:59Z"),
      timePeriodEnd = time("2021-01-01T00:00:05Z")
    ) shouldBe List(0, 1, 0, 1, 2, 0)
  }

}