Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Pf/trending keywords #26

Draft
wants to merge 7 commits into
base: pf/display-top-keywords
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 33 additions & 1 deletion newswires/app/controllers/QueryController.scala
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,15 @@ package controllers

import com.gu.pandomainauth.PanDomainAuthSettingsRefresher
import com.gu.permissions.PermissionsProvider
import db.FingerpostWireEntry
import db.{FingerpostWireEntry, KeywordCount, Trending}
import play.api.libs.json.Json
import play.api.libs.ws.WSClient
import play.api.mvc.{Action, AnyContent, BaseController, ControllerComponents}
import play.api.{Configuration, Logging}

import java.time.ZonedDateTime
import scala.concurrent.duration.DurationInt

class QueryController(
val controllerComponents: ControllerComponents,
val configuration: Configuration,
Expand Down Expand Up @@ -35,4 +38,33 @@ class QueryController(
Ok(Json.toJson(results))
}

/** Returns the keywords that are currently trending, as a JSON array of
  * [[KeywordCount]] objects ordered from the highest trending score down.
  *
  * Keyword usages ingested in the last `inLastHours` hours are grouped per
  * keyword, scored with [[Trending.trendingScore]] over one-hour buckets, and
  * the best-scoring keywords are returned together with their usage count in
  * that period.
  *
  * @param inLastHours
  *   how far back (in hours) to look for keyword usages
  */
def trendingKeywords(
    inLastHours: Int = 150
): Action[AnyContent] = AuthAction {
  // A keyword must have been used more than this many times to be scored at
  // all; rarely-used keywords would otherwise produce noisy scores.
  val minimumUsages = 10
  // Upper bound on the number of keywords returned.
  val maxResults = 10

  val now = ZonedDateTime.now()
  val periodStart = now.minusHours(inLastHours)

  // Group once: the per-keyword timestamp list provides both the input to the
  // trending score and (via its size) the usage count we report back.
  val timestampsByKeyword = FingerpostWireEntry
    .getKeywordsWithTimestamps(inLastHours)
    .groupMap(_.keyword)(_.timestamp)

  val trending = timestampsByKeyword.toList
    .collect {
      case (keyword, timestamps) if timestamps.size > minimumUsages =>
        val score =
          Trending.trendingScore(timestamps, 1.hour, periodStart, now)
        (score, KeywordCount(keyword, timestamps.size))
    }
    // Highest score first, so the response keeps the trending rank.
    .sortBy { case (score, _) => -score }
    .take(maxResults)
    .map { case (_, keywordCount) => keywordCount }

  Ok(Json.toJson(trending))
}

}
23 changes: 21 additions & 2 deletions newswires/app/db/FingerpostWireEntry.scala
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,6 @@ object FingerpostWireEntry extends SQLSyntaxSupport[FingerpostWireEntry] {
.fold(sqls"")(inLastHours =>
sqls"WHERE ingested_at > now() - ($inLastHours::text || ' hours')::interval"
)
println(innerWhereClause)
val limitClause = maybeLimit
.map(limit => sqls"LIMIT $limit")
.orElse(maybeInLastHours.map(_ => sqls"LIMIT 10"))
Expand All @@ -117,7 +116,27 @@ object FingerpostWireEntry extends SQLSyntaxSupport[FingerpostWireEntry] {
.map(rs => rs.string("keyword") -> rs.int("count"))
.list()
.apply()
.toMap // TODO would a list be better?
.map { case (keyword, count) => KeywordCount(keyword, count) }
}

/** Loads one [[KeywordUsage]] per (keyword, wire entry) pair for every wire
  * entry ingested within the last `inLastHours` hours.
  *
  * Each element of a row's `content -> 'keywords'` JSON array becomes its own
  * result, paired with that row's `ingested_at` timestamp.
  *
  * NOTE(review): `jsonb_array_elements` yields jsonb values, so string
  * keywords probably come back still wrapped in JSON quote marks - confirm.
  */
def getKeywordsWithTimestamps(inLastHours: Int) =
  DB.readOnly { implicit session =>
    val keywordUsageQuery =
      sql"""| SELECT jsonb_array_elements(content -> 'keywords') as keyword, ingested_at
            | FROM fingerpost_wire_entry
            | WHERE ingested_at > now() - ($inLastHours::text || ' hours')::interval
            | """.stripMargin

    keywordUsageQuery
      .map(row =>
        KeywordUsage(row.string("keyword"), row.zonedDateTime("ingested_at"))
      )
      .list()
      .apply()
  }

}

/** A single use of a keyword, paired with the time it was recorded. */
case class KeywordUsage(keyword: String, timestamp: ZonedDateTime)

/** A keyword together with the number of times it was used. */
case class KeywordCount(keyword: String, count: Int)

object KeywordCount {
// Macro-derived JSON reader/writer, so KeywordCount values can be passed to Json.toJson.
implicit val format: OFormat[KeywordCount] = Json.format[KeywordCount]
}
77 changes: 77 additions & 0 deletions newswires/app/db/Trending.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
package db

import java.time.ZonedDateTime
import java.time.temporal.ChronoUnit
import scala.annotation.tailrec
import scala.concurrent.duration.FiniteDuration
import scala.math.Numeric.Implicits._

/** Helpers for scoring how strongly something is "trending", based on how its
  * usage in the most recent time window compares with its usage in the
  * preceding windows.
  */
object Trending {

  /** Scores how unusual the most recent window of activity is compared with
    * the windows before it.
    *
    * The timestamps are bucketed into consecutive windows of length `window`
    * between `timePeriodStart` and `timePeriodEnd`. The count in the latest
    * bucket is the observation, and the counts in all earlier buckets form the
    * population it is compared against.
    *
    * @param pop
    *   timestamps of every usage in the period (any order)
    * @param window
    *   length of each bucket; must be at least one millisecond
    * @param timePeriodStart
    *   start of the first bucket
    * @param timePeriodEnd
    *   no bucket ends after this instant
    * @return
    *   the z-score of the latest bucket, or `0.0` when there are fewer than
    *   two buckets and so nothing to compare against
    */
  def trendingScore(
      pop: List[ZonedDateTime],
      window: FiniteDuration,
      timePeriodStart: ZonedDateTime,
      timePeriodEnd: ZonedDateTime
  ): Double =
    bucketByTimePeriod(pop, window, timePeriodStart, timePeriodEnd) match {
      // Buckets are returned newest-first, so the head is the latest window.
      case latest :: history if history.nonEmpty => zScore(history, latest)
      case _                                     => 0.0
    }

  /** Number of standard deviations by which `observation` differs from the
    * mean of `pop`.
    *
    * Never returns NaN: an empty population, or an observation equal to the
    * population mean, scores `0.0`. If the population has no variance but the
    * observation differs from it, the result is positive or negative infinity.
    *
    * @param pop
    *   the historical values to compare against
    * @param observation
    *   the value being scored
    */
  def zScore(pop: List[Int], observation: Int): Double =
    if (pop.isEmpty) 0.0
    else {
      val deviation = observation - mean(pop)
      val standardDeviation = stdDev(pop)

      // https://stackoverflow.com/questions/787496/what-is-the-best-way-to-compute-trending-topics-or-tags
      // Guard the 0 / 0 case, which would otherwise yield NaN and make any
      // sort over the scores unreliable.
      if (deviation == 0.0) 0.0 else deviation / standardDeviation
    }

  /** Counts how many timestamps fall into each consecutive bucket of length
    * `window`, starting at `timePeriodStart`.
    *
    * A bucket is only produced if it ends at or before `timePeriodEnd`, so a
    * trailing partial bucket is dropped. Timestamps earlier than
    * `timePeriodStart` are counted in the first bucket.
    *
    * @return
    *   bucket sizes ordered newest bucket first
    * @throws java.lang.IllegalArgumentException
    *   if `window` is shorter than one millisecond (the buckets would never
    *   advance)
    */
  def bucketByTimePeriod(
      pop: List[ZonedDateTime],
      window: FiniteDuration,
      timePeriodStart: ZonedDateTime,
      timePeriodEnd: ZonedDateTime
  ): List[Int] = {
    val windowMillis = window.toMillis
    require(windowMillis > 0, "window must be at least one millisecond")

    val sortedPop = pop.sorted

    @tailrec
    def buckets(
        bucketsSoFar: List[Int],
        bucketEndTime: ZonedDateTime,
        remainingPopulation: List[ZonedDateTime]
    ): List[Int] =
      if (bucketEndTime isAfter timePeriodEnd) {
        bucketsSoFar
      } else {
        // The population is sorted, so everything belonging to this bucket
        // sits at the front of the remaining list.
        val (bucket, remaining) =
          remainingPopulation.span(_ isBefore bucketEndTime)
        buckets(
          bucket.size :: bucketsSoFar,
          bucketEndTime.plus(windowMillis, ChronoUnit.MILLIS),
          remaining
        )
      }

    buckets(
      List(),
      timePeriodStart.plus(windowMillis, ChronoUnit.MILLIS),
      sortedPop
    )
  }

  /** Arithmetic mean; NaN for an empty collection. */
  private def mean[T: Numeric](xs: Iterable[T]): Double =
    xs.sum.toDouble / xs.size

  // https://gist.github.com/navicore/7973711f300f00f9d878026eaf84bed2
  /** Population variance (divides by the number of elements). */
  private def variance[T: Numeric](xs: Iterable[T]): Double = {
    val avg = mean(xs)

    xs.map(_.toDouble).map(a => math.pow(a - avg, 2)).sum / xs.size
  }

  /** Population standard deviation. */
  private def stdDev[T: Numeric](xs: Iterable[T]): Double =
    math.sqrt(variance(xs))

}
12 changes: 11 additions & 1 deletion newswires/client/package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion newswires/client/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@
"moment": "2.30.1",
"react": "^18.3.1",
"react-dom": "^18.3.1",
"sanitize-html": "^2.13.0"
"sanitize-html": "^2.13.0",
"zod": "3.23.8"
},
"devDependencies": {
"@eslint/js": "^9.9.0",
Expand Down
18 changes: 5 additions & 13 deletions newswires/client/src/App.tsx
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import {
EuiEmptyPrompt,
EuiHeader,
EuiHeaderSectionItem,
EuiPageTemplate,
Expand All @@ -8,6 +7,7 @@ import {
} from '@elastic/eui';
import '@elastic/eui/dist/eui_theme_light.css';
import { Feed } from './Feed';
import { Home } from './Home';
import { SearchBox } from './SearchBox';
import { useHistory } from './urlState';

Expand All @@ -24,7 +24,9 @@ export function App() {
<EuiHeader position="fixed">
<EuiHeaderSectionItem>
<EuiTitle size={'s'}>
<h1>Newswires</h1>
<a href="/">
<h1>Newswires</h1>
</a>
</EuiTitle>
</EuiHeaderSectionItem>
{currentState.location !== '' && (
Expand All @@ -40,17 +42,7 @@ export function App() {
{currentState.location === 'feed' && (
<Feed searchQuery={currentState.params?.q ?? ''} />
)}
{currentState.location === '' && (
<EuiEmptyPrompt
title={<h2>Search wires</h2>}
body={
<SearchBox
initialQuery={currentState.params?.q ?? ''}
update={updateQuery}
/>
}
/>
)}
{currentState.location === '' && <Home updateQuery={updateQuery} />}
</EuiPageTemplate>
</EuiProvider>
);
Expand Down
111 changes: 111 additions & 0 deletions newswires/client/src/Home.tsx
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
import {
EuiBadge,
EuiButton,
EuiEmptyPrompt,
EuiFlexGroup,
EuiFlexItem,
EuiListGroup,
} from '@elastic/eui';
import { Fragment, useEffect, useMemo, useState } from 'react';

Check warning on line 9 in newswires/client/src/Home.tsx

View workflow job for this annotation

GitHub Actions / Build and upload to riffraff

'Fragment' is defined but never used. Allowed unused vars must match /^_/u
import { SearchBox } from './SearchBox';
import type { KeywordCounts } from './sharedTypes';
import { KeywordCountsSchema } from './sharedTypes';
import { useHistory } from './urlState';

/**
 * Landing page: a search box plus, once they have loaded, lists of the top
 * keywords and the trending keywords.
 *
 * @param updateQuery - callback handed to the search box; invoked with the new query text
 */
export function Home({
	updateQuery,
}: {
	updateQuery: (newQuery: string) => void;
}) {
	const { currentState } = useHistory();
	const [keywords, setKeywords] = useState<KeywordCounts>([]);
	const [trendingKeywords, setTrendingKeywords] = useState<KeywordCounts>([]);

	useEffect(() => {
		// Fetches a keyword-count list from `url`, validates it, and hands it to
		// `store`. `label` only appears in the error messages.
		const loadKeywordCounts = (
			url: string,
			label: string,
			store: (counts: KeywordCounts) => void,
		) => {
			fetch(url)
				.then((response) => {
					if (!response.ok) {
						// Report a failed request as a fetch error rather than letting
						// `undefined` fall through and surface as a parse error.
						throw new Error(`Unexpected response status ${response.status}`);
					}
					return response.json() as Promise<unknown>;
				})
				.then((data) => {
					const maybeKeywords = KeywordCountsSchema.safeParse(data);
					if (maybeKeywords.success) {
						store(maybeKeywords.data);
					} else {
						console.error(`Error parsing ${label}:`, maybeKeywords.error);
					}
				})
				.catch((error) => {
					console.error(`Error fetching ${label}:`, error);
				});
		};

		loadKeywordCounts('api/keywords?limit=5', 'keywords', setKeywords);
		loadKeywordCounts(
			'api/keywords/trending?limit=5',
			'trending keywords',
			setTrendingKeywords,
		);
		// Empty dependency list: load once on mount. Without it the effect runs
		// after every render, and because each response updates state (which
		// re-renders) the requests would repeat endlessly.
	}, []);

	const body = (
		<EuiFlexGroup direction="column" justifyContent="center">
			<SearchBox
				initialQuery={currentState.params?.q ?? ''}
				update={updateQuery}
			/>
			<EuiFlexGroup>
				{keywords.length > 0 && (
					<EuiFlexItem>
						<h3>Top keywords</h3>
						<KeywordsList keywords={keywords} />
					</EuiFlexItem>
				)}
				{trendingKeywords.length > 0 && (
					<EuiFlexItem>
						<h3>Trending keywords</h3>
						<KeywordsList keywords={trendingKeywords} />
					</EuiFlexItem>
				)}
			</EuiFlexGroup>
		</EuiFlexGroup>
	);

	return <EuiEmptyPrompt title={<h2>Search wires</h2>} body={body} />;
}

/**
 * Renders keywords as a list of links to the feed, highest count first, each
 * with its usage count shown in a badge.
 */
const KeywordsList = ({ keywords }: { keywords: KeywordCounts }) => {
	const sortedKeywords = useMemo(() => {
		// `Array.prototype.sort` sorts in place; copy first so the array owned by
		// the parent's state is never mutated.
		return [...keywords].sort((a, b) => b.count - a.count);
	}, [keywords]);

	return (
		<EuiFlexGroup>
			<EuiListGroup flush={true}>
				{sortedKeywords.map(({ keyword, count }) => (
					// TODO: fix query when keyword support added to search:
					// - specify as keyword rather than 'q'
					// - handle quote marks if still present in response
					<EuiButton
						key={keyword}
						color="text"
						// Encode so characters such as '&', '#' or '+' in a keyword cannot
						// break or truncate the query string.
						href={`/feed?q=${encodeURIComponent(keyword)}`}
					>
						{keyword.replaceAll('"', '')}{' '}
						<EuiBadge color={'subdued'}>{count}</EuiBadge>
					</EuiButton>
				))}
			</EuiListGroup>
		</EuiFlexGroup>
	);
};
2 changes: 1 addition & 1 deletion newswires/client/src/WiresCards.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ export const WireCardList = ({ wires }: { wires: WireData[] }) => {
flyout = (
<EuiFlyout
type="push"
size="s"
size="m"
onClose={() => setIsFlyoutVisible(false)}
aria-labelledby={pushedFlyoutTitleId}
>
Expand Down
Loading