From c0f73dbdf9595f1a6f52088e3ff48ac44023b4c6 Mon Sep 17 00:00:00 2001 From: Steven Crader Date: Fri, 9 Jun 2023 00:06:44 -0700 Subject: [PATCH] Add support to remove extra spaces when combining segment body Resolves #22 --- package.json | 2 +- src/segments.ts | 18 +++++++++++++++-- test/json.test.ts | 13 ++++++++++-- .../podnews_weekly_review_2023-06-02.json | 1 + ..._weekly_review_2023-06-02_json_parsed.json | 20 +++++++++++++++++++ test/test_utils.ts | 2 ++ 6 files changed, 51 insertions(+), 5 deletions(-) create mode 100644 test/test_files/podnews_weekly_review_2023-06-02.json create mode 100644 test/test_files/podnews_weekly_review_2023-06-02_json_parsed.json diff --git a/package.json b/package.json index 62dc61b..8632dfa 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "transcriptator", - "version": "1.1.0", + "version": "1.1.2+beta.0", "packageManager": "yarn@3.4.1", "description": "Library for converting the various transcript file formats to a common format.", "main": "index.ts", diff --git a/src/segments.ts b/src/segments.ts index f323109..f5940ad 100644 --- a/src/segments.ts +++ b/src/segments.ts @@ -6,6 +6,20 @@ import { DEFAULT_COMBINE_SEGMENTS_LENGTH, Segment } from "./types" * Regular Expression for detecting punctuation that should not be prefixed with a space */ const PATTERN_PUNCTUATIONS = /^ *[.,?!}\]>) *$]/ +/** + * Regular Expression for detecting space characters at the end of a string + */ +const PATTERN_TRAILING_SPACE = / +$/ + +/** + * Remove any trailing space characters from data + * + * @param data text to trim + * @returns text with any trailing space character removed + */ +const trimEndSpace = (data: string): string => { + return data.replace(PATTERN_TRAILING_SPACE, "") +} /** * Append `addition` to `body` with the character(s) specified. 
@@ -23,9 +37,9 @@ const joinBody = (body: string, addition: string, separator: string = undefined) if (PATTERN_PUNCTUATIONS.exec(addition)) { separatorToUse = "" } - return `${body}${separatorToUse}${addition}` + return `${trimEndSpace(body)}${separatorToUse}${trimEndSpace(addition)}` } - return addition + return trimEndSpace(addition) } /** diff --git a/test/json.test.ts b/test/json.test.ts index b36821c..cd419de 100644 --- a/test/json.test.ts +++ b/test/json.test.ts @@ -3,7 +3,7 @@ import { describe, expect, test } from "@jest/globals" import { IOptions, Options, Segment } from "../src" import { parseJSON } from "../src/formats/json" -import { readFile, TestFiles } from "./test_utils" +import { readFile, saveSegmentsToFile, TestFiles } from "./test_utils" describe("JSON formats test", () => { test.each<{ @@ -190,11 +190,20 @@ describe("Parse JSON file data", () => { }, id: "Podnews Weekly Review 2023-05-05, combine speaker", }, - ])("Parse JSON File ($id)", ({ filePath, expectedFilePath, options }) => { + { + filePath: TestFiles.TRANSCRIPT_JSON_PODNEWS_WEEKLY_REVIEW_2023_06_02, + expectedFilePath: TestFiles.TRANSCRIPT_JSON_PODNEWS_WEEKLY_REVIEW_2023_06_02_OUTPUT, + options: { + combineSpeaker: true, + }, + id: "Podnews Weekly Review 2023-06-02, extra space", + }, + ])("Parse JSON File ($id)", ({ filePath, expectedFilePath, options, id }) => { const data = readFile(filePath) const expectedJSONData = JSON.parse(readFile(expectedFilePath)) Options.setOptions(options) const segments = parseJSON(data) + saveSegmentsToFile(segments, `out_json_${id}.json`) expect(segments).toEqual(expectedJSONData.segments) }) }) diff --git a/test/test_files/podnews_weekly_review_2023-06-02.json b/test/test_files/podnews_weekly_review_2023-06-02.json new file mode 100644 index 0000000..98d0aa5 --- /dev/null +++ b/test/test_files/podnews_weekly_review_2023-06-02.json @@ -0,0 +1 @@ +{"version":"1.0.0","segments":[{"speaker":"Speaker 
1","startTime":0.221,"endTime":0.361,"body":"It's"},{"speaker":"Speaker 1","startTime":0.361,"endTime":0.361,"body":" "},{"speaker":"Speaker 1","startTime":0.462,"endTime":0.883,"body":"Friday"},{"speaker":"Speaker 1","startTime":0.883,"endTime":0.883,"body":","},{"speaker":"Speaker 1","startTime":0.883,"endTime":0.883,"body":" "},{"speaker":"Speaker 1","startTime":0.963,"endTime":1.104,"body":"the"},{"speaker":"Speaker 1","startTime":1.104,"endTime":1.104,"body":" "},{"speaker":"Speaker 1","startTime":1.264,"endTime":1.605,"body":"2nd"},{"speaker":"Speaker 1","startTime":1.605,"endTime":1.605,"body":" "},{"speaker":"Speaker 1","startTime":1.625,"endTime":1.665,"body":"of"},{"speaker":"Speaker 1","startTime":1.665,"endTime":1.665,"body":" "},{"speaker":"Speaker 1","startTime":1.726,"endTime":1.987,"body":"June"},{"speaker":"Speaker 1","startTime":1.987,"endTime":1.987,"body":" "},{"speaker":"Speaker 1","startTime":2.007,"endTime":3.09,"body":"2023"},{"speaker":"Speaker 1","startTime":3.09,"endTime":3.09,"body":"."},{"speaker":"Speaker 1","startTime":3.09,"endTime":3.09,"body":" "},{"speaker":"Speaker 2","startTime":4.34,"endTime":4.44,"body":"The"},{"speaker":"Speaker 2","startTime":4.44,"endTime":4.44,"body":" "},{"speaker":"Speaker 2","startTime":4.521,"endTime":4.881,"body":"last"},{"speaker":"Speaker 2","startTime":4.881,"endTime":4.881,"body":" "},{"speaker":"Speaker 2","startTime":4.961,"endTime":5.262,"body":"word"},{"speaker":"Speaker 2","startTime":5.262,"endTime":5.262,"body":" "},{"speaker":"Speaker 2","startTime":5.382,"endTime":5.462,"body":"in"},{"speaker":"Speaker 2","startTime":5.462,"endTime":5.462,"body":" "},{"speaker":"Speaker 2","startTime":5.542,"endTime":6.224,"body":"podcasting"},{"speaker":"Speaker 2","startTime":6.224,"endTime":6.224,"body":" "},{"speaker":"Speaker 2","startTime":6.264,"endTime":6.604,"body":"news"},{"speaker":"Speaker 2","startTime":6.604,"endTime":6.604,"body":"."},{"speaker":"Speaker 
2","startTime":6.604,"endTime":6.604,"body":" "},{"speaker":"Speaker 2","startTime":7.186,"endTime":7.506,"body":"This"},{"speaker":"Speaker 2","startTime":7.506,"endTime":7.506,"body":" "},{"speaker":"Speaker 2","startTime":7.887,"endTime":7.967,"body":"is"},{"speaker":"Speaker 2","startTime":7.967,"endTime":7.967,"body":" "},{"speaker":"Speaker 2","startTime":8.027,"endTime":8.087,"body":"the"},{"speaker":"Speaker 2","startTime":8.087,"endTime":8.087,"body":" "},{"speaker":"Speaker 2","startTime":8.147,"endTime":8.408,"body":"Pod"},{"speaker":"Speaker 2","startTime":8.408,"endTime":8.408,"body":" "},{"speaker":"Speaker 2","startTime":8.468,"endTime":8.729,"body":"News"},{"speaker":"Speaker 2","startTime":8.729,"endTime":8.729,"body":" "}]} \ No newline at end of file diff --git a/test/test_files/podnews_weekly_review_2023-06-02_json_parsed.json b/test/test_files/podnews_weekly_review_2023-06-02_json_parsed.json new file mode 100644 index 0000000..e450adb --- /dev/null +++ b/test/test_files/podnews_weekly_review_2023-06-02_json_parsed.json @@ -0,0 +1,20 @@ +{ + "segments": [ + { + "startTime": 0.221, + "startTimeFormatted": "00:00:00.221", + "endTime": 3.09, + "endTimeFormatted": "00:00:03.090", + "speaker": "Speaker 1", + "body": "It's Friday, the 2nd of June 2023." + }, + { + "startTime": 4.34, + "startTimeFormatted": "00:00:04.340", + "endTime": 8.729, + "endTimeFormatted": "00:00:08.729", + "speaker": "Speaker 2", + "body": "The last word in podcasting news. 
This is the Pod News" + } + ] +} diff --git a/test/test_utils.ts b/test/test_utils.ts index b16cfb0..a30d61b 100644 --- a/test/test_utils.ts +++ b/test/test_utils.ts @@ -11,6 +11,8 @@ export enum TestFiles { TRANSCRIPT_JSON_PODNEWS_WEEKLY_REVIEW_2023_05_05 = "podnews_weekly_review_2023-05-05.json", TRANSCRIPT_JSON_PODNEWS_WEEKLY_REVIEW_2023_05_05_OUTPUT = "podnews_weekly_review_2023-05-05_json_parsed.json", TRANSCRIPT_JSON_PODNEWS_WEEKLY_REVIEW_2023_05_05_COMBINE_SPEAKER_OUTPUT = "podnews_weekly_review_2023-05-05_combine_speaker_json_parsed.json", + TRANSCRIPT_JSON_PODNEWS_WEEKLY_REVIEW_2023_06_02 = "podnews_weekly_review_2023-06-02.json", + TRANSCRIPT_JSON_PODNEWS_WEEKLY_REVIEW_2023_06_02_OUTPUT = "podnews_weekly_review_2023-06-02_json_parsed.json", TRANSCRIPT_JSON_BUZZCAST = "buzzcast.json", TRANSCRIPT_JSON_BUZZCAST_OUTPUT = "buzzcast_json_parsed.json", TRANSCRIPT_JSON_BUZZCAST_COMBINE_EQUAL_TIME_OUTPUT = "buzzcast_json_combine_equal_time_parsed.json",