Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Moving complex segmenter logic into complex module #3349

Merged
merged 5 commits into from
Apr 20, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ use icu_collections::char16trie::{Char16Trie, TrieResult};
use icu_provider::prelude::*;

/// A trait for dictionary based iterator
pub trait DictionaryType<'l, 's> {
trait DictionaryType<'l, 's> {
/// The iterator over characters.
type IterAttr: Iterator<Item = (usize, Self::CharType)> + Clone;

Expand All @@ -21,7 +21,7 @@ pub trait DictionaryType<'l, 's> {
fn char_len(c: Self::CharType) -> usize;
}

pub struct DictionaryBreakIterator<
struct DictionaryBreakIterator<
'l,
's,
Y: DictionaryType<'l, 's> + ?Sized,
Expand Down Expand Up @@ -137,13 +137,13 @@ impl<'l, 's> DictionaryType<'l, 's> for char {
}
}

pub(crate) struct DictionarySegmenter<'l> {
pub(super) struct DictionarySegmenter<'l> {
dict: &'l UCharDictionaryBreakDataV1<'l>,
grapheme: &'l RuleBreakDataV1<'l>,
}

impl<'l> DictionarySegmenter<'l> {
pub fn new(
pub(super) fn new(
dict: &'l DataPayload<UCharDictionaryBreakDataV1Marker>,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit (optional): Can you use this PR to address my unresolved comment from #3348

Nit: You should almost never have a

&'a DataPayload<FooV1Marker>

you should instead have a

&'a FooV1<'a>

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think this is the place

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually in this instance it really complicates things. Instead of being able to .map(Result::as_ref) I'd have to have a map that unpacks both values, and have that be conditional on the LSTM being the never type.

grapheme: &'l DataPayload<GraphemeClusterBreakDataV1Marker>,
) -> Self {
Expand All @@ -155,12 +155,9 @@ impl<'l> DictionarySegmenter<'l> {
}

/// Create a dictionary based break iterator for an `str` (a UTF-8 string).
pub fn segment_str<'s>(
&'s self,
input: &'s str,
) -> DictionaryBreakIterator<'l, 's, char, GraphemeClusterBreakIteratorUtf8> {
pub(super) fn segment_str(&'l self, input: &'l str) -> impl Iterator<Item = usize> + 'l {
let grapheme_iter = GraphemeClusterSegmenter::new_and_segment_str(input, self.grapheme);
DictionaryBreakIterator {
DictionaryBreakIterator::<char, GraphemeClusterBreakIteratorUtf8> {
trie: Char16Trie::new(self.dict.trie_data.clone()),
iter: input.char_indices(),
len: input.len(),
Expand All @@ -169,12 +166,9 @@ impl<'l> DictionarySegmenter<'l> {
}

/// Create a dictionary based break iterator for a UTF-16 string.
pub fn segment_utf16<'s>(
&'s self,
input: &'s [u16],
) -> DictionaryBreakIterator<'l, 's, u32, GraphemeClusterBreakIteratorUtf16> {
pub(super) fn segment_utf16(&'l self, input: &'l [u16]) -> impl Iterator<Item = usize> + 'l {
let grapheme_iter = GraphemeClusterSegmenter::new_and_segment_utf16(input, self.grapheme);
DictionaryBreakIterator {
DictionaryBreakIterator::<u32, GraphemeClusterBreakIteratorUtf16> {
trie: Char16Trie::new(self.dict.trie_data.clone()),
iter: Utf16Indices::new(input),
len: input.len(),
Expand All @@ -186,11 +180,8 @@ impl<'l> DictionarySegmenter<'l> {
#[cfg(test)]
#[cfg(feature = "serde")]
mod tests {
use crate::{
dictionary::DictionarySegmenter, provider::DictionaryForWordOnlyAutoV1Marker,
LineSegmenter, WordSegmenter,
};
use icu_provider::prelude::*;
use super::*;
use crate::{provider::DictionaryForWordOnlyAutoV1Marker, LineSegmenter, WordSegmenter};
use icu_provider_adapters::fork::ForkByKeyProvider;
use icu_provider_fs::FsDataProvider;
use std::path::PathBuf;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

#[derive(PartialEq, Debug, Copy, Clone)]
pub enum Language {
pub(super) enum Language {
Burmese,
ChineseOrJapanese,
Khmer,
Expand Down Expand Up @@ -43,12 +43,12 @@ fn get_language(codepoint: u32) -> Language {

/// This struct is an iterator that returns the string per language from the
/// given string.
pub struct LanguageIterator<'s> {
pub(super) struct LanguageIterator<'s> {
rest: &'s str,
}

impl<'s> LanguageIterator<'s> {
pub fn new(input: &'s str) -> Self {
pub(super) fn new(input: &'s str) -> Self {
Self { rest: input }
}
}
Expand All @@ -70,12 +70,12 @@ impl<'s> Iterator for LanguageIterator<'s> {
}
}

pub struct LanguageIteratorUtf16<'s> {
pub(super) struct LanguageIteratorUtf16<'s> {
rest: &'s [u16],
}

impl<'s> LanguageIteratorUtf16<'s> {
pub fn new(input: &'s [u16]) -> Self {
pub(super) fn new(input: &'s [u16]) -> Self {
Self { rest: input }
}
}
Expand Down
Loading