From 611992e472220722e108ac7b68fbc5df592517a1 Mon Sep 17 00:00:00 2001 From: kitsuyui Date: Tue, 3 Jan 2023 00:24:48 +0900 Subject: [PATCH 01/17] WIP: Implement main features From 3655d7c0e721e590c6a5b0830d45535885f01f9c Mon Sep 17 00:00:00 2001 From: kitsuyui Date: Tue, 3 Jan 2023 00:29:16 +0900 Subject: [PATCH 02/17] Add clippy.toml --- clippy.toml | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 clippy.toml diff --git a/clippy.toml b/clippy.toml new file mode 100644 index 0000000..0358cdb --- /dev/null +++ b/clippy.toml @@ -0,0 +1,2 @@ +allow-unwrap-in-tests = true +allow-expect-in-tests = true From 0d0d89a0f55d7e89231e977e9dbfc4fda48aef65 Mon Sep 17 00:00:00 2001 From: kitsuyui Date: Tue, 3 Jan 2023 00:29:32 +0900 Subject: [PATCH 03/17] Add sxd_xpath for extract tables --- Cargo.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/Cargo.toml b/Cargo.toml index 1ed6081..6c30b5b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,4 +11,5 @@ repository = "https://github.com/kitsuyui/sxd_html_table" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] +sxd-xpath = "0.4.2" sxd_html = "0.1.0" From 508f76a1c63a300768727c5a8be5facdf1f1195f Mon Sep 17 00:00:00 2001 From: kitsuyui Date: Tue, 3 Jan 2023 00:31:46 +0900 Subject: [PATCH 04/17] Implement find_table_from_document --- src/lib.rs | 96 +++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 91 insertions(+), 5 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 7d12d9a..554e005 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,5 +1,37 @@ -pub fn add(left: usize, right: usize) -> usize { - left + right +use sxd_xpath::{nodeset::Node, Context, Factory, Value}; + +#[derive(Debug)] +pub enum Error { + TableNotFound, +} + +pub fn find_table_from_document(html: &str) -> Result, Error> { + let package = sxd_html::parse_html(html); + let document = package.as_document(); + #[allow(clippy::expect_used)] + let val = evaluate_xpath_node(document.root(), "//table").expect("XPath evaluation failed"); + let Value::Nodeset(set) = val else { + panic!("Expected node set"); + }; + + let mut tables = vec![]; + for node in set.document_order() { + tables.push(node.string_value()); + } + Ok(tables) +} + +fn evaluate_xpath_node<'d>( + node: impl Into>, + expr: &str, +) -> Result, sxd_xpath::Error> { + let factory = Factory::new(); + let expression = factory.build(expr)?; + let expression = expression.ok_or(sxd_xpath::Error::NoXPath)?; + let context = Context::new(); + expression + .evaluate(&context, node.into()) + .map_err(Into::into) } #[cfg(test)] @@ -7,8 +39,62 @@ mod tests { use super::*; #[test] - fn it_works() { - let result = add(2, 2); - assert_eq!(result, 4); + fn test_find_table_from_document() { + // found 1 table + let html = r#" + + + + + + + +
+        <table>
+            <tr>
+                <td>1</td>
+                <td>2</td>
+            </tr>
+        </table>
+        "#;
+        let result = find_table_from_document(html).unwrap();
+        assert_eq!(result.len(), 1);
+
+        // found 2 tables
+        let html = r#"
+        <table>
+            <tr>
+                <td>1</td>
+                <td>2</td>
+            </tr>
+        </table>
+        <table>
+            <tr>
+                <td>1</td>
+                <td>2</td>
+            </tr>
+        </table>
+        "#;
+        let result = find_table_from_document(html).unwrap();
+        assert_eq!(result.len(), 2);
+
+        // found 0 table
+        let html = r#"
+        <div>
+            <div>1</div>
+            <div>2</div>
+        </div>
+ + + "#; + let result = find_table_from_document(html).unwrap(); + assert_eq!(result.len(), 0); + + // empty html + let html = r#""#; + let result = find_table_from_document(html).unwrap(); + assert_eq!(result.len(), 0); } } From 0e28c46a282420ec94306b84105fd155ae82a332 Mon Sep 17 00:00:00 2001 From: kitsuyui Date: Tue, 3 Jan 2023 18:23:23 +0900 Subject: [PATCH 05/17] Implement the basics of CSV conversion - First, implement a rough implementation - The proper CSV conversion process will be implemented later - I realized that I need to fix a bug in sxd_html --- src/lib.rs | 153 +++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 125 insertions(+), 28 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 554e005..bf74b06 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -3,24 +3,118 @@ use sxd_xpath::{nodeset::Node, Context, Factory, Value}; #[derive(Debug)] pub enum Error { TableNotFound, + InvalidDocument, } -pub fn find_table_from_document(html: &str) -> Result, Error> { +#[derive(Debug, Eq, PartialEq)] +pub struct TableCell { + header: bool, + text: Option, +} + +#[derive(Debug, Eq, PartialEq)] +pub struct Table { + rows: Vec>, +} + +impl TableCell { + pub fn new(header: bool) -> Self { + Self { header, text: None } + } +} + +impl Default for Table { + fn default() -> Self { + Self::new() + } +} + +impl Table { + pub fn new() -> Self { + Self { rows: vec![] } + } + + pub fn expand_size(&mut self, row_size: usize, col_size: usize) { + if self.rows.len() < row_size { + self.rows.push(vec![]); + } + for row in &mut self.rows { + if row.len() < col_size { + row.push(TableCell::new(false)); + } + } + } + + pub fn set_cell(&mut self, text: &str, header: bool, row_index: usize, col_index: usize) { + self.expand_size(row_index + 1, col_index + 1); + self.rows[row_index][col_index].text = Some(text.to_string()); + self.rows[row_index][col_index].header = header; + } + + pub fn rows(&self) -> &Vec> { + &self.rows + } + + pub fn to_csv(&self) -> String { + let mut csv = String::new(); + for row in &self.rows { + let mut first = true; + for cell in row { + if first { + first = false; + } else { + csv.push(','); + } + if let Some(text) = &cell.text { + csv.push_str(text); + } + } + csv.push('\n'); + } + csv + } +} + +pub fn extract_tables_from_document(html: &str) -> Result, Error> { let package = sxd_html::parse_html(html); let document = package.as_document(); #[allow(clippy::expect_used)] - let val = evaluate_xpath_node(document.root(), "//table").expect("XPath evaluation failed"); - let Value::Nodeset(set) = val else { + let val = evaluate_xpath_node(document.root(), "//table2").expect("XPath evaluation failed"); + + let Value::Nodeset(table_nodes) = val else { panic!("Expected node set"); }; - let mut tables = vec![]; - for node in set.document_order() { - tables.push(node.string_value()); + for node in table_nodes.document_order() { + match extract_table(&node) { + Ok(table) => tables.push(table), + Err(e) => return Err(e), + } } Ok(tables) } +fn extract_table(node: &Node) -> Result { + let mut table = Table::new(); + let tr_nodes = match evaluate_xpath_node(*node, ".//tr2") { + Ok(Value::Nodeset(tr_nodes)) => tr_nodes, + _ => return Err(Error::InvalidDocument), + }; + let tr_nodes = tr_nodes.document_order(); + for (i, tr) in tr_nodes.iter().enumerate() { + let td_nodes = match evaluate_xpath_node(*tr, ".//td2") { + Ok(Value::Nodeset(td_nodes)) => td_nodes, + _ => return Err(Error::InvalidDocument), + }; + let td_nodes = td_nodes.document_order(); + for (j, td) in 
td_nodes.iter().enumerate() { + let header = false; + table.set_cell(&td.string_value(), header, i, j); + } + } + Ok(table) +} + fn evaluate_xpath_node<'d>( node: impl Into>, expr: &str, @@ -44,39 +138,42 @@ mod tests { let html = r#" - - - - - -
-        <table>
-            <tr>
-                <td>1</td>
-                <td>2</td>
-            </tr>
-        </table>
+        <table2>
+            <tr2>
+                <td2>1</td2>
+                <td2>2</td2>
+            </tr2>
+        </table2>
         "#;
-        let result = find_table_from_document(html).unwrap();
+        let result = extract_tables_from_document(html).unwrap();
         assert_eq!(result.len(), 1);
+        assert_eq!(result[0].to_csv(), "1,2\n",);

         // found 2 tables
         let html = r#"
-        <table>
-            <tr>
-                <td>1</td>
-                <td>2</td>
-            </tr>
-        </table>
+        <table2>
+            <tr2>
+                <td2>1</td2>
+                <td2>2</td2>
+            </tr2>
+        </table2>
-        <table>
-            <tr>
-                <td>1</td>
-                <td>2</td>
-            </tr>
-        </table>
+ + + 1 + 2 + + + + + 3 + 4 + + "#; - let result = find_table_from_document(html).unwrap(); + let result = extract_tables_from_document(html).unwrap(); assert_eq!(result.len(), 2); + assert_eq!(result[0].to_csv(), "1,2\n",); + assert_eq!(result[1].to_csv(), "3,4\n",); // found 0 table let html = r#" @@ -89,12 +186,12 @@ mod tests { "#; - let result = find_table_from_document(html).unwrap(); + let result = extract_tables_from_document(html).unwrap(); assert_eq!(result.len(), 0); // empty html let html = r#""#; - let result = find_table_from_document(html).unwrap(); + let result = extract_tables_from_document(html).unwrap(); assert_eq!(result.len(), 0); } } From 822d6715c2a35a2441850ca06de2206511ac07db Mon Sep 17 00:00:00 2001 From: kitsuyui Date: Tue, 3 Jan 2023 23:53:31 +0900 Subject: [PATCH 06/17] Add tbody to XPath --- src/lib.rs | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index bf74b06..3631849 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -79,7 +79,7 @@ pub fn extract_tables_from_document(html: &str) -> Result, Error> { let package = sxd_html::parse_html(html); let document = package.as_document(); #[allow(clippy::expect_used)] - let val = evaluate_xpath_node(document.root(), "//table2").expect("XPath evaluation failed"); + let val = evaluate_xpath_node(document.root(), "//table").expect("XPath evaluation failed"); let Value::Nodeset(table_nodes) = val else { panic!("Expected node set"); @@ -96,13 +96,13 @@ pub fn extract_tables_from_document(html: &str) -> Result, Error> { fn extract_table(node: &Node) -> Result { let mut table = Table::new(); - let tr_nodes = match evaluate_xpath_node(*node, ".//tr2") { + let tr_nodes = match evaluate_xpath_node(*node, "./tbody/tr") { Ok(Value::Nodeset(tr_nodes)) => tr_nodes, _ => return Err(Error::InvalidDocument), }; let tr_nodes = tr_nodes.document_order(); for (i, tr) in tr_nodes.iter().enumerate() { - let td_nodes = match evaluate_xpath_node(*tr, ".//td2") { + let td_nodes = match evaluate_xpath_node(*tr, "./td") { Ok(Value::Nodeset(td_nodes)) => td_nodes, _ => return Err(Error::InvalidDocument), }; @@ -138,12 +138,12 @@ mod tests { let html = r#" - - - 1 - 2 - - + + + + + +
+        <table>
+            <tr>
+                <td>1</td>
+                <td>2</td>
+            </tr>
+        </table>
"#; @@ -155,18 +155,18 @@ mod tests { let html = r#" - - - 1 - 2 - - - - - 3 - 4 - - + + + + + +
+        <table>
+            <tr>
+                <td>1</td>
+                <td>2</td>
+            </tr>
+        </table>
+ + + + + +
+        <table>
+            <tr>
+                <td>3</td>
+                <td>4</td>
+            </tr>
+        </table>
"#; From b1e4855c3ac0ebc9c6bb6482a34a2ec71d7fbb45 Mon Sep 17 00:00:00 2001 From: kitsuyui Date: Wed, 4 Jan 2023 20:42:41 +0900 Subject: [PATCH 07/17] Update sxd_html --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 6c30b5b..4226921 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,4 +12,4 @@ repository = "https://github.com/kitsuyui/sxd_html_table" [dependencies] sxd-xpath = "0.4.2" -sxd_html = "0.1.0" +sxd_html = "0.1.1" From a48a8a58240a085e6a7321d5d4056e17e87a4e4e Mon Sep 17 00:00:00 2001 From: kitsuyui Date: Wed, 4 Jan 2023 21:01:41 +0900 Subject: [PATCH 08/17] Implement CSV conversion properly - Implement CSV conversion that has been a provisional implementation so far - Add the `csv` crate for this --- Cargo.toml | 1 + src/lib.rs | 34 ++++++++++++++++++++-------------- 2 files changed, 21 insertions(+), 14 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 4226921..bf59417 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,5 +11,6 @@ repository = "https://github.com/kitsuyui/sxd_html_table" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] +csv = "1.1.6" sxd-xpath = "0.4.2" sxd_html = "0.1.1" diff --git a/src/lib.rs b/src/lib.rs index 3631849..6232d28 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -4,6 +4,7 @@ use sxd_xpath::{nodeset::Node, Context, Factory, Value}; pub enum Error { TableNotFound, InvalidDocument, + FailedToConvertToCSV, } #[derive(Debug, Eq, PartialEq)] @@ -55,23 +56,28 @@ impl Table { &self.rows } - pub fn to_csv(&self) -> String { - let mut csv = String::new(); + pub fn write_csv(&self, writer: &mut impl std::io::Write) -> Result<(), Error> { + let mut writer = csv::Writer::from_writer(writer); for row in &self.rows { - let mut first = true; + let mut record = csv::StringRecord::new(); for cell in row { - if first { - first = false; - } else { - csv.push(','); - } if let Some(text) = &cell.text { - csv.push_str(text); + record.push_field(text); } } - csv.push('\n'); + writer + .write_record(&record) + .map_err(|_| Error::FailedToConvertToCSV)?; } - csv + writer.flush().map_err(|_| Error::FailedToConvertToCSV)?; + Ok(()) + } + + pub fn to_csv(&self) -> Result { + let mut buf = std::io::BufWriter::new(Vec::new()); + self.write_csv(&mut buf)?; + let bytes = buf.into_inner().map_err(|_| Error::FailedToConvertToCSV)?; + String::from_utf8(bytes).map_err(|_| Error::FailedToConvertToCSV) } } @@ -149,7 +155,7 @@ mod tests { "#; let result = extract_tables_from_document(html).unwrap(); assert_eq!(result.len(), 1); - assert_eq!(result[0].to_csv(), "1,2\n",); + assert_eq!(result[0].to_csv().unwrap(), "1,2\n",); // found 2 tables let html = r#" @@ -172,8 +178,8 @@ mod tests { "#; let result = extract_tables_from_document(html).unwrap(); assert_eq!(result.len(), 2); - assert_eq!(result[0].to_csv(), "1,2\n",); - assert_eq!(result[1].to_csv(), "3,4\n",); + assert_eq!(result[0].to_csv().unwrap(), "1,2\n",); + assert_eq!(result[1].to_csv().unwrap(), "3,4\n",); // found 0 table let html = r#" From 91b5cf3c54714e3b4a809e48ab8666bc8d942308 Mon Sep 17 00:00:00 2001 From: kitsuyui Date: Wed, 4 Jan 2023 21:06:57 +0900 Subject: [PATCH 09/17] Support for th --- src/lib.rs | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 6232d28..254e990 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -108,14 +108,14 @@ fn extract_table(node: &Node) -> Result { }; let tr_nodes = tr_nodes.document_order(); for 
(i, tr) in tr_nodes.iter().enumerate() { - let td_nodes = match evaluate_xpath_node(*tr, "./td") { + let cell_nodes = match evaluate_xpath_node(*tr, "./td|./th") { Ok(Value::Nodeset(td_nodes)) => td_nodes, _ => return Err(Error::InvalidDocument), }; - let td_nodes = td_nodes.document_order(); - for (j, td) in td_nodes.iter().enumerate() { + let cell_nodes = cell_nodes.document_order(); + for (j, cell_node) in cell_nodes.iter().enumerate() { let header = false; - table.set_cell(&td.string_value(), header, i, j); + table.set_cell(&cell_node.string_value(), header, i, j); } } Ok(table) @@ -155,7 +155,7 @@ mod tests { "#; let result = extract_tables_from_document(html).unwrap(); assert_eq!(result.len(), 1); - assert_eq!(result[0].to_csv().unwrap(), "1,2\n",); + assert_eq!(result[0].to_csv().unwrap(), "1,2\n"); // found 2 tables let html = r#" @@ -200,4 +200,27 @@ mod tests { let result = extract_tables_from_document(html).unwrap(); assert_eq!(result.len(), 0); } + + #[test] + fn test_td_and_th() { + let html = r#" + + + + + + + + + + + +
+        <table>
+            <tr>
+                <th>1</th>
+                <th>2</th>
+            </tr>
+            <tr>
+                <td>3</td>
+                <td>4</td>
+            </tr>
+        </table>
+ + + "#; + let result = extract_tables_from_document(html).unwrap(); + assert_eq!(result.len(), 1); + assert_eq!(result[0].to_csv().unwrap(), "1,2\n3,4\n"); + } } From d177a637a3b19b4f899758060e2a6036fa0c22af Mon Sep 17 00:00:00 2001 From: kitsuyui Date: Wed, 4 Jan 2023 22:02:44 +0900 Subject: [PATCH 10/17] Support for colspan and rowspan --- src/lib.rs | 178 ++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 134 insertions(+), 44 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 254e990..5fc5973 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,3 +1,5 @@ +use std::collections::HashMap; + use sxd_xpath::{nodeset::Node, Context, Factory, Value}; #[derive(Debug)] @@ -7,62 +9,47 @@ pub enum Error { FailedToConvertToCSV, } -#[derive(Debug, Eq, PartialEq)] -pub struct TableCell { - header: bool, - text: Option, -} - #[derive(Debug, Eq, PartialEq)] pub struct Table { - rows: Vec>, -} - -impl TableCell { - pub fn new(header: bool) -> Self { - Self { header, text: None } - } -} - -impl Default for Table { - fn default() -> Self { - Self::new() - } + size: (usize, usize), + cells: Vec>, + headers: Vec, } impl Table { - pub fn new() -> Self { - Self { rows: vec![] } - } - - pub fn expand_size(&mut self, row_size: usize, col_size: usize) { - if self.rows.len() < row_size { - self.rows.push(vec![]); - } - for row in &mut self.rows { - if row.len() < col_size { - row.push(TableCell::new(false)); - } + pub fn new(size: (usize, usize)) -> Self { + Self { + size, + cells: vec![None; size.0 * size.1], + headers: vec![false; size.0 * size.1], } } - pub fn set_cell(&mut self, text: &str, header: bool, row_index: usize, col_index: usize) { - self.expand_size(row_index + 1, col_index + 1); - self.rows[row_index][col_index].text = Some(text.to_string()); - self.rows[row_index][col_index].header = header; + pub fn is_header(&self, row: usize, col: usize) -> bool { + self.headers[row * self.size.1 + col] } - pub fn rows(&self) -> &Vec> { - &self.rows + pub fn rows(&self) -> Vec>> { + let mut rows = vec![]; + for i in 0..self.size.0 { + let mut row = vec![]; + for j in 0..self.size.1 { + row.push(self.cells[i * self.size.1 + j].as_deref()); + } + rows.push(row); + } + rows } pub fn write_csv(&self, writer: &mut impl std::io::Write) -> Result<(), Error> { let mut writer = csv::Writer::from_writer(writer); - for row in &self.rows { + for row in &self.rows() { let mut record = csv::StringRecord::new(); for cell in row { - if let Some(text) = &cell.text { + if let Some(text) = cell { record.push_field(text); + } else { + record.push_field(""); } } writer @@ -100,24 +87,63 @@ pub fn extract_tables_from_document(html: &str) -> Result, Error> { Ok(tables) } +fn extract_rowspan_and_colspan(node: &Node) -> (usize, usize) { + #[allow(clippy::expect_used)] + let element = node.element().expect("Expected element"); + let rowspan = element + .attribute_value("rowspan") + .unwrap_or("1") + .parse::() + .unwrap_or(1); + let colspan = element + .attribute_value("colspan") + .unwrap_or("1") + .parse::() + .unwrap_or(1); + (rowspan, colspan) +} + fn extract_table(node: &Node) -> Result { - let mut table = Table::new(); let tr_nodes = match evaluate_xpath_node(*node, "./tbody/tr") { Ok(Value::Nodeset(tr_nodes)) => tr_nodes, _ => return Err(Error::InvalidDocument), }; let tr_nodes = tr_nodes.document_order(); - for (i, tr) in tr_nodes.iter().enumerate() { + + let mut map: HashMap<(usize, usize), String> = HashMap::new(); + let mut header_map: HashMap<(usize, usize), bool> = HashMap::new(); + for (row_index, tr) 
in tr_nodes.iter().enumerate() { let cell_nodes = match evaluate_xpath_node(*tr, "./td|./th") { Ok(Value::Nodeset(td_nodes)) => td_nodes, _ => return Err(Error::InvalidDocument), }; let cell_nodes = cell_nodes.document_order(); - for (j, cell_node) in cell_nodes.iter().enumerate() { - let header = false; - table.set_cell(&cell_node.string_value(), header, i, j); + let mut col_index = 0; + for (_, cell_node) in cell_nodes.iter().enumerate() { + let (row_size, col_size) = extract_rowspan_and_colspan(cell_node); + let text = &cell_node.string_value(); + #[allow(clippy::expect_used)] + let is_header = cell_node.element().expect("Expected element").name() == "th".into(); + while map.contains_key(&(row_index, col_index)) { + col_index += 1; + } + for k in 0..row_size { + for l in 0..col_size { + map.insert((row_index + k, col_index + l), text.to_string()); + header_map.insert((row_index + k, col_index + l), is_header); + } + } } } + let rows = map.keys().map(|(i, _)| i).max().unwrap_or(&0) + 1; + let cols = map.keys().map(|(_, j)| j).max().unwrap_or(&0) + 1; + let mut table = Table::new((rows, cols)); + for ((i, j), text) in map { + table.cells[i * table.size.1 + j] = Some(text); + } + for ((i, j), is_header) in header_map { + table.headers[i * table.size.1 + j] = is_header; + } Ok(table) } @@ -223,4 +249,68 @@ mod tests { assert_eq!(result.len(), 1); assert_eq!(result[0].to_csv().unwrap(), "1,2\n3,4\n"); } + + #[test] + fn test_rowspan_and_colspan() { + let html = r#" + + + + + + + + + + +
+        <table>
+            <tr>
+                <td rowspan="2">A</td>
+                <td>B</td>
+            </tr>
+            <tr>
+                <td>C</td>
+            </tr>
+        </table>
+ + + "#; + let result = extract_tables_from_document(html).unwrap(); + assert_eq!(result.len(), 1); + assert_eq!(result[0].to_csv().unwrap(), "A,B\nA,C\n"); + + let html = r#" + + + + + + + + + + + + +
+        <table>
+            <tr>
+                <td colspan="2">A</td>
+                <td>B</td>
+            </tr>
+            <tr>
+                <td>C</td>
+                <td>D</td>
+                <td>E</td>
+            </tr>
+        </table>
+ + + "#; + let result = extract_tables_from_document(html).unwrap(); + assert_eq!(result.len(), 1); + assert_eq!(result[0].to_csv().unwrap(), "A,A,B\nC,D,E\n"); + + // more complex + let html = r#" + + + + + + + + + + +
+        <table>
+            <tr>
+                <td rowspan="2" colspan="2">A</td>
+                <td>B</td>
+            </tr>
+            <tr>
+                <td>C</td>
+            </tr>
+        </table>
+ + + "#; + + let result = extract_tables_from_document(html).unwrap(); + assert_eq!(result.len(), 1); + assert_eq!(result[0].to_csv().unwrap(), "A,A,B\nA,A,C\n"); + } } From 253a3597d523af90260885fb189da2ea10522584 Mon Sep 17 00:00:00 2001 From: kitsuyui Date: Wed, 4 Jan 2023 22:16:06 +0900 Subject: [PATCH 11/17] Support for more complex cases --- src/lib.rs | 37 +++++++++++++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 4 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 5fc5973..ea676bb 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -301,16 +301,45 @@ mod tests { A B - - C - + C + + + + +
+        <table>
+            <tr>
+                <td>a</td>
+                <td>b</td>
+                <td>c</td>
+            </tr>
+            <tr>
+                <td>d</td>
+                <td>e</td>
+                <td>f</td>
+            </tr>
+        </table>
+        <table>
+            <tr>
+                <td>a</td>
+                <td>b</td>
+                <td>c</td>
+                <td rowspan="2">d</td>
+            </tr>
+            <tr>
+                <td>e</td>
+                <td colspan="2">f</td>
+            </tr>
+            <tr>
+                <td>i</td>
+                <td>j</td>
+                <td>k</td>
+                <td>l</td>
+            </tr>
+        </table>
+        <table>
+            <tr>
+                <td>a</td>
+                <td>b</td>
+                <td>c</td>
+                <td>d</td>
+            </tr>
+            <tr>
+                <td>e</td>
+                <td colspan="3">f</td>
+            </tr>
+            <tr>
+                <td>i</td>
+                <td>j</td>
+                <td>k</td>
+                <td>l</td>
+            </tr>
+        </table>
+        <table>
+            <tr>
+                <td>a</td>
+                <td>b</td>
+                <td>c</td>
+                <td>d</td>
+            </tr>
+            <tr>
+                <td>e</td>
+                <td colspan="2" rowspan="2">f</td>
+                <td>g</td>
+            </tr>
+            <tr>
+                <td>h</td>
+                <td>i</td>
+            </tr>
+        </table>
+        <table>
+            <tr>
+                <td>a</td>
+                <td>b</td>
+                <td>c</td>
+                <td>d</td>
+            </tr>
+            <tr>
+                <td>e</td>
+                <td>f</td>
+                <td>g</td>
+            </tr>
+            <tr>
+                <td>h</td>
+                <td>i</td>
+            </tr>
+        </table>
"#; let result = extract_tables_from_document(html).unwrap(); - assert_eq!(result.len(), 1); + assert_eq!(result.len(), 6); assert_eq!(result[0].to_csv().unwrap(), "A,A,B\nA,A,C\n"); + assert_eq!(result[1].to_csv().unwrap(), "a,b,c\nd,e,f\n"); + assert_eq!(result[2].to_csv().unwrap(), "a,b,c,d\ne,f,f,d\ni,j,k,l\n"); + assert_eq!(result[3].to_csv().unwrap(), "a,b,c,d\ne,f,f,f\ni,j,k,l\n"); + assert_eq!(result[4].to_csv().unwrap(), "a,b,c,d\ne,f,f,g\nh,f,f,i\n"); + assert_eq!(result[5].to_csv().unwrap(), "a,b,c,d\ne,f,g,\nh,i,,\n"); } } From 3947dc1ea67f597ef62afe006d18ea5079e83bed Mon Sep 17 00:00:00 2001 From: kitsuyui Date: Wed, 4 Jan 2023 22:54:39 +0900 Subject: [PATCH 12/17] refactor --- Cargo.toml | 1 + src/element_utils.rs | 13 ++++ src/lib.rs | 155 ++++++++++++++++++------------------------- src/table.rs | 68 +++++++++++++++++++ 4 files changed, 148 insertions(+), 89 deletions(-) create mode 100644 src/element_utils.rs create mode 100644 src/table.rs diff --git a/Cargo.toml b/Cargo.toml index bf59417..bb87e10 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,5 +12,6 @@ repository = "https://github.com/kitsuyui/sxd_html_table" [dependencies] csv = "1.1.6" +sxd-document = "0.3.2" sxd-xpath = "0.4.2" sxd_html = "0.1.1" diff --git a/src/element_utils.rs b/src/element_utils.rs new file mode 100644 index 0000000..200f3bd --- /dev/null +++ b/src/element_utils.rs @@ -0,0 +1,13 @@ +pub fn extract_rowspan_and_colspan(element: sxd_document::dom::Element) -> (usize, usize) { + let rowspan = extract_span(element, "rowspan"); + let colspan = extract_span(element, "colspan"); + (rowspan, colspan) +} + +fn extract_span(element: sxd_document::dom::Element, name: &str) -> usize { + element + .attribute_value(name) + .unwrap_or("1") + .parse::() + .unwrap_or(1) +} diff --git a/src/lib.rs b/src/lib.rs index ea676bb..59c7929 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,6 +1,9 @@ use std::collections::HashMap; use sxd_xpath::{nodeset::Node, Context, Factory, Value}; +pub mod element_utils; +pub mod table; +use crate::table::Table; #[derive(Debug)] pub enum Error { @@ -9,66 +12,26 @@ pub enum Error { FailedToConvertToCSV, } -#[derive(Debug, Eq, PartialEq)] -pub struct Table { - size: (usize, usize), - cells: Vec>, - headers: Vec, -} - -impl Table { - pub fn new(size: (usize, usize)) -> Self { - Self { - size, - cells: vec![None; size.0 * size.1], - headers: vec![false; size.0 * size.1], - } - } - - pub fn is_header(&self, row: usize, col: usize) -> bool { - self.headers[row * self.size.1 + col] - } - - pub fn rows(&self) -> Vec>> { - let mut rows = vec![]; - for i in 0..self.size.0 { - let mut row = vec![]; - for j in 0..self.size.1 { - row.push(self.cells[i * self.size.1 + j].as_deref()); - } - rows.push(row); - } - rows - } +pub fn extract_table_texts_from_document(html: &str) -> Result, Error> { + let package = sxd_html::parse_html(html); + let document = package.as_document(); + #[allow(clippy::expect_used)] + let val = evaluate_xpath_node(document.root(), "//table").expect("XPath evaluation failed"); - pub fn write_csv(&self, writer: &mut impl std::io::Write) -> Result<(), Error> { - let mut writer = csv::Writer::from_writer(writer); - for row in &self.rows() { - let mut record = csv::StringRecord::new(); - for cell in row { - if let Some(text) = cell { - record.push_field(text); - } else { - record.push_field(""); - } - } - writer - .write_record(&record) - .map_err(|_| Error::FailedToConvertToCSV)?; + let Value::Nodeset(table_nodes) = val else { + panic!("Expected node set"); + }; + let mut tables = 
vec![]; + for node in table_nodes.document_order() { + match extract_table_texts(&node) { + Ok(table) => tables.push(table), + Err(e) => return Err(e), } - writer.flush().map_err(|_| Error::FailedToConvertToCSV)?; - Ok(()) - } - - pub fn to_csv(&self) -> Result { - let mut buf = std::io::BufWriter::new(Vec::new()); - self.write_csv(&mut buf)?; - let bytes = buf.into_inner().map_err(|_| Error::FailedToConvertToCSV)?; - String::from_utf8(bytes).map_err(|_| Error::FailedToConvertToCSV) } + Ok(tables) } -pub fn extract_tables_from_document(html: &str) -> Result, Error> { +pub fn extract_table_elements_from_document(html: &str) -> Result, Error> { let package = sxd_html::parse_html(html); let document = package.as_document(); #[allow(clippy::expect_used)] @@ -79,7 +42,7 @@ pub fn extract_tables_from_document(html: &str) -> Result, Error> { }; let mut tables = vec![]; for node in table_nodes.document_order() { - match extract_table(&node) { + match extract_table_elements(&node) { Ok(table) => tables.push(table), Err(e) => return Err(e), } @@ -87,29 +50,12 @@ pub fn extract_tables_from_document(html: &str) -> Result, Error> { Ok(tables) } -fn extract_rowspan_and_colspan(node: &Node) -> (usize, usize) { - #[allow(clippy::expect_used)] - let element = node.element().expect("Expected element"); - let rowspan = element - .attribute_value("rowspan") - .unwrap_or("1") - .parse::() - .unwrap_or(1); - let colspan = element - .attribute_value("colspan") - .unwrap_or("1") - .parse::() - .unwrap_or(1); - (rowspan, colspan) -} - -fn extract_table(node: &Node) -> Result { +pub fn map_table_cell(node: &Node, f: fn(&Node) -> String) -> Result { let tr_nodes = match evaluate_xpath_node(*node, "./tbody/tr") { Ok(Value::Nodeset(tr_nodes)) => tr_nodes, _ => return Err(Error::InvalidDocument), }; let tr_nodes = tr_nodes.document_order(); - let mut map: HashMap<(usize, usize), String> = HashMap::new(); let mut header_map: HashMap<(usize, usize), bool> = HashMap::new(); for (row_index, tr) in tr_nodes.iter().enumerate() { @@ -120,10 +66,12 @@ fn extract_table(node: &Node) -> Result { let cell_nodes = cell_nodes.document_order(); let mut col_index = 0; for (_, cell_node) in cell_nodes.iter().enumerate() { - let (row_size, col_size) = extract_rowspan_and_colspan(cell_node); - let text = &cell_node.string_value(); #[allow(clippy::expect_used)] - let is_header = cell_node.element().expect("Expected element").name() == "th".into(); + let element = cell_node.element().expect("Expected element"); + let (row_size, col_size) = element_utils::extract_rowspan_and_colspan(element); + let text = f(cell_node); + #[allow(clippy::expect_used)] + let is_header = element.name() == "th".into(); while map.contains_key(&(row_index, col_index)) { col_index += 1; } @@ -135,16 +83,45 @@ fn extract_table(node: &Node) -> Result { } } } + let mut table = map_to_table(&map); + for ((i, j), is_header) in header_map { + if is_header { + table.set_header(i, j); + } + } + Ok(table) +} + +fn map_to_table(map: &HashMap<(usize, usize), String>) -> Table { let rows = map.keys().map(|(i, _)| i).max().unwrap_or(&0) + 1; let cols = map.keys().map(|(_, j)| j).max().unwrap_or(&0) + 1; let mut table = Table::new((rows, cols)); for ((i, j), text) in map { - table.cells[i * table.size.1 + j] = Some(text); + table.set(*i, *j, text.to_string()); } - for ((i, j), is_header) in header_map { - table.headers[i * table.size.1 + j] = is_header; + table +} + +fn extract_table_texts(node: &Node) -> Result { + map_table_cell(node, |node| node.string_value()) +} + +fn 
extract_table_elements(node: &Node) -> Result { + map_table_cell(node, element_to_html) +} + +fn element_to_html(node: &Node) -> String { + let mut buf = Vec::new(); + let package = sxd_document::Package::new(); + let doc = package.as_document(); + let root = doc.root(); + if let Some(element) = node.element() { + root.append_child(element); } - Ok(table) + #[allow(clippy::expect_used)] + sxd_document::writer::format_document(&doc, &mut buf).expect("Failed to format document"); + #[allow(clippy::expect_used)] + String::from_utf8(buf).expect("Failed to convert to UTF-8") } fn evaluate_xpath_node<'d>( @@ -179,7 +156,7 @@ mod tests { "#; - let result = extract_tables_from_document(html).unwrap(); + let result = extract_table_texts_from_document(html).unwrap(); assert_eq!(result.len(), 1); assert_eq!(result[0].to_csv().unwrap(), "1,2\n"); @@ -202,7 +179,7 @@ mod tests { "#; - let result = extract_tables_from_document(html).unwrap(); + let result = extract_table_texts_from_document(html).unwrap(); assert_eq!(result.len(), 2); assert_eq!(result[0].to_csv().unwrap(), "1,2\n",); assert_eq!(result[1].to_csv().unwrap(), "3,4\n",); @@ -218,12 +195,12 @@ mod tests { "#; - let result = extract_tables_from_document(html).unwrap(); + let result = extract_table_texts_from_document(html).unwrap(); assert_eq!(result.len(), 0); // empty html let html = r#""#; - let result = extract_tables_from_document(html).unwrap(); + let result = extract_table_texts_from_document(html).unwrap(); assert_eq!(result.len(), 0); } @@ -245,7 +222,7 @@ mod tests { "#; - let result = extract_tables_from_document(html).unwrap(); + let result = extract_table_texts_from_document(html).unwrap(); assert_eq!(result.len(), 1); assert_eq!(result[0].to_csv().unwrap(), "1,2\n3,4\n"); } @@ -267,7 +244,7 @@ mod tests { "#; - let result = extract_tables_from_document(html).unwrap(); + let result = extract_table_texts_from_document(html).unwrap(); assert_eq!(result.len(), 1); assert_eq!(result[0].to_csv().unwrap(), "A,B\nA,C\n"); @@ -288,7 +265,7 @@ mod tests { "#; - let result = extract_tables_from_document(html).unwrap(); + let result = extract_table_texts_from_document(html).unwrap(); assert_eq!(result.len(), 1); assert_eq!(result[0].to_csv().unwrap(), "A,A,B\nC,D,E\n"); @@ -333,7 +310,7 @@ mod tests { "#; - let result = extract_tables_from_document(html).unwrap(); + let result = extract_table_texts_from_document(html).unwrap(); assert_eq!(result.len(), 6); assert_eq!(result[0].to_csv().unwrap(), "A,A,B\nA,A,C\n"); assert_eq!(result[1].to_csv().unwrap(), "a,b,c\nd,e,f\n"); diff --git a/src/table.rs b/src/table.rs new file mode 100644 index 0000000..b03e5f7 --- /dev/null +++ b/src/table.rs @@ -0,0 +1,68 @@ +use crate::Error; + +#[derive(Debug, Eq, PartialEq)] +pub struct Table { + size: (usize, usize), + cells: Vec>, + headers: Vec, +} + +impl Table { + pub fn new(size: (usize, usize)) -> Self { + Self { + size, + cells: vec![None; size.0 * size.1], + headers: vec![false; size.0 * size.1], + } + } + + pub fn is_header(&self, row: usize, col: usize) -> bool { + self.headers[row * self.size.1 + col] + } + + pub fn set(&mut self, row: usize, col: usize, text: String) { + self.cells[row * self.size.1 + col] = Some(text); + } + + pub fn set_header(&mut self, row: usize, col: usize) { + self.headers[row * self.size.1 + col] = true; + } + + pub fn rows(&self) -> Vec>> { + let mut rows = vec![]; + for i in 0..self.size.0 { + let mut row = vec![]; + for j in 0..self.size.1 { + row.push(self.cells[i * self.size.1 + j].as_deref()); + } + 
rows.push(row); + } + rows + } + + pub fn write_csv(&self, writer: &mut impl std::io::Write) -> Result<(), Error> { + let mut writer = csv::Writer::from_writer(writer); + for row in &self.rows() { + let mut record = csv::StringRecord::new(); + for cell in row { + if let Some(text) = cell { + record.push_field(text); + } else { + record.push_field(""); + } + } + writer + .write_record(&record) + .map_err(|_| Error::FailedToConvertToCSV)?; + } + writer.flush().map_err(|_| Error::FailedToConvertToCSV)?; + Ok(()) + } + + pub fn to_csv(&self) -> Result { + let mut buf = std::io::BufWriter::new(Vec::new()); + self.write_csv(&mut buf)?; + let bytes = buf.into_inner().map_err(|_| Error::FailedToConvertToCSV)?; + String::from_utf8(bytes).map_err(|_| Error::FailedToConvertToCSV) + } +} From b7c283e43e2710f7cccf4dc18ae9c0ffb867696d Mon Sep 17 00:00:00 2001 From: kitsuyui Date: Thu, 5 Jan 2023 21:05:08 +0900 Subject: [PATCH 13/17] Make map_table_cell generic and take a closure as an argument --- src/lib.rs | 51 ++++++++++++++++++++++++++++----------------------- 1 file changed, 28 insertions(+), 23 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 59c7929..6e59a57 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,4 +1,4 @@ -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use sxd_xpath::{nodeset::Node, Context, Factory, Value}; pub mod element_utils; @@ -50,14 +50,16 @@ pub fn extract_table_elements_from_document(html: &str) -> Result, Er Ok(tables) } -pub fn map_table_cell(node: &Node, f: fn(&Node) -> String) -> Result { +pub fn map_table_cell(node: &Node, mut f: F) -> Result<(), Error> +where + F: FnMut(&Node, usize, usize) -> T, +{ let tr_nodes = match evaluate_xpath_node(*node, "./tbody/tr") { Ok(Value::Nodeset(tr_nodes)) => tr_nodes, _ => return Err(Error::InvalidDocument), }; let tr_nodes = tr_nodes.document_order(); - let mut map: HashMap<(usize, usize), String> = HashMap::new(); - let mut header_map: HashMap<(usize, usize), bool> = HashMap::new(); + let mut set: HashSet<(usize, usize)> = HashSet::new(); for (row_index, tr) in tr_nodes.iter().enumerate() { let cell_nodes = match evaluate_xpath_node(*tr, "./td|./th") { Ok(Value::Nodeset(td_nodes)) => td_nodes, @@ -69,45 +71,48 @@ pub fn map_table_cell(node: &Node, f: fn(&Node) -> String) -> Result) -> Table { +fn map_table_cell_obsoleted(node: &Node, f: fn(&Node) -> String) -> Result { + let mut map: HashMap<(usize, usize), (String, bool)> = HashMap::new(); + map_table_cell(node, |cell_node: &Node, i: usize, j: usize| { + #[allow(clippy::expect_used)] + let element = cell_node.element().expect("Expected element"); + let is_header = element.name() == "th".into(); + map.insert((i, j), (f(cell_node), is_header)); + })?; let rows = map.keys().map(|(i, _)| i).max().unwrap_or(&0) + 1; let cols = map.keys().map(|(_, j)| j).max().unwrap_or(&0) + 1; let mut table = Table::new((rows, cols)); - for ((i, j), text) in map { - table.set(*i, *j, text.to_string()); + for ((i, j), (text, is_header)) in map { + table.set(i, j, text); + if is_header { + table.set_header(i, j); + } } - table + Ok(table) } fn extract_table_texts(node: &Node) -> Result { - map_table_cell(node, |node| node.string_value()) + map_table_cell_obsoleted(node, |node| node.string_value()) } fn extract_table_elements(node: &Node) -> Result { - map_table_cell(node, element_to_html) + map_table_cell_obsoleted(node, element_to_html) } fn element_to_html(node: &Node) -> String { From 539a972018ac69de17dbc614ef54c5226be4e5d8 Mon Sep 17 00:00:00 2001 From: kitsuyui Date: 
Thu, 5 Jan 2023 22:56:18 +0900 Subject: [PATCH 14/17] Make it even more generic --- src/lib.rs | 98 ++++++++--------------------------------------- src/node_utils.rs | 65 +++++++++++++++++++++++++++++++ src/table.rs | 76 +++++++++++++++++++++++++----------- 3 files changed, 136 insertions(+), 103 deletions(-) create mode 100644 src/node_utils.rs diff --git a/src/lib.rs b/src/lib.rs index 6e59a57..f74c038 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,7 +1,9 @@ -use std::collections::{HashMap, HashSet}; +use std::collections::HashMap; +use node_utils::map_table_cell; use sxd_xpath::{nodeset::Node, Context, Factory, Value}; pub mod element_utils; +pub mod node_utils; pub mod table; use crate::table::Table; @@ -12,7 +14,7 @@ pub enum Error { FailedToConvertToCSV, } -pub fn extract_table_texts_from_document(html: &str) -> Result, Error> { +pub fn extract_table_texts_from_document(html: &str) -> Result>, Error> { let package = sxd_html::parse_html(html); let document = package.as_document(); #[allow(clippy::expect_used)] @@ -23,7 +25,7 @@ pub fn extract_table_texts_from_document(html: &str) -> Result, Error }; let mut tables = vec![]; for node in table_nodes.document_order() { - match extract_table_texts(&node) { + match extract_table_texts(node) { Ok(table) => tables.push(table), Err(e) => return Err(e), } @@ -31,75 +33,27 @@ pub fn extract_table_texts_from_document(html: &str) -> Result, Error Ok(tables) } -pub fn extract_table_elements_from_document(html: &str) -> Result, Error> { - let package = sxd_html::parse_html(html); - let document = package.as_document(); - #[allow(clippy::expect_used)] - let val = evaluate_xpath_node(document.root(), "//table").expect("XPath evaluation failed"); - - let Value::Nodeset(table_nodes) = val else { - panic!("Expected node set"); - }; - let mut tables = vec![]; - for node in table_nodes.document_order() { - match extract_table_elements(&node) { - Ok(table) => tables.push(table), - Err(e) => return Err(e), - } - } - Ok(tables) -} - -pub fn map_table_cell(node: &Node, mut f: F) -> Result<(), Error> +pub fn map_table_cell_obsoleted(node: Node, mut f: F) -> Result, Error> where - F: FnMut(&Node, usize, usize) -> T, + T: Clone + std::fmt::Debug, + F: FnMut(Node) -> T, { - let tr_nodes = match evaluate_xpath_node(*node, "./tbody/tr") { - Ok(Value::Nodeset(tr_nodes)) => tr_nodes, - _ => return Err(Error::InvalidDocument), - }; - let tr_nodes = tr_nodes.document_order(); - let mut set: HashSet<(usize, usize)> = HashSet::new(); - for (row_index, tr) in tr_nodes.iter().enumerate() { - let cell_nodes = match evaluate_xpath_node(*tr, "./td|./th") { - Ok(Value::Nodeset(td_nodes)) => td_nodes, - _ => return Err(Error::InvalidDocument), - }; - let cell_nodes = cell_nodes.document_order(); - let mut col_index = 0; - for (_, cell_node) in cell_nodes.iter().enumerate() { - #[allow(clippy::expect_used)] - let element = cell_node.element().expect("Expected element"); - let (row_size, col_size) = element_utils::extract_rowspan_and_colspan(element); - while set.contains(&(row_index, col_index)) { - col_index += 1; - } - for k in 0..row_size { - for l in 0..col_size { - let row = row_index + k; - let col = col_index + l; - set.insert((row, col)); - f(cell_node, row, col); - } - } - } - } - Ok(()) -} - -fn map_table_cell_obsoleted(node: &Node, f: fn(&Node) -> String) -> Result { - let mut map: HashMap<(usize, usize), (String, bool)> = HashMap::new(); + let mut map: HashMap<(usize, usize), T> = HashMap::new(); + let mut header_map: HashMap<(usize, usize), bool> = HashMap::new(); 
map_table_cell(node, |cell_node: &Node, i: usize, j: usize| { #[allow(clippy::expect_used)] let element = cell_node.element().expect("Expected element"); let is_header = element.name() == "th".into(); - map.insert((i, j), (f(cell_node), is_header)); + map.insert((i, j), f(*cell_node)); + header_map.insert((i, j), is_header); })?; let rows = map.keys().map(|(i, _)| i).max().unwrap_or(&0) + 1; let cols = map.keys().map(|(_, j)| j).max().unwrap_or(&0) + 1; let mut table = Table::new((rows, cols)); - for ((i, j), (text, is_header)) in map { - table.set(i, j, text); + for ((i, j), item) in map { + table.set(i, j, item); + } + for ((i, j), is_header) in header_map { if is_header { table.set_header(i, j); } @@ -107,28 +61,10 @@ fn map_table_cell_obsoleted(node: &Node, f: fn(&Node) -> String) -> Result Result { +fn extract_table_texts(node: Node) -> Result, Error> { map_table_cell_obsoleted(node, |node| node.string_value()) } -fn extract_table_elements(node: &Node) -> Result { - map_table_cell_obsoleted(node, element_to_html) -} - -fn element_to_html(node: &Node) -> String { - let mut buf = Vec::new(); - let package = sxd_document::Package::new(); - let doc = package.as_document(); - let root = doc.root(); - if let Some(element) = node.element() { - root.append_child(element); - } - #[allow(clippy::expect_used)] - sxd_document::writer::format_document(&doc, &mut buf).expect("Failed to format document"); - #[allow(clippy::expect_used)] - String::from_utf8(buf).expect("Failed to convert to UTF-8") -} - fn evaluate_xpath_node<'d>( node: impl Into>, expr: &str, diff --git a/src/node_utils.rs b/src/node_utils.rs new file mode 100644 index 0000000..860bf2d --- /dev/null +++ b/src/node_utils.rs @@ -0,0 +1,65 @@ +use std::collections::HashSet; + +use sxd_xpath::{nodeset::Node, Context, Factory, Value}; + +use crate::{element_utils, Error}; + +struct TableSupport<'a>(&'a Node<'a>); + +impl<'a> TableSupport<'a> { + fn tr_nodes(&self) -> Result>, Error> { + let tr_nodes = match evaluate_xpath_node(*self.0, "./tbody/tr") { + Ok(Value::Nodeset(tr_nodes)) => tr_nodes, + _ => return Err(Error::InvalidDocument), + }; + Ok(tr_nodes.document_order()) + } + fn td_nodes(&self, tr: &Node<'a>) -> Result>, Error> { + let td_nodes = match evaluate_xpath_node(*tr, "./td|./th") { + Ok(Value::Nodeset(td_nodes)) => td_nodes, + _ => return Err(Error::InvalidDocument), + }; + Ok(td_nodes.document_order()) + } +} + +pub fn map_table_cell(node: Node, mut f: F) -> Result<(), Error> +where + F: FnMut(&Node, usize, usize) -> T, +{ + let t = TableSupport(&node); + let mut set: HashSet<(usize, usize)> = HashSet::new(); + for (row_index, tr_node) in t.tr_nodes()?.iter().enumerate() { + for td_node in t.td_nodes(tr_node)? 
{ + let mut col_index = 0; + #[allow(clippy::expect_used)] + let element = td_node.element().expect("Expected element"); + let (row_size, col_size) = element_utils::extract_rowspan_and_colspan(element); + while set.contains(&(row_index, col_index)) { + col_index += 1; + } + for k in 0..row_size { + for l in 0..col_size { + let row = row_index + k; + let col = col_index + l; + set.insert((row, col)); + f(&td_node, row, col); + } + } + } + } + Ok(()) +} + +fn evaluate_xpath_node<'d>( + node: impl Into>, + expr: &str, +) -> Result, sxd_xpath::Error> { + let factory = Factory::new(); + let expression = factory.build(expr)?; + let expression = expression.ok_or(sxd_xpath::Error::NoXPath)?; + let context = Context::new(); + expression + .evaluate(&context, node.into()) + .map_err(Into::into) +} diff --git a/src/table.rs b/src/table.rs index b03e5f7..ac703c4 100644 --- a/src/table.rs +++ b/src/table.rs @@ -1,52 +1,84 @@ use crate::Error; -#[derive(Debug, Eq, PartialEq)] -pub struct Table { +pub struct Table { size: (usize, usize), - cells: Vec>, + cells: Vec>, headers: Vec, } -impl Table { - pub fn new(size: (usize, usize)) -> Self { - Self { - size, - cells: vec![None; size.0 * size.1], - headers: vec![false; size.0 * size.1], - } - } - - pub fn is_header(&self, row: usize, col: usize) -> bool { - self.headers[row * self.size.1 + col] - } - - pub fn set(&mut self, row: usize, col: usize, text: String) { - self.cells[row * self.size.1 + col] = Some(text); +impl Table { + pub fn set(&mut self, row: usize, col: usize, item: T) { + self.cells[row * self.size.1 + col] = Some(item); } pub fn set_header(&mut self, row: usize, col: usize) { self.headers[row * self.size.1 + col] = true; } - pub fn rows(&self) -> Vec>> { + pub fn is_header(&self, row: usize, col: usize) -> bool { + self.headers[row * self.size.1 + col] + } + + pub fn rows(&self) -> Vec>> { let mut rows = vec![]; for i in 0..self.size.0 { let mut row = vec![]; for j in 0..self.size.1 { - row.push(self.cells[i * self.size.1 + j].as_deref()); + row.push(self.cells[i * self.size.1 + j].as_ref()); } rows.push(row); } rows } +} + +impl Table +where + T: Clone, +{ + pub fn new(size: (usize, usize)) -> Self { + Self { + size, + cells: vec![None; size.0 * size.1], + headers: vec![false; size.0 * size.1], + } + } + + pub fn map(&self, f: impl Fn(&T) -> T2) -> Table + where + T2: Clone, + { + map_table(self, f) + } +} + +fn map_table(table: &Table, f: F) -> Table +where + F: Fn(&T) -> S, + S: Clone, +{ + let mut new_table = Table::new(table.size); + for i in 0..table.size.0 { + for j in 0..table.size.1 { + if let Some(item) = &table.cells[i * table.size.1 + j] { + new_table.set(i, j, f(item)); + } + } + } + new_table +} +impl Table +where + T: std::fmt::Display, +{ pub fn write_csv(&self, writer: &mut impl std::io::Write) -> Result<(), Error> { let mut writer = csv::Writer::from_writer(writer); for row in &self.rows() { let mut record = csv::StringRecord::new(); for cell in row { - if let Some(text) = cell { - record.push_field(text); + if let Some(item) = cell { + record.push_field(&item.to_string()); } else { record.push_field(""); } From ab6a86668c04b82220eb4ea7bcd643b2c0a69959 Mon Sep 17 00:00:00 2001 From: kitsuyui Date: Fri, 6 Jan 2023 22:29:25 +0900 Subject: [PATCH 15/17] Do not use panic, expect for possible errors --- src/lib.rs | 13 ++++++++----- src/node_utils.rs | 5 +++-- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index f74c038..891d459 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -12,16 +12,17 
@@ pub enum Error { TableNotFound, InvalidDocument, FailedToConvertToCSV, + XPathEvaluationError(sxd_xpath::Error), } pub fn extract_table_texts_from_document(html: &str) -> Result>, Error> { let package = sxd_html::parse_html(html); let document = package.as_document(); - #[allow(clippy::expect_used)] - let val = evaluate_xpath_node(document.root(), "//table").expect("XPath evaluation failed"); + let val = + evaluate_xpath_node(document.root(), "//table").map_err(Error::XPathEvaluationError)?; let Value::Nodeset(table_nodes) = val else { - panic!("Expected node set"); + return Err(Error::TableNotFound); }; let mut tables = vec![]; for node in table_nodes.document_order() { @@ -41,11 +42,13 @@ where let mut map: HashMap<(usize, usize), T> = HashMap::new(); let mut header_map: HashMap<(usize, usize), bool> = HashMap::new(); map_table_cell(node, |cell_node: &Node, i: usize, j: usize| { - #[allow(clippy::expect_used)] - let element = cell_node.element().expect("Expected element"); + let Some(element) = cell_node.element() else { + return Err(Error::InvalidDocument); + }; let is_header = element.name() == "th".into(); map.insert((i, j), f(*cell_node)); header_map.insert((i, j), is_header); + Ok(()) })?; let rows = map.keys().map(|(i, _)| i).max().unwrap_or(&0) + 1; let cols = map.keys().map(|(_, j)| j).max().unwrap_or(&0) + 1; diff --git a/src/node_utils.rs b/src/node_utils.rs index 860bf2d..88ff1bc 100644 --- a/src/node_utils.rs +++ b/src/node_utils.rs @@ -32,8 +32,9 @@ where for (row_index, tr_node) in t.tr_nodes()?.iter().enumerate() { for td_node in t.td_nodes(tr_node)? { let mut col_index = 0; - #[allow(clippy::expect_used)] - let element = td_node.element().expect("Expected element"); + let Some(element) = td_node.element() else { + return Err(Error::InvalidDocument); + }; let (row_size, col_size) = element_utils::extract_rowspan_and_colspan(element); while set.contains(&(row_index, col_index)) { col_index += 1; From 1f6a0bd2310a625789f23fc354394caabaea0159 Mon Sep 17 00:00:00 2001 From: kitsuyui Date: Fri, 6 Jan 2023 23:03:58 +0900 Subject: [PATCH 16/17] Consolidate conversion processing that was distributed in each function to node_to_table --- src/lib.rs | 147 ++++++++++++++++++++++++---------------------- src/node_utils.rs | 84 ++++++++++++++++---------- src/table.rs | 34 +++++++---- 3 files changed, 149 insertions(+), 116 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 891d459..4c7e9b5 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,11 +1,8 @@ -use std::collections::HashMap; - -use node_utils::map_table_cell; -use sxd_xpath::{nodeset::Node, Context, Factory, Value}; pub mod element_utils; pub mod node_utils; pub mod table; -use crate::table::Table; +pub use crate::node_utils::extract_table_nodes_to_table; +pub use crate::table::Table; #[derive(Debug)] pub enum Error { @@ -15,76 +12,21 @@ pub enum Error { XPathEvaluationError(sxd_xpath::Error), } -pub fn extract_table_texts_from_document(html: &str) -> Result>, Error> { - let package = sxd_html::parse_html(html); - let document = package.as_document(); - let val = - evaluate_xpath_node(document.root(), "//table").map_err(Error::XPathEvaluationError)?; - - let Value::Nodeset(table_nodes) = val else { - return Err(Error::TableNotFound); - }; - let mut tables = vec![]; - for node in table_nodes.document_order() { - match extract_table_texts(node) { - Ok(table) => tables.push(table), - Err(e) => return Err(e), - } - } - Ok(tables) -} - -pub fn map_table_cell_obsoleted(node: Node, mut f: F) -> Result, Error> -where - T: Clone + 
std::fmt::Debug, - F: FnMut(Node) -> T, -{ - let mut map: HashMap<(usize, usize), T> = HashMap::new(); - let mut header_map: HashMap<(usize, usize), bool> = HashMap::new(); - map_table_cell(node, |cell_node: &Node, i: usize, j: usize| { - let Some(element) = cell_node.element() else { - return Err(Error::InvalidDocument); - }; - let is_header = element.name() == "th".into(); - map.insert((i, j), f(*cell_node)); - header_map.insert((i, j), is_header); - Ok(()) - })?; - let rows = map.keys().map(|(i, _)| i).max().unwrap_or(&0) + 1; - let cols = map.keys().map(|(_, j)| j).max().unwrap_or(&0) + 1; - let mut table = Table::new((rows, cols)); - for ((i, j), item) in map { - table.set(i, j, item); - } - for ((i, j), is_header) in header_map { - if is_header { - table.set_header(i, j); - } - } - Ok(table) -} - -fn extract_table_texts(node: Node) -> Result, Error> { - map_table_cell_obsoleted(node, |node| node.string_value()) -} - -fn evaluate_xpath_node<'d>( - node: impl Into>, - expr: &str, -) -> Result, sxd_xpath::Error> { - let factory = Factory::new(); - let expression = factory.build(expr)?; - let expression = expression.ok_or(sxd_xpath::Error::NoXPath)?; - let context = Context::new(); - expression - .evaluate(&context, node.into()) - .map_err(Into::into) -} - #[cfg(test)] mod tests { use super::*; + fn extract_table_texts_from_document(html: &str) -> Result>, Error> { + let package = sxd_html::parse_html(html); + let document = package.as_document(); + let tables = extract_table_nodes_to_table(document.root())?; + let tables = tables + .into_iter() + .map(|table| table.to_string_table()) + .collect(); + Ok(tables) + } + #[test] fn test_find_table_from_document() { // found 1 table @@ -171,6 +113,69 @@ mod tests { assert_eq!(result[0].to_csv().unwrap(), "1,2\n3,4\n"); } + #[test] + fn test_table_item_xpath() { + let html = r#" + + +
+        <table>
+            <tr>
+                <td class="aaa">1</td>
+                <td>
+                    <a href="https:://example.com/">Hello, World!</a>
+                </td>
+            </tr>
+            <tr>
+                <td>
+                    <p>3</p>
+                    <p>4</p>
+                </td>
+                <td>4</td>
+            </tr>
+        </table>
+ + + "#; + let package = sxd_html::parse_html(html); + let document = package.as_document(); + let tables = extract_table_nodes_to_table(document.root()).unwrap(); + assert_eq!(tables.len(), 1); + let csv1 = tables[0] + .map(|_, _, node| match node.element() { + Some(element) => { + if let Some(cls) = element.attribute_value("class") { + return cls; + } + "empty" + } + None => "empty", + }) + .to_csv(); + assert_eq!(csv1.unwrap(), "aaa,empty\nempty,empty\n"); + + let csv2 = tables[0] + .map(|_, _, node| { + for node in node.children().iter() { + if let Some(element) = node.element() { + if let Some(href) = element.attribute_value("href") { + return href.to_string(); + } + } + } + "empty".to_string() + }) + .to_csv(); + assert_eq!(csv2.unwrap(), "empty,https:://example.com/\nempty,empty\n"); + + let csv3 = tables[0] + .map(|_, _, node| node.string_value().trim().to_string()) + .to_csv(); + assert_eq!( + csv3.unwrap(), + "1,\"Hello, World!\"\n\"3\n 4\",4\n" + ); + } + #[test] fn test_rowspan_and_colspan() { let html = r#" diff --git a/src/node_utils.rs b/src/node_utils.rs index 88ff1bc..22a5cc1 100644 --- a/src/node_utils.rs +++ b/src/node_utils.rs @@ -1,21 +1,21 @@ -use std::collections::HashSet; +use std::collections::HashMap; use sxd_xpath::{nodeset::Node, Context, Factory, Value}; -use crate::{element_utils, Error}; +use crate::{element_utils, table::Table, Error}; -struct TableSupport<'a>(&'a Node<'a>); +struct TableSupport<'a>(Node<'a>); impl<'a> TableSupport<'a> { fn tr_nodes(&self) -> Result>, Error> { - let tr_nodes = match evaluate_xpath_node(*self.0, "./tbody/tr") { + let tr_nodes = match evaluate_xpath_node(self.0, "./tbody/tr") { Ok(Value::Nodeset(tr_nodes)) => tr_nodes, _ => return Err(Error::InvalidDocument), }; Ok(tr_nodes.document_order()) } - fn td_nodes(&self, tr: &Node<'a>) -> Result>, Error> { - let td_nodes = match evaluate_xpath_node(*tr, "./td|./th") { + fn td_nodes(&self, tr: Node<'a>) -> Result>, Error> { + let td_nodes = match evaluate_xpath_node(tr, "./td|./th") { Ok(Value::Nodeset(td_nodes)) => td_nodes, _ => return Err(Error::InvalidDocument), }; @@ -23,44 +23,64 @@ impl<'a> TableSupport<'a> { } } -pub fn map_table_cell(node: Node, mut f: F) -> Result<(), Error> -where - F: FnMut(&Node, usize, usize) -> T, -{ - let t = TableSupport(&node); - let mut set: HashSet<(usize, usize)> = HashSet::new(); +pub fn evaluate_xpath_node<'a>( + node: impl Into>, + expr: &str, +) -> Result, sxd_xpath::Error> { + let factory = Factory::new(); + let expression = factory.build(expr)?; + let expression = expression.ok_or(sxd_xpath::Error::NoXPath)?; + let context = Context::new(); + expression + .evaluate(&context, node.into()) + .map_err(Into::into) +} + +fn extract_table_nodes<'a>(node: impl Into>) -> Result>, Error> { + let val = evaluate_xpath_node(node, "//table").map_err(Error::XPathEvaluationError)?; + let Value::Nodeset(table_nodes) = val else { + return Err(Error::TableNotFound); + }; + Ok(table_nodes.document_order()) +} + +pub fn extract_table_nodes_to_table<'a>( + node: impl Into>, +) -> Result>>, Error> { + let mut tables = vec![]; + for node in extract_table_nodes(node)? { + tables.push(node_to_table(node)?); + } + Ok(tables) +} + +fn node_to_table<'a>(node: impl Into>) -> Result>, Error> { + let mut map: HashMap<(usize, usize), Node> = HashMap::new(); + let t = TableSupport(node.into()); for (row_index, tr_node) in t.tr_nodes()?.iter().enumerate() { - for td_node in t.td_nodes(tr_node)? { + for td_node in t.td_nodes(*tr_node)? 
{ let mut col_index = 0; let Some(element) = td_node.element() else { - return Err(Error::InvalidDocument); - }; + return Err(Error::InvalidDocument); + }; let (row_size, col_size) = element_utils::extract_rowspan_and_colspan(element); - while set.contains(&(row_index, col_index)) { + while map.contains_key(&(row_index, col_index)) { col_index += 1; } for k in 0..row_size { for l in 0..col_size { let row = row_index + k; let col = col_index + l; - set.insert((row, col)); - f(&td_node, row, col); + map.insert((row, col), td_node); } } } } - Ok(()) -} - -fn evaluate_xpath_node<'d>( - node: impl Into>, - expr: &str, -) -> Result, sxd_xpath::Error> { - let factory = Factory::new(); - let expression = factory.build(expr)?; - let expression = expression.ok_or(sxd_xpath::Error::NoXPath)?; - let context = Context::new(); - expression - .evaluate(&context, node.into()) - .map_err(Into::into) + let rows = map.keys().map(|(i, _)| i).max().unwrap_or(&0) + 1; + let cols = map.keys().map(|(_, j)| j).max().unwrap_or(&0) + 1; + let mut table = Table::new((rows, cols)); + for ((i, j), item) in map { + table.set(i, j, item); + } + Ok(table) } diff --git a/src/table.rs b/src/table.rs index ac703c4..44f2ea6 100644 --- a/src/table.rs +++ b/src/table.rs @@ -1,9 +1,10 @@ +use sxd_xpath::nodeset::Node; + use crate::Error; pub struct Table { size: (usize, usize), cells: Vec>, - headers: Vec, } impl Table { @@ -11,14 +12,6 @@ impl Table { self.cells[row * self.size.1 + col] = Some(item); } - pub fn set_header(&mut self, row: usize, col: usize) { - self.headers[row * self.size.1 + col] = true; - } - - pub fn is_header(&self, row: usize, col: usize) -> bool { - self.headers[row * self.size.1 + col] - } - pub fn rows(&self) -> Vec>> { let mut rows = vec![]; for i in 0..self.size.0 { @@ -40,11 +33,10 @@ where Self { size, cells: vec![None; size.0 * size.1], - headers: vec![false; size.0 * size.1], } } - pub fn map(&self, f: impl Fn(&T) -> T2) -> Table + pub fn map(&self, f: impl Fn(usize, usize, &T) -> T2) -> Table where T2: Clone, { @@ -54,20 +46,36 @@ where fn map_table(table: &Table, f: F) -> Table where - F: Fn(&T) -> S, + F: Fn(usize, usize, &T) -> S, S: Clone, { let mut new_table = Table::new(table.size); for i in 0..table.size.0 { for j in 0..table.size.1 { if let Some(item) = &table.cells[i * table.size.1 + j] { - new_table.set(i, j, f(item)); + new_table.set(i, j, f(i, j, item)); } } } new_table } +impl Table> { + pub fn to_string_table(&self) -> Table { + self.map(|_, _, node| node.string_value()) + } + + pub fn to_string_table_with_header(&self) -> Table<(String, bool)> { + self.map(|_, _, node| { + let Some(element) = node.element() else { + return (node.string_value(), false); + }; + let is_header = element.name() == "th".into(); + (node.string_value(), is_header) + }) + } +} + impl Table where T: std::fmt::Display, From 7eccbd69c0ebb68652e456552f91ae20e1956a2c Mon Sep 17 00:00:00 2001 From: kitsuyui Date: Sat, 7 Jan 2023 01:31:14 +0900 Subject: [PATCH 17/17] Update README --- README.md | 38 +++++++++++++++++++++++++++++++++++--- 1 file changed, 35 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 64e03a6..16c9210 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,4 @@ -# WIP: sxd_html_table - -!!! This library is still under development !!! +# sxd_html_table # Provide features related to HTML tables @@ -12,6 +10,40 @@ There are some complexities to deal with when dealing with HTML tables. This library hides these complexities and makes it easy to deal with the structure of the table. 
For example, you can convert an HTML table tag to a CSV file. +## Usage + +```rust +use sxd_html_table::Table; + +let html = r#" + + + + + + + + + +
+<table>
+    <tr>
+        <th>header1</th>
+        <th>header2</th>
+    </tr>
+    <tr>
+        <td>data1</td>
+        <td>data2</td>
+    </tr>
+</table>
+"#;
+
+fn extract_table_texts_from_document(html: &str) -> Result<Vec<Table<String>>, Error> {
+    let package = sxd_html::parse_html(html);
+    let document = package.as_document();
+    let tables = extract_table_nodes_to_table(document.root())?;
+    let tables = tables
+        .into_iter()
+        .map(|table| table.to_string_table())
+        .collect();
+    Ok(tables)
+}
+
+let tables = extract_table_texts_from_document(html).unwrap();
+let csv = tables[0].to_csv().unwrap();
+assert_eq!(csv, "header1,header2\ndata1,data2\n");
+```
 
 ## License
 
 Licensed under either of