Skip to content

Commit

Permalink
Merge pull request #1 from kitsuyui/implement
Browse files Browse the repository at this point in the history
Implement main features
  • Loading branch information
kitsuyui authored Jan 6, 2023
2 parents b551d93 + 7eccbd6 commit 6f527f2
Show file tree
Hide file tree
Showing 7 changed files with 510 additions and 9 deletions.
5 changes: 4 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,7 @@ repository = "https://github.com/kitsuyui/sxd_html_table"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
sxd_html = "0.1.0"
csv = "1.1.6"
sxd-document = "0.3.2"
sxd-xpath = "0.4.2"
sxd_html = "0.1.1"
38 changes: 35 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
# WIP: sxd_html_table

!!! This library is still under development !!!
# sxd_html_table

# Provide features related to HTML tables

Expand All @@ -12,6 +10,40 @@ There are some complexities to deal with when dealing with HTML tables.
This library hides these complexities and makes it easy to deal with the structure of the table.
For example, you can convert an HTML table tag to a CSV file.

## Usage

```rust
use sxd_html_table::{extract_table_nodes_to_table, Error, Table};

let html = r#"
<table>
<tr>
<th>header1</th>
<th>header2</th>
</tr>
<tr>
<td>data1</td>
<td>data2</td>
</tr>
</table>
"#;

fn extract_table_texts_from_document(html: &str) -> Result<Vec<Table<String>>, Error> {
let package = sxd_html::parse_html(html);
let document = package.as_document();
let tables = extract_table_nodes_to_table(document.root())?;
let tables = tables
.into_iter()
.map(|table| table.to_string_table())
.collect();
Ok(tables)
}

let tables = extract_table_texts_from_document(html).unwrap();
let csv = tables[0].to_csv().unwrap();
assert_eq!(csv, "header1,header2\ndata1,data2\n");
```

## License

Licensed under either of
Expand Down
2 changes: 2 additions & 0 deletions clippy.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
allow-unwrap-in-tests = true
allow-expect-in-tests = true
13 changes: 13 additions & 0 deletions src/element_utils.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
/// Returns the `(rowspan, colspan)` pair declared on `element`.
///
/// Both values are read through `extract_span`, so each one defaults to 1
/// when its attribute is missing or cannot be parsed as a `usize`.
pub fn extract_rowspan_and_colspan(element: sxd_document::dom::Element) -> (usize, usize) {
    // `rowspan` is looked up first, then `colspan`, matching the tuple order.
    (
        extract_span(element, "rowspan"),
        extract_span(element, "colspan"),
    )
}

/// Reads the span attribute `name` (e.g. `"rowspan"` or `"colspan"`) from
/// `element` and returns it as a cell count.
///
/// The result is always at least 1. The fallback value 1 is used when:
/// - the attribute is absent,
/// - the value (after trimming surrounding whitespace) is not a valid `usize`,
/// - the value is `0`, which would otherwise describe a cell covering no
///   grid positions at all.
fn extract_span(element: sxd_document::dom::Element, name: &str) -> usize {
    element
        .attribute_value(name)
        // Tolerate values such as `" 2 "`; anything unparsable becomes `None`.
        .and_then(|raw| raw.trim().parse::<usize>().ok())
        // A zero span is not usable as a cell size; fall back to the default.
        .filter(|&span| span > 0)
        .unwrap_or(1)
}
267 changes: 262 additions & 5 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,14 +1,271 @@
pub fn add(left: usize, right: usize) -> usize {
left + right
pub mod element_utils;
pub mod node_utils;
pub mod table;
pub use crate::node_utils::extract_table_nodes_to_table;
pub use crate::table::Table;

/// Errors reported by this crate.
///
/// Implements [`std::fmt::Display`] and [`std::error::Error`] so values can be
/// propagated with `?` into `Box<dyn std::error::Error>` and similar
/// error-reporting types.
#[derive(Debug)]
pub enum Error {
    /// No table was found (presumably in the searched document or node;
    /// the raising site is outside this file).
    TableNotFound,
    /// The document could not be processed as expected.
    InvalidDocument,
    /// Converting a table to CSV failed.
    FailedToConvertToCSV,
    /// An error returned by `sxd_xpath`.
    XPathEvaluationError(sxd_xpath::Error),
}

impl std::fmt::Display for Error {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Error::TableNotFound => f.write_str("table not found"),
            Error::InvalidDocument => f.write_str("invalid document"),
            Error::FailedToConvertToCSV => f.write_str("failed to convert to CSV"),
            // Only `Debug` is relied upon for the wrapped error, which the
            // `#[derive(Debug)]` above already requires.
            Error::XPathEvaluationError(err) => {
                write!(f, "XPath evaluation error: {:?}", err)
            }
        }
    }
}

impl std::error::Error for Error {}

#[cfg(test)]
mod tests {
use super::*;

/// Test helper: parses `html`, extracts every table reachable from the
/// document root, and converts each one into a `Table<String>`.
///
/// Any error from `extract_table_nodes_to_table` is propagated unchanged.
fn extract_table_texts_from_document(html: &str) -> Result<Vec<Table<String>>, Error> {
    let package = sxd_html::parse_html(html);
    let doc = package.as_document();
    let node_tables = extract_table_nodes_to_table(doc.root())?;
    Ok(node_tables
        .into_iter()
        .map(|node_table| node_table.to_string_table())
        .collect())
}

#[test]
fn it_works() {
let result = add(2, 2);
assert_eq!(result, 4);
fn test_find_table_from_document() {
    // One table in the document -> exactly one extracted table.
    let single_table_html = r#"
<html>
<body>
<table>
<tr>
<td>1</td>
<td>2</td>
</tr>
</table>
</body>
</html>
"#;
    let single = extract_table_texts_from_document(single_table_html).unwrap();
    assert_eq!(single.len(), 1);
    assert_eq!(single[0].to_csv().unwrap(), "1,2\n");

    // Two sibling tables -> two extracted tables, in document order.
    let two_tables_html = r#"
<html>
<body>
<table>
<tr>
<td>1</td>
<td>2</td>
</tr>
</table>
<table>
<tr>
<td>3</td>
<td>4</td>
</tr>
</table>
</body>
</html>
"#;
    let pair = extract_table_texts_from_document(two_tables_html).unwrap();
    assert_eq!(pair.len(), 2);
    assert_eq!(pair[0].to_csv().unwrap(), "1,2\n");
    assert_eq!(pair[1].to_csv().unwrap(), "3,4\n");

    // A document without any <table> -> an empty result, not an error.
    let no_table_html = r#"
<html>
<body>
<div>
<p>1</p>
<p>2</p>
</div>
</body>
</html>
"#;
    let none_found = extract_table_texts_from_document(no_table_html).unwrap();
    assert!(none_found.is_empty());

    // Empty input is also accepted and yields no tables.
    let from_empty = extract_table_texts_from_document("").unwrap();
    assert!(from_empty.is_empty());
}

/// `<th>` and `<td>` cells are both treated as ordinary table cells.
#[test]
fn test_td_and_th() {
    let mixed_cells_html = r#"
<html>
<body>
<table>
<tr>
<th>1</th>
<td>2</td>
</tr>
<tr>
<td>3</td>
<td>4</td>
</tr>
</table>
</body>
</html>
"#;
    let tables = extract_table_texts_from_document(mixed_cells_html).unwrap();
    assert_eq!(tables.len(), 1);

    let csv = tables[0].to_csv().unwrap();
    assert_eq!(csv, "1,2\n3,4\n");
}

// Exercises `Table::map` on the node-level table: each cell node is mapped to a
// custom value (attribute, child attribute, or text) before CSV conversion.
#[test]
fn test_table_item_xpath() {
let html = r#"
<html>
<body>
<table>
<tr>
<th class="aaa">1</th>
<td><a href="https:://example.com/">Hello, World!</a></td>
</tr>
<tr>
<td>
<div>
<p>3</p>
<p>4</p>
</div>
</td>
<td>4</td>
</tr>
</table>
</body>
</html>
"#;
// Work on the node table directly (no `to_string_table`) so the closures
// below receive the original DOM nodes.
let package = sxd_html::parse_html(html);
let document = package.as_document();
let tables = extract_table_nodes_to_table(document.root()).unwrap();
assert_eq!(tables.len(), 1);
// csv1: each cell becomes the value of its own `class` attribute, or
// "empty" when the node is not an element or has no `class`.
let csv1 = tables[0]
.map(|_, _, node| match node.element() {
Some(element) => {
if let Some(cls) = element.attribute_value("class") {
return cls;
}
"empty"
}
None => "empty",
})
.to_csv();
assert_eq!(csv1.unwrap(), "aaa,empty\nempty,empty\n");

// csv2: each cell becomes the `href` of its first direct child element that
// carries one, or "empty" when no child has an `href`.
let csv2 = tables[0]
.map(|_, _, node| {
for node in node.children().iter() {
if let Some(element) = node.element() {
if let Some(href) = element.attribute_value("href") {
return href.to_string();
}
}
}
"empty".to_string()
})
.to_csv();
assert_eq!(csv2.unwrap(), "empty,https:://example.com/\nempty,empty\n");

// csv3: each cell becomes its trimmed string value.
// NOTE(review): the whitespace expected between "3" and "4" depends on the
// exact whitespace between the <p> elements in the fixture above — confirm
// against the real source file.
let csv3 = tables[0]
.map(|_, _, node| node.string_value().trim().to_string())
.to_csv();
assert_eq!(
csv3.unwrap(),
"1,\"Hello, World!\"\n\"3\n 4\",4\n"
);
}

/// Cells with `rowspan` / `colspan` are repeated into every grid position
/// they cover; invalid span values behave like a span of 1.
#[test]
fn test_rowspan_and_colspan() {
    // rowspan only: "A" is repeated in the second row.
    let rowspan_html = r#"
<html>
<body>
<table>
<tr>
<td rowspan="2">A</td>
<td>B</td>
</tr>
<tr>
<td>C</td>
</tr>
</table>
</body>
</html>
"#;
    let rowspan_tables = extract_table_texts_from_document(rowspan_html).unwrap();
    assert_eq!(rowspan_tables.len(), 1);
    assert_eq!(rowspan_tables[0].to_csv().unwrap(), "A,B\nA,C\n");

    // colspan only: "A" is repeated in the second column.
    let colspan_html = r#"
<html>
<body>
<table>
<tr>
<td colspan="2">A</td>
<td>B</td>
</tr>
<tr>
<td>C</td>
<td>D</td>
<td>E</td>
</tr>
</table>
</body>
</html>
"#;
    let colspan_tables = extract_table_texts_from_document(colspan_html).unwrap();
    assert_eq!(colspan_tables.len(), 1);
    assert_eq!(colspan_tables[0].to_csv().unwrap(), "A,A,B\nC,D,E\n");

    // Several tables combining both attributes, plus one with invalid values.
    let combined_html = r#"
<html>
<body>
<table>
<tr>
<td rowspan="2" colspan="2">A</td>
<td>B</td>
</tr>
<tr><td>C</td></tr>
</table>
<table>
<tr><td>a</td><td>b</td><td>c</td></tr>
<tr><td>d</td><td>e</td><td>f</td></tr>
</table>
<table>
<tr><td>a</td><td>b</td><td>c</td><td rowspan="2">d</td></tr>
<tr><td>e</td><td colspan="2">f</td></tr>
<tr><td>i</td><td>j</td><td>k</td><td>l</td></tr>
</table>
<table>
<tr><td>a</td><td>b</td><td rowspan="2">c</td><td>d</td></tr>
<tr><td>e</td><td colspan="3">f</td></tr>
<tr><td>i</td><td>j</td><td>k</td><td>l</td></tr>
</table>
<table>
<tr><td>a</td><td>b</td><td>c</td><td>d</td></tr>
<tr><td>e</td><td rowspan="2" colspan="2">f</td><td>g</td></tr>
<tr><td>h</td><td>i</td></tr>
</table>
<!-- invalid rowspan -->
<table>
<tr><td>a</td><td>b</td><td>c</td><td>d</td></tr>
<tr><td>e</td><td rowspan="a" colspan="b">f</td><td>g</td></tr>
<tr><td>h</td><td>i</td></tr>
</table>
</body>
</html>
"#;

    // Expected CSV for each table above, in document order.
    let expected_csvs = [
        "A,A,B\nA,A,C\n",
        "a,b,c\nd,e,f\n",
        "a,b,c,d\ne,f,f,d\ni,j,k,l\n",
        "a,b,c,d\ne,f,f,f\ni,j,k,l\n",
        "a,b,c,d\ne,f,f,g\nh,f,f,i\n",
        "a,b,c,d\ne,f,g,\nh,i,,\n",
    ];

    let combined_tables = extract_table_texts_from_document(combined_html).unwrap();
    assert_eq!(combined_tables.len(), expected_csvs.len());
    for (table, want) in combined_tables.iter().zip(expected_csvs.iter()) {
        assert_eq!(table.to_csv().unwrap(), *want);
    }
}
}
Loading

0 comments on commit 6f527f2

Please sign in to comment.