-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1 from kitsuyui/implement
Implement main features
- Loading branch information
Showing
7 changed files
with
510 additions
and
9 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
allow-unwrap-in-tests = true | ||
allow-expect-in-tests = true |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
pub fn extract_rowspan_and_colspan(element: sxd_document::dom::Element) -> (usize, usize) { | ||
let rowspan = extract_span(element, "rowspan"); | ||
let colspan = extract_span(element, "colspan"); | ||
(rowspan, colspan) | ||
} | ||
|
||
fn extract_span(element: sxd_document::dom::Element, name: &str) -> usize { | ||
element | ||
.attribute_value(name) | ||
.unwrap_or("1") | ||
.parse::<usize>() | ||
.unwrap_or(1) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,14 +1,271 @@ | ||
pub fn add(left: usize, right: usize) -> usize { | ||
left + right | ||
pub mod element_utils; | ||
pub mod node_utils; | ||
pub mod table; | ||
pub use crate::node_utils::extract_table_nodes_to_table; | ||
pub use crate::table::Table; | ||
|
||
#[derive(Debug)] | ||
pub enum Error { | ||
TableNotFound, | ||
InvalidDocument, | ||
FailedToConvertToCSV, | ||
XPathEvaluationError(sxd_xpath::Error), | ||
} | ||
|
||
#[cfg(test)] | ||
mod tests { | ||
use super::*; | ||
|
||
fn extract_table_texts_from_document(html: &str) -> Result<Vec<Table<String>>, Error> { | ||
let package = sxd_html::parse_html(html); | ||
let document = package.as_document(); | ||
let tables = extract_table_nodes_to_table(document.root())?; | ||
let tables = tables | ||
.into_iter() | ||
.map(|table| table.to_string_table()) | ||
.collect(); | ||
Ok(tables) | ||
} | ||
|
||
#[test] | ||
fn it_works() { | ||
let result = add(2, 2); | ||
assert_eq!(result, 4); | ||
fn test_find_table_from_document() { | ||
// found 1 table | ||
let html = r#" | ||
<html> | ||
<body> | ||
<table> | ||
<tr> | ||
<td>1</td> | ||
<td>2</td> | ||
</tr> | ||
</table> | ||
</body> | ||
</html> | ||
"#; | ||
let result = extract_table_texts_from_document(html).unwrap(); | ||
assert_eq!(result.len(), 1); | ||
assert_eq!(result[0].to_csv().unwrap(), "1,2\n"); | ||
|
||
// found 2 tables | ||
let html = r#" | ||
<html> | ||
<body> | ||
<table> | ||
<tr> | ||
<td>1</td> | ||
<td>2</td> | ||
</tr> | ||
</table> | ||
<table> | ||
<tr> | ||
<td>3</td> | ||
<td>4</td> | ||
</tr> | ||
</table> | ||
</body> | ||
</html> | ||
"#; | ||
let result = extract_table_texts_from_document(html).unwrap(); | ||
assert_eq!(result.len(), 2); | ||
assert_eq!(result[0].to_csv().unwrap(), "1,2\n",); | ||
assert_eq!(result[1].to_csv().unwrap(), "3,4\n",); | ||
|
||
// found 0 table | ||
let html = r#" | ||
<html> | ||
<body> | ||
<div> | ||
<p>1</p> | ||
<p>2</p> | ||
</div> | ||
</body> | ||
</html> | ||
"#; | ||
let result = extract_table_texts_from_document(html).unwrap(); | ||
assert_eq!(result.len(), 0); | ||
|
||
// empty html | ||
let html = r#""#; | ||
let result = extract_table_texts_from_document(html).unwrap(); | ||
assert_eq!(result.len(), 0); | ||
} | ||
|
||
#[test] | ||
fn test_td_and_th() { | ||
let html = r#" | ||
<html> | ||
<body> | ||
<table> | ||
<tr> | ||
<th>1</th> | ||
<td>2</td> | ||
</tr> | ||
<tr> | ||
<td>3</td> | ||
<td>4</td> | ||
</tr> | ||
</table> | ||
</body> | ||
</html> | ||
"#; | ||
let result = extract_table_texts_from_document(html).unwrap(); | ||
assert_eq!(result.len(), 1); | ||
assert_eq!(result[0].to_csv().unwrap(), "1,2\n3,4\n"); | ||
} | ||
|
||
#[test] | ||
fn test_table_item_xpath() { | ||
let html = r#" | ||
<html> | ||
<body> | ||
<table> | ||
<tr> | ||
<th class="aaa">1</th> | ||
<td><a href="https:://example.com/">Hello, World!</a></td> | ||
</tr> | ||
<tr> | ||
<td> | ||
<div> | ||
<p>3</p> | ||
<p>4</p> | ||
</div> | ||
</td> | ||
<td>4</td> | ||
</tr> | ||
</table> | ||
</body> | ||
</html> | ||
"#; | ||
let package = sxd_html::parse_html(html); | ||
let document = package.as_document(); | ||
let tables = extract_table_nodes_to_table(document.root()).unwrap(); | ||
assert_eq!(tables.len(), 1); | ||
let csv1 = tables[0] | ||
.map(|_, _, node| match node.element() { | ||
Some(element) => { | ||
if let Some(cls) = element.attribute_value("class") { | ||
return cls; | ||
} | ||
"empty" | ||
} | ||
None => "empty", | ||
}) | ||
.to_csv(); | ||
assert_eq!(csv1.unwrap(), "aaa,empty\nempty,empty\n"); | ||
|
||
let csv2 = tables[0] | ||
.map(|_, _, node| { | ||
for node in node.children().iter() { | ||
if let Some(element) = node.element() { | ||
if let Some(href) = element.attribute_value("href") { | ||
return href.to_string(); | ||
} | ||
} | ||
} | ||
"empty".to_string() | ||
}) | ||
.to_csv(); | ||
assert_eq!(csv2.unwrap(), "empty,https:://example.com/\nempty,empty\n"); | ||
|
||
let csv3 = tables[0] | ||
.map(|_, _, node| node.string_value().trim().to_string()) | ||
.to_csv(); | ||
assert_eq!( | ||
csv3.unwrap(), | ||
"1,\"Hello, World!\"\n\"3\n 4\",4\n" | ||
); | ||
} | ||
|
||
#[test] | ||
fn test_rowspan_and_colspan() { | ||
let html = r#" | ||
<html> | ||
<body> | ||
<table> | ||
<tr> | ||
<td rowspan="2">A</td> | ||
<td>B</td> | ||
</tr> | ||
<tr> | ||
<td>C</td> | ||
</tr> | ||
</table> | ||
</body> | ||
</html> | ||
"#; | ||
let result = extract_table_texts_from_document(html).unwrap(); | ||
assert_eq!(result.len(), 1); | ||
assert_eq!(result[0].to_csv().unwrap(), "A,B\nA,C\n"); | ||
|
||
let html = r#" | ||
<html> | ||
<body> | ||
<table> | ||
<tr> | ||
<td colspan="2">A</td> | ||
<td>B</td> | ||
</tr> | ||
<tr> | ||
<td>C</td> | ||
<td>D</td> | ||
<td>E</td> | ||
</tr> | ||
</table> | ||
</body> | ||
</html> | ||
"#; | ||
let result = extract_table_texts_from_document(html).unwrap(); | ||
assert_eq!(result.len(), 1); | ||
assert_eq!(result[0].to_csv().unwrap(), "A,A,B\nC,D,E\n"); | ||
|
||
// more complex | ||
let html = r#" | ||
<html> | ||
<body> | ||
<table> | ||
<tr> | ||
<td rowspan="2" colspan="2">A</td> | ||
<td>B</td> | ||
</tr> | ||
<tr><td>C</td></tr> | ||
</table> | ||
<table> | ||
<tr><td>a</td><td>b</td><td>c</td></tr> | ||
<tr><td>d</td><td>e</td><td>f</td></tr> | ||
</table> | ||
<table> | ||
<tr><td>a</td><td>b</td><td>c</td><td rowspan="2">d</td></tr> | ||
<tr><td>e</td><td colspan="2">f</td></tr> | ||
<tr><td>i</td><td>j</td><td>k</td><td>l</td></tr> | ||
</table> | ||
<table> | ||
<tr><td>a</td><td>b</td><td rowspan="2">c</td><td>d</td></tr> | ||
<tr><td>e</td><td colspan="3">f</td></tr> | ||
<tr><td>i</td><td>j</td><td>k</td><td>l</td></tr> | ||
</table> | ||
<table> | ||
<tr><td>a</td><td>b</td><td>c</td><td>d</td></tr> | ||
<tr><td>e</td><td rowspan="2" colspan="2">f</td><td>g</td></tr> | ||
<tr><td>h</td><td>i</td></tr> | ||
</table> | ||
<!-- invalid rowspan --> | ||
<table> | ||
<tr><td>a</td><td>b</td><td>c</td><td>d</td></tr> | ||
<tr><td>e</td><td rowspan="a" colspan="b">f</td><td>g</td></tr> | ||
<tr><td>h</td><td>i</td></tr> | ||
</table> | ||
</body> | ||
</html> | ||
"#; | ||
|
||
let result = extract_table_texts_from_document(html).unwrap(); | ||
assert_eq!(result.len(), 6); | ||
assert_eq!(result[0].to_csv().unwrap(), "A,A,B\nA,A,C\n"); | ||
assert_eq!(result[1].to_csv().unwrap(), "a,b,c\nd,e,f\n"); | ||
assert_eq!(result[2].to_csv().unwrap(), "a,b,c,d\ne,f,f,d\ni,j,k,l\n"); | ||
assert_eq!(result[3].to_csv().unwrap(), "a,b,c,d\ne,f,f,f\ni,j,k,l\n"); | ||
assert_eq!(result[4].to_csv().unwrap(), "a,b,c,d\ne,f,f,g\nh,f,f,i\n"); | ||
assert_eq!(result[5].to_csv().unwrap(), "a,b,c,d\ne,f,g,\nh,i,,\n"); | ||
} | ||
} |
Oops, something went wrong.