diff --git a/.vscode/launch.json b/.vscode/launch.json index cce2776..a2ed34b 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -98,6 +98,7 @@ "hn.html", "--log-level", "DEBUG", + "--show-simplified", "--config", "/tmp/html-extraction-2025-01-26-17-42-34.yaml" ], diff --git a/changelog.md b/changelog.md index f0536fd..95f40e1 100644 --- a/changelog.md +++ b/changelog.md @@ -690,4 +690,19 @@ Added debug mode and tracing capabilities to ShellToolProvider for better debugg - Added debug mode to log detailed information about tool calls and arguments - Added tracing directory support to save input/output JSON files for each tool call - Implemented functional options pattern for configuration -- Added timestamp-based file naming for trace files \ No newline at end of file +- Added timestamp-based file naming for trace files + +# Switch to html-to-markdown Library + +Replaced manual markdown conversion with html-to-markdown library for better and more consistent HTML to Markdown conversion: +- Removed manual markdown conversion code and maps +- Added html-to-markdown library integration +- Simplified TextSimplifier implementation +- Maintained existing logging and error handling + +# HTML Selector Show Simplified Flag + +Added a new flag to control whether simplified HTML is shown in the output to reduce verbosity. 
+ +- Added `--show-simplified` flag (default: false) to control whether simplified HTML is included in output +- Modified output to only include simplified HTML and context when explicitly requested \ No newline at end of file diff --git a/cmd/tools/test-html-selector/main.go b/cmd/tools/test-html-selector/main.go index e2c602d..a71d485 100644 --- a/cmd/tools/test-html-selector/main.go +++ b/cmd/tools/test-html-selector/main.go @@ -60,8 +60,8 @@ type SimplifiedResult struct { type SourceResult struct { Source string `yaml:"source"` - Data map[string][]interface{} `yaml:"data"` - SelectorResults []SelectorResult `yaml:"selector_results"` + Data map[string][]interface{} `yaml:"data,omitempty"` + SelectorResults []SelectorResult `yaml:"selector_results,omitempty"` } type HTMLSelectorCommand struct { @@ -79,6 +79,7 @@ type HTMLSelectorSettings struct { ExtractTemplate string `glazed.parameter:"extract-template"` ShowContext bool `glazed.parameter:"show-context"` ShowPath bool `glazed.parameter:"show-path"` + ShowSimplified bool `glazed.parameter:"show-simplified"` SampleCount int `glazed.parameter:"sample-count"` ContextChars int `glazed.parameter:"context-chars"` StripScripts bool `glazed.parameter:"strip-scripts"` @@ -155,6 +156,12 @@ It provides match counts and contextual examples to verify selector accuracy.`), parameters.WithHelp("Show path to matched elements"), parameters.WithDefault(true), ), + parameters.NewParameterDefinition( + "show-simplified", + parameters.ParameterTypeBool, + parameters.WithHelp("Show simplified HTML in output"), + parameters.WithDefault(false), + ), parameters.NewParameterDefinition( "sample-count", parameters.ParameterTypeInteger, @@ -290,7 +297,7 @@ func (c *HTMLSelectorCommand) RunIntoWriter( MaxTableRows: s.MaxTableRows, }) - var sourceResults []SourceResult + var sourceResults []*SourceResult // Process files for _, file := range s.Files { @@ -312,6 +319,11 @@ func (c *HTMLSelectorCommand) RunIntoWriter( // If using extract or 
extract-template, process all matches without sample limit if s.Extract || s.ExtractTemplate != "" { + // clear the selector results + for _, sourceResult := range sourceResults { + sourceResult.SelectorResults = []SelectorResult{} + } + // If extract-data is true, output raw data regardless of templates if s.ExtractData { return yaml.NewEncoder(w).Encode(sourceResults) @@ -373,9 +385,12 @@ func (c *HTMLSelectorCommand) RunIntoWriter( } sample := SimplifiedSample{ - SimplifiedHTML: htmlDocs, - HTML: selectorSample.HTML, - Markdown: markdown, + HTML: selectorSample.HTML, + Markdown: markdown, + } + + if s.ShowSimplified { + sample.SimplifiedHTML = htmlDocs } if s.ShowPath { @@ -386,7 +401,9 @@ func (c *HTMLSelectorCommand) RunIntoWriter( if err != nil { return fmt.Errorf("failed to process HTML: %w", err) } - sample.SimplifiedContext = htmlDocs + if s.ShowSimplified { + sample.SimplifiedContext = htmlDocs + } sample.Context = selectorSample.Context } newResults[selectorResult.Name].Samples = append(newResults[selectorResult.Name].Samples, sample) @@ -413,9 +430,10 @@ func processSource( selectors []Selector, s *HTMLSelectorSettings, simplifier *htmlsimplifier.Simplifier, -) (SourceResult, error) { - var result SourceResult - result.Source = source +) (*SourceResult, error) { + result := &SourceResult{ + Source: source, + } var f io.ReadCloser var err error @@ -468,20 +486,22 @@ func processSource( for _, r := range results { var matches []interface{} for _, selectorSample := range r.Samples { - // Process HTML content - htmlDocs, err := simplifier.ProcessHTML(selectorSample.HTML) - if err == nil { - - for _, doc := range htmlDocs { - if doc.Text != "" { - matches = append(matches, doc.Text) - } else if doc.Markdown != "" { - matches = append(matches, doc.Markdown) - } else { - matches = append(matches, doc) + // Convert sample to markdown if requested + if s.Markdown { + // Create markdown converter + converter := md.NewConverter("", true, nil) + var markdown string 
+ + // Convert HTML to markdown if present + if selectorSample.HTML != "" { + markdown, err = converter.ConvertString(selectorSample.HTML) + if err == nil { + matches = append(matches, markdown) + continue } } } + matches = append(matches, selectorSample.HTML) } result.Data[r.Name] = matches } diff --git a/pkg/htmlsimplifier/node_handler.go b/pkg/htmlsimplifier/node_handler.go index b6ab81e..10deb65 100644 --- a/pkg/htmlsimplifier/node_handler.go +++ b/pkg/htmlsimplifier/node_handler.go @@ -133,6 +133,13 @@ func (h *NodeHandler) IsTextOnly(node *html.Node) bool { return false } + // Nodes with a class or id attribute are not text-only, even when their strategy is text-only + for _, attr := range node.Attr { + if attr.Key == "class" || attr.Key == "id" { + return false + } + } + // Check all children for child := node.FirstChild; child != nil; child = child.NextSibling { if !h.IsTextOnly(child) { @@ -166,6 +173,12 @@ func (h *NodeHandler) IsMarkdownable(node *html.Node) bool { // For non-markdown elements that are text-only, we need to check if they contain any non-markdown elements if strategy == StrategyTextOnly { + // Text-only-strategy nodes with a class or id attribute are not markdownable + for _, attr := range node.Attr { + if attr.Key == "class" || attr.Key == "id" { + return false + } + } for child := node.FirstChild; child != nil; child = child.NextSibling { if child.Type == html.ElementNode { childStrategy := h.GetStrategy(child) diff --git a/pkg/htmlsimplifier/simplifier.go b/pkg/htmlsimplifier/simplifier.go index a795511..3f9173e 100644 --- a/pkg/htmlsimplifier/simplifier.go +++ b/pkg/htmlsimplifier/simplifier.go @@ -255,6 +255,12 @@ func (s *Simplifier) processNode(node *html.Node) []Document { }} } } + + // If node has class or id, skip the text extraction below and break out to default processing + if len(classes) > 0 || id != "" { + break + } + // If text simplification fails or is disabled, extract text normally text := s.textSimplifier.ExtractText(node) if text != "" { diff --git 
a/pkg/htmlsimplifier/text_simplifier.go b/pkg/htmlsimplifier/text_simplifier.go index 71aca8e..30bea0c 100644 --- a/pkg/htmlsimplifier/text_simplifier.go +++ b/pkg/htmlsimplifier/text_simplifier.go @@ -1,9 +1,9 @@ package htmlsimplifier import ( - "fmt" "strings" + md "github.com/JohannesKaufmann/html-to-markdown" "github.com/rs/zerolog/log" "golang.org/x/net/html" ) @@ -12,6 +12,7 @@ import ( type TextSimplifier struct { markdownEnabled bool nodeHandler *NodeHandler + mdConverter *md.Converter } // NewTextSimplifier creates a new text simplifier @@ -20,39 +21,10 @@ func NewTextSimplifier(markdownEnabled bool) *TextSimplifier { return &TextSimplifier{ markdownEnabled: markdownEnabled, nodeHandler: NewNodeHandler(opts), + mdConverter: md.NewConverter("", true, nil), } } -// MarkdownElements defines HTML elements that can be converted to markdown -var MarkdownElements = map[string]bool{ - "a": true, // Links (only within p or span) - "strong": true, // Bold text - "em": true, // Italic text - "b": true, // Bold text (alternative) - "i": true, // Italic text (alternative) - "code": true, // Code snippets -} - -// MarkdownStart defines the opening markdown syntax for each element type -var MarkdownStart = map[string]string{ - "a": "[", // Links - "strong": "**", // Bold text - "em": "*", // Italic text - "b": "**", // Bold text (alternative) - "i": "*", // Italic text (alternative) - "code": "`", // Code snippets -} - -// MarkdownEnd defines the closing markdown syntax for each element type -var MarkdownEnd = map[string]string{ - "a": "](%s)", // Links (format with href) - "strong": "**", // Bold text - "em": "*", // Italic text - "b": "**", // Bold text (alternative) - "i": "*", // Italic text (alternative) - "code": "`", // Code snippets -} - // SimplifyText attempts to convert a node and its children to a single text string func (t *TextSimplifier) SimplifyText(node *html.Node) (string, bool) { if node == nil { @@ -161,95 +133,29 @@ func (t *TextSimplifier) 
ConvertToMarkdown(node *html.Node) (string, bool) { return text, true } - // Check if markdown is enabled for this node - if !t.markdownEnabled && MarkdownElements[node.Data] { - log.Trace().Str("node_type", node.Data).Msg("ConvertToMarkdown: markdown disabled for this element") + // Convert the node to HTML string + var buf strings.Builder + err := html.Render(&buf, node) + if err != nil { + log.Error().Err(err).Msg("ConvertToMarkdown: failed to render HTML") return "", false } - log.Trace().Str("node_type", node.Data).Msg("ConvertToMarkdown: processing element node") - - // Process children first - var parts []string - for child := node.FirstChild; child != nil; child = child.NextSibling { - if child.Type == html.TextNode { - text := strings.TrimSpace(child.Data) - if text != "" { - parts = append(parts, text) - log.Trace().Str("text", text).Msg("ConvertToMarkdown: added text node content") - } - continue - } - - switch child.Data { - case "a": - href := "" - for _, attr := range child.Attr { - if attr.Key == "href" { - href = attr.Val - break - } - } - text, ok := t.ConvertToMarkdown(child) - if !ok || text == "" { - log.Trace().Msg("ConvertToMarkdown: failed to process link content") - return "", false - } - link := fmt.Sprintf("[%s](%s)", text, href) - parts = append(parts, link) - log.Trace().Str("link", link).Msg("ConvertToMarkdown: processed link") - case "strong", "b": - text, ok := t.ConvertToMarkdown(child) - if !ok || text == "" { - log.Trace().Msg("ConvertToMarkdown: failed to process strong/bold content") - return "", false - } - bold := fmt.Sprintf("**%s**", text) - parts = append(parts, bold) - log.Trace().Str("bold", bold).Msg("ConvertToMarkdown: processed strong/bold") - case "em", "i": - text, ok := t.ConvertToMarkdown(child) - if !ok || text == "" { - log.Trace().Msg("ConvertToMarkdown: failed to process emphasis content") - return "", false - } - em := fmt.Sprintf("*%s*", text) - parts = append(parts, em) - log.Trace().Str("emphasis", 
em).Msg("ConvertToMarkdown: processed emphasis") - case "code": - text, ok := t.ConvertToMarkdown(child) - if !ok || text == "" { - log.Trace().Msg("ConvertToMarkdown: failed to process code content") - return "", false - } - code := fmt.Sprintf("`%s`", text) - parts = append(parts, code) - log.Trace().Str("code", code).Msg("ConvertToMarkdown: processed code") - case "br": - parts = append(parts, "\n") - log.Trace().Msg("ConvertToMarkdown: processed line break") - default: - text, ok := t.ConvertToMarkdown(child) - if !ok { - log.Trace().Str("node_type", child.Data).Msg("ConvertToMarkdown: failed to process unknown element") - return "", false - } - if text != "" { - parts = append(parts, text) - log.Trace().Str("text", text).Msg("ConvertToMarkdown: processed unknown element") - } - } + // Convert to markdown using html-to-markdown + markdown, err := t.mdConverter.ConvertString(buf.String()) + if err != nil { + log.Error().Err(err).Msg("ConvertToMarkdown: failed to convert to markdown") + return "", false } - result := strings.Join(parts, " ") - if result == "" { + if markdown == "" { log.Trace().Msg("ConvertToMarkdown: empty result") return "", false } // replace ' \n ' with '\n' - result = strings.ReplaceAll(result, " \n ", "\n") + markdown = strings.ReplaceAll(markdown, " \n ", "\n") - log.Trace().Str("result", result).Msg("ConvertToMarkdown: final result") - return result, true + log.Trace().Str("result", markdown).Msg("ConvertToMarkdown: final result") + return markdown, true }