Skip to content

Commit

Permalink
💩 Continue hacking on output format
Browse files Browse the repository at this point in the history
  • Loading branch information
wesen committed Jan 26, 2025
1 parent 4c30b52 commit 882ad29
Show file tree
Hide file tree
Showing 6 changed files with 94 additions and 133 deletions.
1 change: 1 addition & 0 deletions .vscode/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@
"hn.html",
"--log-level",
"DEBUG",
"--show-simplified",
"--config",
"/tmp/html-extraction-2025-01-26-17-42-34.yaml"
],
Expand Down
17 changes: 16 additions & 1 deletion changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -690,4 +690,19 @@ Added debug mode and tracing capabilities to ShellToolProvider for better debugg
- Added debug mode to log detailed information about tool calls and arguments
- Added tracing directory support to save input/output JSON files for each tool call
- Implemented functional options pattern for configuration
- Added timestamp-based file naming for trace files
- Added timestamp-based file naming for trace files

# Switch to html-to-markdown Library

Replaced the manual markdown conversion with the html-to-markdown library for better and more consistent HTML-to-Markdown conversion:
- Removed manual markdown conversion code and maps
- Added html-to-markdown library integration
- Simplified TextSimplifier implementation
- Maintained existing logging and error handling

# HTML Selector Show Simplified Flag

Added a new flag that controls whether simplified HTML is shown in the output; it is off by default to reduce verbosity.

- Added `--show-simplified` flag (default: false) to control whether simplified HTML is included in output
- Modified output to only include simplified HTML and context when explicitly requested
62 changes: 41 additions & 21 deletions cmd/tools/test-html-selector/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,8 @@ type SimplifiedResult struct {

type SourceResult struct {
Source string `yaml:"source"`
Data map[string][]interface{} `yaml:"data"`
SelectorResults []SelectorResult `yaml:"selector_results"`
Data map[string][]interface{} `yaml:"data,omitempty"`
SelectorResults []SelectorResult `yaml:"selector_results,omitempty"`
}

type HTMLSelectorCommand struct {
Expand All @@ -79,6 +79,7 @@ type HTMLSelectorSettings struct {
ExtractTemplate string `glazed.parameter:"extract-template"`
ShowContext bool `glazed.parameter:"show-context"`
ShowPath bool `glazed.parameter:"show-path"`
ShowSimplified bool `glazed.parameter:"show-simplified"`
SampleCount int `glazed.parameter:"sample-count"`
ContextChars int `glazed.parameter:"context-chars"`
StripScripts bool `glazed.parameter:"strip-scripts"`
Expand Down Expand Up @@ -155,6 +156,12 @@ It provides match counts and contextual examples to verify selector accuracy.`),
parameters.WithHelp("Show path to matched elements"),
parameters.WithDefault(true),
),
parameters.NewParameterDefinition(
"show-simplified",
parameters.ParameterTypeBool,
parameters.WithHelp("Show simplified HTML in output"),
parameters.WithDefault(false),
),
parameters.NewParameterDefinition(
"sample-count",
parameters.ParameterTypeInteger,
Expand Down Expand Up @@ -290,7 +297,7 @@ func (c *HTMLSelectorCommand) RunIntoWriter(
MaxTableRows: s.MaxTableRows,
})

var sourceResults []SourceResult
var sourceResults []*SourceResult

// Process files
for _, file := range s.Files {
Expand All @@ -312,6 +319,11 @@ func (c *HTMLSelectorCommand) RunIntoWriter(

// If using extract or extract-template, process all matches without sample limit
if s.Extract || s.ExtractTemplate != "" {
// clear the selector results
for _, sourceResult := range sourceResults {
sourceResult.SelectorResults = []SelectorResult{}
}

// If extract-data is true, output raw data regardless of templates
if s.ExtractData {
return yaml.NewEncoder(w).Encode(sourceResults)
Expand Down Expand Up @@ -373,9 +385,12 @@ func (c *HTMLSelectorCommand) RunIntoWriter(
}

sample := SimplifiedSample{
SimplifiedHTML: htmlDocs,
HTML: selectorSample.HTML,
Markdown: markdown,
HTML: selectorSample.HTML,
Markdown: markdown,
}

if s.ShowSimplified {
sample.SimplifiedHTML = htmlDocs
}

if s.ShowPath {
Expand All @@ -386,7 +401,9 @@ func (c *HTMLSelectorCommand) RunIntoWriter(
if err != nil {
return fmt.Errorf("failed to process HTML: %w", err)
}
sample.SimplifiedContext = htmlDocs
if s.ShowSimplified {
sample.SimplifiedContext = htmlDocs
}
sample.Context = selectorSample.Context
}
newResults[selectorResult.Name].Samples = append(newResults[selectorResult.Name].Samples, sample)
Expand All @@ -413,9 +430,10 @@ func processSource(
selectors []Selector,
s *HTMLSelectorSettings,
simplifier *htmlsimplifier.Simplifier,
) (SourceResult, error) {
var result SourceResult
result.Source = source
) (*SourceResult, error) {
result := &SourceResult{
Source: source,
}

var f io.ReadCloser
var err error
Expand Down Expand Up @@ -468,20 +486,22 @@ func processSource(
for _, r := range results {
var matches []interface{}
for _, selectorSample := range r.Samples {
// Process HTML content
htmlDocs, err := simplifier.ProcessHTML(selectorSample.HTML)
if err == nil {

for _, doc := range htmlDocs {
if doc.Text != "" {
matches = append(matches, doc.Text)
} else if doc.Markdown != "" {
matches = append(matches, doc.Markdown)
} else {
matches = append(matches, doc)
// Convert sample to markdown if requested
if s.Markdown {
// Create markdown converter
converter := md.NewConverter("", true, nil)
var markdown string

// Convert HTML to markdown if present
if selectorSample.HTML != "" {
markdown, err = converter.ConvertString(selectorSample.HTML)
if err == nil {
matches = append(matches, markdown)
continue
}
}
}
matches = append(matches, selectorSample.HTML)
}
result.Data[r.Name] = matches
}
Expand Down
13 changes: 13 additions & 0 deletions pkg/htmlsimplifier/node_handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,13 @@ func (h *NodeHandler) IsTextOnly(node *html.Node) bool {
return false
}

// A node that carries a class or id attribute is never considered text-only
for _, attr := range node.Attr {
if attr.Key == "class" || attr.Key == "id" {
return false
}
}

// Check all children
for child := node.FirstChild; child != nil; child = child.NextSibling {
if !h.IsTextOnly(child) {
Expand Down Expand Up @@ -166,6 +173,12 @@ func (h *NodeHandler) IsMarkdownable(node *html.Node) bool {

// For non-markdown elements that are text-only, we need to check if they contain any non-markdown elements
if strategy == StrategyTextOnly {
// A text-only-strategy node that carries a class or id attribute is not markdownable
for _, attr := range node.Attr {
if attr.Key == "class" || attr.Key == "id" {
return false
}
}
for child := node.FirstChild; child != nil; child = child.NextSibling {
if child.Type == html.ElementNode {
childStrategy := h.GetStrategy(child)
Expand Down
6 changes: 6 additions & 0 deletions pkg/htmlsimplifier/simplifier.go
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,12 @@ func (s *Simplifier) processNode(node *html.Node) []Document {
}}
}
}

// If node has class or id, fall through to default processing
if len(classes) > 0 || id != "" {
break
}

// If text simplification fails or is disabled, extract text normally
text := s.textSimplifier.ExtractText(node)
if text != "" {
Expand Down
128 changes: 17 additions & 111 deletions pkg/htmlsimplifier/text_simplifier.go
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
package htmlsimplifier

import (
"fmt"
"strings"

md "github.com/JohannesKaufmann/html-to-markdown"
"github.com/rs/zerolog/log"
"golang.org/x/net/html"
)
Expand All @@ -12,6 +12,7 @@ import (
type TextSimplifier struct {
// markdownEnabled stores the flag passed to NewTextSimplifier.
markdownEnabled bool
// nodeHandler is the NodeHandler created by NewTextSimplifier.
nodeHandler *NodeHandler
// mdConverter is the html-to-markdown converter that ConvertToMarkdown
// passes the rendered node HTML to.
mdConverter *md.Converter
}

// NewTextSimplifier creates a new text simplifier
Expand All @@ -20,39 +21,10 @@ func NewTextSimplifier(markdownEnabled bool) *TextSimplifier {
return &TextSimplifier{
markdownEnabled: markdownEnabled,
nodeHandler: NewNodeHandler(opts),
mdConverter: md.NewConverter("", true, nil),
}
}

// MarkdownElements is the set of HTML element names that can be converted to
// markdown. It is keyed by element name; ConvertToMarkdown looks up node.Data
// in it.
var MarkdownElements = map[string]bool{
"a": true, // Links (only within p or span)
"strong": true, // Bold text
"em": true, // Italic text
"b": true, // Bold text (alternative)
"i": true, // Italic text (alternative)
"code": true, // Code snippets
}

// MarkdownStart maps each HTML element name to the markdown syntax that opens
// the corresponding construct.
var MarkdownStart = map[string]string{
"a": "[", // Links
"strong": "**", // Bold text
"em": "*", // Italic text
"b": "**", // Bold text (alternative)
"i": "*", // Italic text (alternative)
"code": "`", // Code snippets
}

// MarkdownEnd maps each HTML element name to the markdown syntax that closes
// the corresponding construct. The "a" entry is a format string whose %s verb
// takes the link's href.
var MarkdownEnd = map[string]string{
"a": "](%s)", // Links (format with href)
"strong": "**", // Bold text
"em": "*", // Italic text
"b": "**", // Bold text (alternative)
"i": "*", // Italic text (alternative)
"code": "`", // Code snippets
}

// SimplifyText attempts to convert a node and its children to a single text string
func (t *TextSimplifier) SimplifyText(node *html.Node) (string, bool) {
if node == nil {
Expand Down Expand Up @@ -161,95 +133,29 @@ func (t *TextSimplifier) ConvertToMarkdown(node *html.Node) (string, bool) {
return text, true
}

// Check if markdown is enabled for this node
if !t.markdownEnabled && MarkdownElements[node.Data] {
log.Trace().Str("node_type", node.Data).Msg("ConvertToMarkdown: markdown disabled for this element")
// Convert the node to HTML string
var buf strings.Builder
err := html.Render(&buf, node)
if err != nil {
log.Error().Err(err).Msg("ConvertToMarkdown: failed to render HTML")
return "", false
}

log.Trace().Str("node_type", node.Data).Msg("ConvertToMarkdown: processing element node")

// Process children first
var parts []string
for child := node.FirstChild; child != nil; child = child.NextSibling {
if child.Type == html.TextNode {
text := strings.TrimSpace(child.Data)
if text != "" {
parts = append(parts, text)
log.Trace().Str("text", text).Msg("ConvertToMarkdown: added text node content")
}
continue
}

switch child.Data {
case "a":
href := ""
for _, attr := range child.Attr {
if attr.Key == "href" {
href = attr.Val
break
}
}
text, ok := t.ConvertToMarkdown(child)
if !ok || text == "" {
log.Trace().Msg("ConvertToMarkdown: failed to process link content")
return "", false
}
link := fmt.Sprintf("[%s](%s)", text, href)
parts = append(parts, link)
log.Trace().Str("link", link).Msg("ConvertToMarkdown: processed link")
case "strong", "b":
text, ok := t.ConvertToMarkdown(child)
if !ok || text == "" {
log.Trace().Msg("ConvertToMarkdown: failed to process strong/bold content")
return "", false
}
bold := fmt.Sprintf("**%s**", text)
parts = append(parts, bold)
log.Trace().Str("bold", bold).Msg("ConvertToMarkdown: processed strong/bold")
case "em", "i":
text, ok := t.ConvertToMarkdown(child)
if !ok || text == "" {
log.Trace().Msg("ConvertToMarkdown: failed to process emphasis content")
return "", false
}
em := fmt.Sprintf("*%s*", text)
parts = append(parts, em)
log.Trace().Str("emphasis", em).Msg("ConvertToMarkdown: processed emphasis")
case "code":
text, ok := t.ConvertToMarkdown(child)
if !ok || text == "" {
log.Trace().Msg("ConvertToMarkdown: failed to process code content")
return "", false
}
code := fmt.Sprintf("`%s`", text)
parts = append(parts, code)
log.Trace().Str("code", code).Msg("ConvertToMarkdown: processed code")
case "br":
parts = append(parts, "\n")
log.Trace().Msg("ConvertToMarkdown: processed line break")
default:
text, ok := t.ConvertToMarkdown(child)
if !ok {
log.Trace().Str("node_type", child.Data).Msg("ConvertToMarkdown: failed to process unknown element")
return "", false
}
if text != "" {
parts = append(parts, text)
log.Trace().Str("text", text).Msg("ConvertToMarkdown: processed unknown element")
}
}
// Convert to markdown using html-to-markdown
markdown, err := t.mdConverter.ConvertString(buf.String())
if err != nil {
log.Error().Err(err).Msg("ConvertToMarkdown: failed to convert to markdown")
return "", false
}

result := strings.Join(parts, " ")
if result == "" {
if markdown == "" {
log.Trace().Msg("ConvertToMarkdown: empty result")
return "", false
}

// replace ' \n ' with '\n'
result = strings.ReplaceAll(result, " \n ", "\n")
markdown = strings.ReplaceAll(markdown, " \n ", "\n")

log.Trace().Str("result", result).Msg("ConvertToMarkdown: final result")
return result, true
log.Trace().Str("result", markdown).Msg("ConvertToMarkdown: final result")
return markdown, true
}

0 comments on commit 882ad29

Please sign in to comment.