Skip to content

Commit

Permalink
✨ Include id and class in tag names for better readability
Browse files Browse the repository at this point in the history
  • Loading branch information
wesen committed Jan 26, 2025
1 parent 0e9dd0c commit c6e8d48
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 12 deletions.
9 changes: 8 additions & 1 deletion changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -523,4 +523,11 @@ Added support for both select and filter modes in HTML simplification:
- New `mode` field in selectors config: "select" or "filter"
- Select mode keeps only matching elements and their parents
- Filter mode removes matching elements
- Selectors are applied in order: first selects, then filters
- Selectors are applied in order: first selects, then filters

HTML Simplifier Tag Format Enhancement
Enhanced the HTML simplifier to fold the id and class attributes into the tag name, producing a more readable, CSS-selector-like format.

- Modified tag format to include id and classes (e.g. div#myid.class1.class2)
- Removed id and class from the regular attributes list
- Improved readability of HTML structure in YAML output
41 changes: 30 additions & 11 deletions pkg/htmlsimplifier/simplifier.go
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,8 @@ func (s *Simplifier) processNode(node *html.Node) []Document {

// Process attributes for all nodes
var attrs []string
var classes []string
var id string
for _, attr := range node.Attr {
if s.opts.StripCSS && attr.Key == "style" {
continue
Expand All @@ -180,6 +182,14 @@ func (s *Simplifier) processNode(node *html.Node) []Document {
(attr.Key == "d" || attr.Key == "viewBox" || attr.Key == "transform") {
continue
}
if attr.Key == "class" {
classes = strings.Fields(attr.Val)
continue
}
if attr.Key == "id" {
id = attr.Val
continue
}
attrs = append(attrs, fmt.Sprintf("%s=%s", attr.Key, attr.Val))
}
attrsStr := strings.Join(attrs, " ")
Expand All @@ -197,6 +207,15 @@ func (s *Simplifier) processNode(node *html.Node) []Document {
}}
}

// Build tag name with id and classes
tagName := node.Data
if id != "" {
tagName = fmt.Sprintf("%s#%s", tagName, id)
}
for _, class := range classes {
tagName = fmt.Sprintf("%s.%s", tagName, class)
}

switch strategy {
case StrategyFilter:
return nil
Expand All @@ -213,7 +232,7 @@ func (s *Simplifier) processNode(node *html.Node) []Document {
if s.opts.Markdown && s.nodeHandler.IsMarkdownable(node) {
if markdown, ok := s.textSimplifier.ConvertToMarkdown(node); ok {
return []Document{{
Tag: node.Data,
Tag: tagName,
Attrs: attrsStr,
Markdown: markdown,
}}
Expand All @@ -223,7 +242,7 @@ func (s *Simplifier) processNode(node *html.Node) []Document {
if s.opts.SimplifyText && s.nodeHandler.IsTextOnly(node) {
if text, ok := s.textSimplifier.SimplifyText(node); ok {
return []Document{{
Tag: node.Data,
Tag: tagName,
Attrs: attrsStr,
Text: text,
}}
Expand All @@ -233,7 +252,7 @@ func (s *Simplifier) processNode(node *html.Node) []Document {
text := s.textSimplifier.ExtractText(node)
if text != "" {
return []Document{{
Tag: node.Data,
Tag: tagName,
Attrs: attrsStr,
Text: text,
}}
Expand All @@ -255,7 +274,7 @@ func (s *Simplifier) processNode(node *html.Node) []Document {
if s.opts.Markdown && s.nodeHandler.IsMarkdownable(node) {
if markdown, ok := s.textSimplifier.ConvertToMarkdown(node); ok {
return []Document{{
Tag: node.Data,
Tag: tagName,
Attrs: attrsStr,
Markdown: markdown,
}}
Expand All @@ -266,14 +285,14 @@ func (s *Simplifier) processNode(node *html.Node) []Document {
case StrategyDefault:
// Check if all children are markdown-able
if s.opts.Markdown {
if docs, ok := s.tryMarkdownConversion(node, attrsStr); ok {
if docs, ok := s.tryMarkdownConversion(node, tagName, attrsStr); ok {
return docs
}
}

// Check if all children are text-only
if s.opts.SimplifyText {
if docs, ok := s.tryTextSimplification(node, attrsStr); ok {
if docs, ok := s.tryTextSimplification(node, tagName, attrsStr); ok {
return docs
}
}
Expand All @@ -282,7 +301,7 @@ func (s *Simplifier) processNode(node *html.Node) []Document {

// Default processing: keep the node and process children
doc := Document{
Tag: node.Data,
Tag: tagName,
Attrs: attrsStr,
IsSVG: node.Data == "svg" || (node.Parent != nil && node.Parent.Data == "svg"),
}
Expand Down Expand Up @@ -320,7 +339,7 @@ func (s *Simplifier) processNode(node *html.Node) []Document {
return []Document{doc}
}

func (s *Simplifier) tryMarkdownConversion(node *html.Node, attrsStr string) ([]Document, bool) {
func (s *Simplifier) tryMarkdownConversion(node *html.Node, tagName string, attrsStr string) ([]Document, bool) {
allMarkdownable := true
for child := node.FirstChild; child != nil; child = child.NextSibling {
if !s.nodeHandler.IsMarkdownable(child) {
Expand All @@ -332,7 +351,7 @@ func (s *Simplifier) tryMarkdownConversion(node *html.Node, attrsStr string) ([]
markdown, ok := s.textSimplifier.ConvertToMarkdown(node)
if ok {
return []Document{{
Tag: node.Data,
Tag: tagName,
Attrs: attrsStr,
Markdown: markdown,
}}, true
Expand All @@ -341,7 +360,7 @@ func (s *Simplifier) tryMarkdownConversion(node *html.Node, attrsStr string) ([]
return nil, false
}

func (s *Simplifier) tryTextSimplification(node *html.Node, attrsStr string) ([]Document, bool) {
func (s *Simplifier) tryTextSimplification(node *html.Node, tagName string, attrsStr string) ([]Document, bool) {
allTextable := true
var textParts []string
for child := node.FirstChild; child != nil; child = child.NextSibling {
Expand All @@ -355,7 +374,7 @@ func (s *Simplifier) tryTextSimplification(node *html.Node, attrsStr string) ([]
}
if allTextable && len(textParts) > 0 {
return []Document{{
Tag: node.Data,
Tag: tagName,
Attrs: attrsStr,
Text: strings.Join(textParts, " "),
}}, true
Expand Down

0 comments on commit c6e8d48

Please sign in to comment.