Mix.install([
{:kino, github: "livebook-dev/kino", override: true},
{:kino_lab, "~> 0.1.0-dev", github: "jonatanklosko/kino_lab"},
{:floki, "~> 0.32"},
{:pandex, "~> 0.2.0"},
{:utilities, path: "utilities"}
])
Using the excellent `req` library, we want to get the HTML of the job post URL and convert the contents to Markdown. The conversion to Markdown is handled via `pandex` and the `pandoc` application.
NOTE: We require pandoc to be installed to convert from HTML to Markdown
- If `<temporary html file does not exist>`:
- Use `req` to download the full HTML page into `data/#{host}/#{hash}.html`.
- If `<temporary html file exists>`:
- Ask to overwrite?
- Skip to next step (3).
- Detect ATS system, one of `[breezyhr, greenhouse, lever, unknown]`
- Parse HTML into Markdown
- Pandoc
- Manually
- EasyHTML - Was unaware this was just a wrapper around Floki
- Add `[ ]` to every list item
- We do this specifically to help with a manual checklist.
- I would not apply to a position that had very few checked off.
- This list conversion has been almost 100% universal, to the point that this process should "be a thing."
# Create a Kino URL input labelled "URL" and read the value currently entered in it.
url = Kino.Input.url("URL")
url_value = Kino.Input.read(url)
# get_path/1 is a project-local helper; its result is matched as a {directory, filename} tuple.
# NOTE(review): neither `directory` nor `filename` is used by the code visible here — confirm
# whether another cell depends on them.
{directory, filename} = Utilities.Persistence.get_path(url_value)
# Returns the contents of the document's first <title> element as a string
# suitable for use in a Markdown heading.
#
#   document - whatever `Floki.find/2` accepts (the caller passes the page HTML).
#
# Returns "" when the document has no <title> element (the previous `hd/1`
# call raised ArgumentError on an empty match list).
transform_title = fn document ->
  case Floki.find(document, "title") do
    [] ->
      ""

    [title_node | _rest] ->
      # Since this is a single tag we can just convert it to Markdown directly:
      # render the element and strip its own opening/closing tags. The pattern
      # also tolerates attributes on the opening tag, which the literal
      # "<title>" replacement left behind.
      title_node
      |> Floki.raw_html()
      |> String.replace(~r{</?title(\s[^>]*)?>}i, "")
      # Leading/trailing whitespace or newlines would break the "# title" line.
      |> String.trim()
  end
end
# Converts an HTML document into checklist-style Markdown with the page title
# injected as a level-one heading.
#
#   document - HTML handed to `Pandex.html_to_markdown_strict/1` and to
#              `transform_title`.
#
# Returns the keyword list `[title: heading, content: markdown]`.
# Raises MatchError when the Pandex conversion does not return `{:ok, _}`.
transform_description = fn document ->
  {:ok, converted} = Pandex.html_to_markdown_strict(document)
  heading = transform_title.(document)

  # Applied in order: first un-escape the "[ ]" checkbox markers, then switch
  # checkbox items from "-" bullets to "*" bullets.
  substitutions = [{"\\[ \\]", "[ ]"}, {"- [ ]", "* [ ]"}]

  body =
    Enum.reduce(substitutions, converted, fn {pattern, replacement}, text ->
      String.replace(text, pattern, replacement)
    end)

  # The title becomes the first line of the Markdown content.
  [title: heading, content: "# #{heading}\n\n" <> body]
end
# Load the stored content for this URL through the project-local persistence helper
# (presumably the previously downloaded HTML page — confirm against Utilities.Persistence).
html = Utilities.Persistence.read(url_value)
# find_description/1's result is matched as a {type, description} tuple.
# NOTE(review): neither binding is used by the active code that follows; only the
# commented-out experiments reference them.
{type, description} = Utilities.Detection.find_description(html)
# [title, content] =
# case type do
# # :breezy_hr -> Utilities.Detection.BreezyHR.transform_description(description)
# # :lever_co -> Utilities.Detection.LeverCo.transform_description(description)
# # :greenhouse_io -> Utilities.Detection.GreenhouseIo.transform_description(description)
# :greenhouse_io -> [title: "", content: description]
# :unknown -> [title: "", content: description]
# end
# transformed = description
# |> Floki.find("#app_body")
# |> hd()
# |> Floki.children(include_text: false)
# |> Floki.traverse_and_update(fn
# {"li", attrs, [text]} ->
# cond do
# is_binary(text) -> {"li", attrs, ["[ ] " <> text]}
# is_tuple(text) ->
# Floki.traverse_and_update(text, fn
# {_, _attrs, [text]} -> {"li", [], ["[ ] " <> text]}
# end)
# true -> {"li", attrs, [text]}
# end
# tag -> tag
# end)
# transform_description returns the keyword list [title: ..., content: ...], so each
# matched element is a {key, value} tuple; the values are pulled out below.
[title, content] = transform_description.(html)
{_, title_markdown} = title
{_, content_markdown} = content
# Hand the generated Markdown to the project-local persistence helper; "md" is passed as the
# third argument (presumably the file extension — confirm against Utilities.Persistence.save).
Utilities.Persistence.save(url_value, content_markdown, "md")
# Bare expression: evaluates to the extracted title.
title_markdown
# Build a Kino Markdown value from the generated content.
Kino.Markdown.new(content_markdown)