Skip to content

Commit

Permalink
feat: allow import of newer pocket data export files in csv format (#…
Browse files Browse the repository at this point in the history
…1023)

* feat: allow import of newer pocket data export files in csv format

Signed-off-by: Mel <einebeere@gmail.com>

* fix: linter error

Signed-off-by: Mel <einebeere@gmail.com>

---------

Signed-off-by: Mel <einebeere@gmail.com>
Co-authored-by: Felipe Martin <812088+fmartingr@users.noreply.github.com>
  • Loading branch information
melnary and fmartingr authored Dec 11, 2024
1 parent 87bc7a8 commit c2821ff
Showing 1 changed file with 144 additions and 43 deletions.
187 changes: 144 additions & 43 deletions internal/cmd/pocket.go
Original file line number Diff line number Diff line change
@@ -1,22 +1,29 @@
package cmd

import (
"context"
"encoding/csv"
"errors"
"fmt"
"os"
"path/filepath"
"regexp"
"slices"
"strconv"
"strings"
"time"

"github.com/PuerkitoBio/goquery"
"github.com/go-shiori/shiori/internal/core"
"github.com/go-shiori/shiori/internal/database"
"github.com/go-shiori/shiori/internal/model"
"github.com/spf13/cobra"
)

func pocketCmd() *cobra.Command {
cmd := &cobra.Command{
Use: "pocket source-file",
Short: "Import bookmarks from Pocket's exported HTML file",
Short: "Import bookmarks from Pocket's data export file",
Args: cobra.ExactArgs(1),
Run: pocketHandler,
}
Expand All @@ -25,17 +32,43 @@ func pocketCmd() *cobra.Command {
}

// pocketHandler is the run function of the pocket command. It opens the
// export file named by the single positional argument, selects a parser
// from the file extension (HTML or CSV), saves the parsed bookmarks through
// deps.Database and prints the bookmarks that were saved.
//
// Failing to open the file, an unsupported extension, or a failed save is
// reported through cError and ends the process with exit code 1.
func pocketHandler(cmd *cobra.Command, args []string) {
	ctx := cmd.Context()
	_, deps := initShiori(ctx, cmd)

	// Open pocket's file
	filePath := args[0]
	srcFile, err := os.Open(filePath)
	if err != nil {
		cError.Println(err)
		os.Exit(1)
	}
	defer srcFile.Close()

	// Parse pocket's file. The extension is lower-cased before matching so
	// that files named e.g. "export.HTML" or "export.CSV" are accepted too.
	var bookmarks []model.BookmarkDTO
	switch strings.ToLower(filepath.Ext(filePath)) {
	case ".html":
		bookmarks = parseHtmlExport(ctx, deps.Database, srcFile)
	case ".csv":
		bookmarks = parseCsvExport(ctx, deps.Database, srcFile)
	default:
		cError.Println("Invalid file format. Only HTML and CSV are supported.")
		os.Exit(1)
	}

	// Save bookmark to database
	bookmarks, err = deps.Database.SaveBookmarks(ctx, true, bookmarks...)
	if err != nil {
		cError.Printf("Failed to save bookmarks: %v\n", err)
		os.Exit(1)
	}

	// Print imported bookmarks
	fmt.Println()
	printBookmarks(bookmarks...)
}

// Parse bookmarks from HTML file
func parseHtmlExport(ctx context.Context, db database.DB, srcFile *os.File) []model.BookmarkDTO {
bookmarks := []model.BookmarkDTO{}
mapURL := make(map[string]struct{})

Expand All @@ -49,69 +82,137 @@ func pocketHandler(cmd *cobra.Command, args []string) {
// Get metadata
title := a.Text()
url, _ := a.Attr("href")
strTags, _ := a.Attr("tags")
strModified, _ := a.Attr("time_added")
intModified, _ := strconv.ParseInt(strModified, 10, 64)
modified := time.Unix(intModified, 0)

// Clean up URL
var err error
url, err = core.RemoveUTMParams(url)
tagsStr, _ := a.Attr("tags")
timeAddedStr, _ := a.Attr("time_added")

title, url, timeAdded, tags, err := verifyMetadata(title, url, timeAddedStr, tagsStr)
if err != nil {
cError.Printf("Skip %s: URL is not valid\n", url)
cError.Printf("Skip %s: %v\n", url, err)
return
}

// Make sure title is valid Utf-8
title = validateTitle(title, url)

// Check if the URL already exist before, both in bookmark
// file or in database
if _, exist := mapURL[url]; exist {
cError.Printf("Skip %s: URL already exists\n", url)
if err = handleDuplicates(ctx, db, mapURL, url); err != nil {
cError.Printf("Skip %s: %v\n", url, err)
return
}

_, exist, err := deps.Database.GetBookmark(cmd.Context(), 0, url)
if err != nil {
cError.Printf("Skip %s: Get Bookmark fail, %v", url, err)
return
// Add item to list
bookmark := model.BookmarkDTO{
URL: url,
Title: title,
ModifiedAt: timeAdded.Format(model.DatabaseDateFormat),
CreatedAt: timeAdded.Format(model.DatabaseDateFormat),
Tags: tags,
}

if exist {
cError.Printf("Skip %s: URL already exists\n", url)
mapURL[url] = struct{}{}
return
}
mapURL[url] = struct{}{}
bookmarks = append(bookmarks, bookmark)
})

return bookmarks
}

// Parse bookmarks from CSV file
func parseCsvExport(ctx context.Context, db database.DB, srcFile *os.File) []model.BookmarkDTO {
bookmarks := []model.BookmarkDTO{}
mapURL := make(map[string]struct{})

// Get bookmark tags
tags := []model.Tag{}
for _, strTag := range strings.Split(strTags, ",") {
if strTag != "" {
tags = append(tags, model.Tag{Name: strTag})
reader := csv.NewReader(srcFile)
records, err := reader.ReadAll()
if err != nil {
cError.Println(err)
os.Exit(1)
}

for i, cols := range records {
// Check and skip header
if i == 0 {
expected := []string{"title", "url", "time_added", "cursor", "tags", "status"}
if slices.Compare(cols, expected) != 0 {
cError.Printf("Invalid CSV format. Header must be: %s\n", strings.Join(expected, ","))
os.Exit(1)
}
continue
}

// Get metadata
title, url, timeAdded, tags, err := verifyMetadata(cols[0], cols[1], cols[2], cols[4])
if err != nil {
cError.Printf("Skip %s: %v\n", url, err)
continue
}

if err = handleDuplicates(ctx, db, mapURL, url); err != nil {
cError.Printf("Skip %s: %v\n", url, err)
continue
}

// Add item to list
bookmark := model.BookmarkDTO{
URL: url,
Title: title,
ModifiedAt: modified.Format(model.DatabaseDateFormat),
ModifiedAt: timeAdded.Format(model.DatabaseDateFormat),
CreatedAt: timeAdded.Format(model.DatabaseDateFormat),
Tags: tags,
}

mapURL[url] = struct{}{}
bookmarks = append(bookmarks, bookmark)
})
}

// Save bookmark to database
bookmarks, err = deps.Database.SaveBookmarks(cmd.Context(), true, bookmarks...)
return bookmarks
}

// pocketTagSeparator matches the separators found between tags in Pocket
// exports: the CSV export uses a pipe, while the HTML export uses a comma.
// It is compiled once here rather than on every verifyMetadata call.
var pocketTagSeparator = regexp.MustCompile(`[,|]`)

// verifyMetadata parses the raw metadata of one exported item and verifies
// its validity.
//
// It returns, in order: the title after validateTitle, the URL after
// core.RemoveUTMParams, the time added (timeAddedStr parsed as a base-10
// integer and interpreted as Unix seconds), and the non-empty tags obtained
// by splitting tags on commas and pipes.
//
// A non-nil error is returned when core.RemoveUTMParams rejects the URL or
// when timeAddedStr is not a valid integer; in that case all other results
// are zero values (note that the returned URL is empty, so callers that want
// to log the offending URL must keep their own copy).
func verifyMetadata(title, url, timeAddedStr, tags string) (string, string, time.Time, []model.Tag, error) {
	// Clean up URL
	var err error
	url, err = core.RemoveUTMParams(url)
	if err != nil {
		err = fmt.Errorf("URL is not valid, %w", err)
		return "", "", time.Time{}, nil, err
	}

	// Make sure title is valid Utf-8
	title = validateTitle(title, url)

	// Parse time added
	timeAddedInt, err := strconv.ParseInt(timeAddedStr, 10, 64)
	if err != nil {
		err = fmt.Errorf("Invalid time added, %w", err)
		return "", "", time.Time{}, nil, err
	}
	timeAdded := time.Unix(timeAddedInt, 0)

	// Get bookmark tags, dropping the empty names produced by an empty tags
	// field or by consecutive separators.
	tagsList := []model.Tag{}
	for _, tag := range pocketTagSeparator.Split(tags, -1) {
		if tag != "" {
			tagsList = append(tagsList, model.Tag{Name: tag})
		}
	}

	return title, url, timeAdded, tagsList, nil
}

// handleDuplicates reports whether url has already been seen, either
// earlier in the file being imported (tracked in mapURL) or in the database.
//
// It returns nil when the URL is new, an "URL already exists" error when it
// is a duplicate, and a wrapped error when the database lookup itself fails.
// mapURL is only read here; recording a newly accepted URL is left to the
// caller.
func handleDuplicates(ctx context.Context, db database.DB, mapURL map[string]struct{}, url string) error {
	// Cheap in-memory check first: was this URL already taken from the file?
	if _, seen := mapURL[url]; seen {
		return errors.New("URL already exists")
	}

	// Otherwise ask the database.
	_, found, err := db.GetBookmark(ctx, 0, url)
	switch {
	case err != nil:
		return fmt.Errorf("Failed getting bookmark, %w", err)
	case found:
		return errors.New("URL already exists")
	default:
		return nil
	}
}

0 comments on commit c2821ff

Please sign in to comment.