From 1040fc006066f107d7cf22222fccbb2f2a7e6e7f Mon Sep 17 00:00:00 2001 From: Harold Date: Wed, 7 Dec 2022 17:34:39 +0800 Subject: [PATCH] cosineSimilarity --- README.md | 30 +++-- pom.xml | 10 ++ .../alist_tvbox/service/IndexService.java | 119 +++++++++++++----- .../alist_tvbox/tvbox/IndexContext.java | 23 ++-- .../alist_tvbox/tvbox/IndexRequest.java | 5 +- 5 files changed, 129 insertions(+), 58 deletions(-) diff --git a/README.md b/README.md index 57ebf9535c..a9d6d7a244 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ mvn clean package # Run ```bash -java -jar target/alist-tvbox-1.0.jar +java -jar target/alist-tvbox-1.0.jar --server.port=5678 ``` # Deploy @@ -25,18 +25,18 @@ scp config/install-service.sh user@your-server:~ # Docker ```bash ./build.sh -docker run -d -p 8080:8080 --restart=always --name=alist-tvbox alist-tvbox +docker run -d -p 5678:8080 --restart=always --name=alist-tvbox alist-tvbox ``` Or run container from Docker hub. ```bash -docker run -d -p 8080:8080 --restart=always --name=alist-tvbox haroldli/alist-tvbox +docker run -d -p 5678:8080 --restart=always --name=alist-tvbox haroldli/alist-tvbox ``` # TvBox Config ```json { "sites": [ - {"key":"Alist","name":"Alist┃转发","type":1,"api":"http://ip:8080/vod","searchable":1,"quickSearch":1,"filterable":1} + {"key":"Alist","name":"Alist┃转发","type":1,"api":"http://ip:5678/vod","searchable":1,"quickSearch":1,"filterable":1} ], "rules": [ {"host":"pdsapi.aliyundrive.com","rule":["/redirect"]}, @@ -46,7 +46,7 @@ docker run -d -p 8080:8080 --restart=always --name=alist-tvbox haroldli/alist-tv } ``` -Or use this config url `http://ip:8080/sub`. +Or use this config url `http://ip:5678/sub/1`. Change the backend config url in application.yaml ```yaml @@ -56,21 +56,25 @@ app: # Index And Search ```http request -POST http://localhost:8080/index +POST http://localhost:5678/index Content-Type: application/json { "site": "小雅", - "collection": [ + "indexName": "index.xiaoya", + "excludeExternal": false, + "paths": [ "/电视剧", "/动漫", "/综艺", - "/纪录片" - ], - "single": [ + "/纪录片", "/电影", "/音乐" ], + "stopWords": [ + ], + "excludes": [ + ], "maxDepth": 10 } @@ -83,5 +87,9 @@ app: - name: 小雅 url: http://alist.xiaoya.pro searchable: true - indexFile: /the/path/to/index.txt + indexFile: /the/path/to/index.xiaoya.txt + - name: Har01d + url: http://alist.har01d.cn + searchable: true + indexFile: http://d.har01d.cn/index.full.zip ``` diff --git a/pom.xml b/pom.xml index 6bd360ea0d..1b2984f5a1 100644 --- a/pom.xml +++ b/pom.xml @@ -26,6 +26,16 @@ commons-io 2.11.0 + + org.apache.commons + commons-text + 1.10.0 + + + com.hankcs + hanlp + portable-1.8.3 + org.projectlombok diff --git a/src/main/java/cn/har01d/alist_tvbox/service/IndexService.java b/src/main/java/cn/har01d/alist_tvbox/service/IndexService.java index 5d304782a2..ab2ea391a1 100644 --- a/src/main/java/cn/har01d/alist_tvbox/service/IndexService.java +++ b/src/main/java/cn/har01d/alist_tvbox/service/IndexService.java @@ -5,7 +5,10 @@ import cn.har01d.alist_tvbox.model.FsResponse; import cn.har01d.alist_tvbox.tvbox.IndexContext; import cn.har01d.alist_tvbox.tvbox.IndexRequest; +import com.hankcs.hanlp.HanLP; +import com.hankcs.hanlp.seg.common.Term; import lombok.extern.slf4j.Slf4j; +import org.apache.commons.text.similarity.CosineSimilarity; import org.springframework.stereotype.Service; import org.springframework.util.StopWatch; @@ -14,9 +17,8 @@ import java.io.FileWriter; import java.io.IOException; import java.nio.file.Files; -import java.util.ArrayList; -import java.util.List; -import java.util.Set; +import java.time.Duration; +import java.util.*; import java.util.zip.ZipEntry; import java.util.zip.ZipOutputStream; @@ -32,33 +34,26 @@ public IndexService(AListService aListService, AppProperties appProperties) { } public void index(IndexRequest indexRequest) throws IOException { - StopWatch stopWatch = new StopWatch(); - File dir = new File("data/" + indexRequest.getSite()); + StopWatch stopWatch = new StopWatch("index"); + File dir = new File("data/index/" + indexRequest.getSite()); Files.createDirectories(dir.toPath()); File file = new File(dir, indexRequest.getIndexName() + ".txt"); Files.deleteIfExists(file.toPath()); - File fullFile = new File(dir, indexRequest.getIndexName() + ".full.txt"); - Files.deleteIfExists(fullFile.toPath()); - try (FileWriter writer = new FileWriter(file, true); - FileWriter fullWriter = new FileWriter(fullFile, true)) { - IndexContext context = new IndexContext(indexRequest, writer, fullWriter); - for (String path : indexRequest.getCollection()) { - stopWatch.start("index " + path); - index(context, path, 0); - stopWatch.stop(); - } - - context.setIncludeFile(true); - for (String path : indexRequest.getSingle()) { + try (FileWriter writer = new FileWriter(file, true)) { + IndexContext context = new IndexContext(indexRequest, writer); + for (String path : indexRequest.getPaths()) { stopWatch.start("index " + path); index(context, path, 0); stopWatch.stop(); } + log.info("index stats: {}", context.stats); } - zipFile(file, new File(dir, indexRequest.getIndexName() + ".zip")); - log.info("index done: {}", stopWatch.prettyPrint()); + File zipFIle = new File(dir, indexRequest.getIndexName() + ".zip"); + zipFile(file, zipFIle); + log.info("index done, total time : {} {}", Duration.ofNanos(stopWatch.getTotalTimeNanos()), stopWatch.prettyPrint()); + log.info("index file: {}", file.getAbsolutePath()); } private void zipFile(File file, File output) throws IOException { @@ -81,15 +76,21 @@ private void index(IndexContext context, String path, int depth) throws IOExcept } FsResponse fsResponse = aListService.listFiles(context.getSite(), path, 1, 0); - if (fsResponse == null || (context.isExcludeExternal() && fsResponse.getProvider().contains("AList"))) { + if (fsResponse == null) { + context.stats.errors++; + return; + } + if (context.isExcludeExternal() && fsResponse.getProvider().contains("AList")) { return; } + List files = new ArrayList<>(); for (FsInfo fsInfo : fsResponse.getFiles()) { if (fsInfo.getType() == 1) { String newPath = fixPath(path + "/" + fsInfo.getName()); if (exclude(context.getExcludes(), newPath)) { + context.stats.excluded++; continue; } @@ -97,30 +98,29 @@ private void index(IndexContext context, String path, int depth) throws IOExcept } else if (isMediaFormat(fsInfo.getName())) { String newPath = fixPath(path + "/" + fsInfo.getName()); if (exclude(context.getExcludes(), newPath)) { + context.stats.excluded++; continue; } - files.add(newPath); + context.stats.files++; + files.add(fsInfo.getName()); } } if (files.size() > 0 && !context.contains(path)) { context.write(path); - if (context.isWriteFull()) { - context.writeFull(path); - } } - for (String line : files) { - if (context.contains(line)) { + if (isSimilar(path, files, context.getStopWords())) { + return; + } + + for (String name : files) { + String newPath = fixPath(path + "/" + name); + if (context.contains(newPath)) { continue; } - if (context.isIncludeFile()) { - context.write(line); - } - if (context.isWriteFull()) { - context.writeFull(line); - } + context.write(newPath); } } @@ -155,4 +155,57 @@ private String fixPath(String path) { return path.replaceAll("/+", "/").replaceAll("\n", "%20"); } + private String getFolderName(String path) { + int index = path.lastIndexOf('/'); + if (index > 0) { + return path.substring(index + 1); + } + return path; + } + + public boolean isSimilar(String path, List sentences, Set stopWords) { + if (sentences.isEmpty()) { + return true; + } + if (sentences.size() == 1) { + String folderName = getFolderName(path); + List list = new ArrayList<>(sentences); + list.add(folderName); + return isSimilar(path, list, stopWords); + } + + double sum = 0.0; + CosineSimilarity cosineSimilarity = new CosineSimilarity(); + Map leftVector = getVector(stopWords, sentences.get(0)); + for (int i = 1; i < sentences.size(); ++i) { + Map rightVector = getVector(stopWords, sentences.get(i)); + sum += cosineSimilarity.cosineSimilarity(leftVector, rightVector); + leftVector = rightVector; + } + double result = sum / (sentences.size() - 1); + + log.debug("cosineSimilarity {} : {}", path, result); + return result > 0.9; + } + + private Map getVector(Set stopWords, String text) { + Map result = new HashMap<>(); + for (String stopWord : stopWords) { + text = text.replaceAll(stopWord, ""); + } + text = text.replaceAll("\\d+", " ").replaceAll("\\s+", " "); + List termList = HanLP.segment(text); + for (Term term : termList) { + int frequency = term.getFrequency(); + if (frequency == 0) { + frequency = 1; + } + if (result.containsKey(term.word)) { + result.put(term.word, result.get(term.word) + frequency); + } else { + result.put(term.word, frequency); + } + } + return result; + } } diff --git a/src/main/java/cn/har01d/alist_tvbox/tvbox/IndexContext.java b/src/main/java/cn/har01d/alist_tvbox/tvbox/IndexContext.java index 3e68d439dd..a88f0a060d 100644 --- a/src/main/java/cn/har01d/alist_tvbox/tvbox/IndexContext.java +++ b/src/main/java/cn/har01d/alist_tvbox/tvbox/IndexContext.java @@ -9,20 +9,15 @@ @Data public class IndexContext { + public Stats stats = new Stats(); private final IndexRequest indexRequest; private final FileWriter writer; - private final FileWriter fullWriter; - private boolean includeFile; private Set set = new HashSet<>(); public String getSite() { return indexRequest.getSite(); } - public boolean isWriteFull() { - return indexRequest.isWriteFull(); - } - public boolean isExcludeExternal() { return indexRequest.isExcludeExternal(); } @@ -35,19 +30,25 @@ public Set getExcludes() { return indexRequest.getExcludes(); } + public Set getStopWords() { + return indexRequest.getStopWords(); + } + public boolean contains(String key) { return set.contains(key); } public void write(String path) throws IOException { set.add(path); + stats.indexed++; writer.write(path + "\n"); } - public void writeFull(String path) throws IOException { - if (indexRequest.isWriteFull()) { - set.add(path); - fullWriter.write(path + "\n"); - } + @Data + public static class Stats { + public int files; + public int indexed; + public int errors; + public int excluded; } } diff --git a/src/main/java/cn/har01d/alist_tvbox/tvbox/IndexRequest.java b/src/main/java/cn/har01d/alist_tvbox/tvbox/IndexRequest.java index 626a6f0529..e631a1ebb0 100644 --- a/src/main/java/cn/har01d/alist_tvbox/tvbox/IndexRequest.java +++ b/src/main/java/cn/har01d/alist_tvbox/tvbox/IndexRequest.java @@ -9,10 +9,9 @@ public class IndexRequest { private String site; private String indexName = "index"; - private boolean writeFull; private boolean excludeExternal; private int maxDepth = 10; - private Set collection = new HashSet<>(); - private Set single = new HashSet<>(); + private Set paths = new HashSet<>(); + private Set stopWords = new HashSet<>(); private Set excludes = new HashSet<>(); }