Skip to content

Commit

Permalink
cosineSimilarity
Browse files Browse the repository at this point in the history
  • Loading branch information
power721 committed Dec 8, 2022
1 parent 4578833 commit 1040fc0
Show file tree
Hide file tree
Showing 5 changed files with 129 additions and 58 deletions.
30 changes: 19 additions & 11 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ mvn clean package

# Run
```bash
java -jar target/alist-tvbox-1.0.jar
java -jar target/alist-tvbox-1.0.jar --server.port=5678
```

# Deploy
Expand All @@ -25,18 +25,18 @@ scp config/install-service.sh user@your-server:~
# Docker
```bash
./build.sh
docker run -d -p 8080:8080 --restart=always --name=alist-tvbox alist-tvbox
docker run -d -p 5678:8080 --restart=always --name=alist-tvbox alist-tvbox
```
Or run the container from Docker Hub.
```bash
docker run -d -p 8080:8080 --restart=always --name=alist-tvbox haroldli/alist-tvbox
docker run -d -p 5678:8080 --restart=always --name=alist-tvbox haroldli/alist-tvbox
```

# TvBox Config
```json
{
"sites": [
{"key":"Alist","name":"Alist┃转发","type":1,"api":"http://ip:8080/vod","searchable":1,"quickSearch":1,"filterable":1}
{"key":"Alist","name":"Alist┃转发","type":1,"api":"http://ip:5678/vod","searchable":1,"quickSearch":1,"filterable":1}
],
"rules": [
{"host":"pdsapi.aliyundrive.com","rule":["/redirect"]},
Expand All @@ -46,7 +46,7 @@ docker run -d -p 8080:8080 --restart=always --name=alist-tvbox haroldli/alist-tv
}
```

Or use this config url `http://ip:8080/sub`.
Or use this config url `http://ip:5678/sub/1`.

Change the backend config url in application.yaml
```yaml
Expand All @@ -56,21 +56,25 @@ app:
# Index And Search
```http request
POST http://localhost:8080/index
POST http://localhost:5678/index
Content-Type: application/json

{
"site": "小雅",
"collection": [
"indexName": "index.xiaoya",
"excludeExternal": false,
"paths": [
"/电视剧",
"/动漫",
"/综艺",
"/纪录片"
],
"single": [
"/纪录片",
"/电影",
"/音乐"
],
"stopWords": [
],
"excludes": [
],
"maxDepth": 10
}

Expand All @@ -83,5 +87,9 @@ app:
- name: 小雅
url: http://alist.xiaoya.pro
searchable: true
indexFile: /the/path/to/index.txt
indexFile: /the/path/to/index.xiaoya.txt
- name: Har01d
url: http://alist.har01d.cn
searchable: true
indexFile: http://d.har01d.cn/index.full.zip
```
10 changes: 10 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,16 @@
<artifactId>commons-io</artifactId>
<version>2.11.0</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-text</artifactId>
<version>1.10.0</version>
</dependency>
<dependency>
<groupId>com.hankcs</groupId>
<artifactId>hanlp</artifactId>
<version>portable-1.8.3</version>
</dependency>

<dependency>
<groupId>org.projectlombok</groupId>
Expand Down
119 changes: 86 additions & 33 deletions src/main/java/cn/har01d/alist_tvbox/service/IndexService.java
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,10 @@
import cn.har01d.alist_tvbox.model.FsResponse;
import cn.har01d.alist_tvbox.tvbox.IndexContext;
import cn.har01d.alist_tvbox.tvbox.IndexRequest;
import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.seg.common.Term;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.text.similarity.CosineSimilarity;
import org.springframework.stereotype.Service;
import org.springframework.util.StopWatch;

Expand All @@ -14,9 +17,8 @@
import java.io.FileWriter;
import java.io.IOException;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.time.Duration;
import java.util.*;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;

Expand All @@ -32,33 +34,26 @@ public IndexService(AListService aListService, AppProperties appProperties) {
}

public void index(IndexRequest indexRequest) throws IOException {
StopWatch stopWatch = new StopWatch();
File dir = new File("data/" + indexRequest.getSite());
StopWatch stopWatch = new StopWatch("index");
File dir = new File("data/index/" + indexRequest.getSite());
Files.createDirectories(dir.toPath());
File file = new File(dir, indexRequest.getIndexName() + ".txt");
Files.deleteIfExists(file.toPath());
File fullFile = new File(dir, indexRequest.getIndexName() + ".full.txt");
Files.deleteIfExists(fullFile.toPath());

try (FileWriter writer = new FileWriter(file, true);
FileWriter fullWriter = new FileWriter(fullFile, true)) {
IndexContext context = new IndexContext(indexRequest, writer, fullWriter);
for (String path : indexRequest.getCollection()) {
stopWatch.start("index " + path);
index(context, path, 0);
stopWatch.stop();
}

context.setIncludeFile(true);
for (String path : indexRequest.getSingle()) {
try (FileWriter writer = new FileWriter(file, true)) {
IndexContext context = new IndexContext(indexRequest, writer);
for (String path : indexRequest.getPaths()) {
stopWatch.start("index " + path);
index(context, path, 0);
stopWatch.stop();
}
log.info("index stats: {}", context.stats);
}

zipFile(file, new File(dir, indexRequest.getIndexName() + ".zip"));
log.info("index done: {}", stopWatch.prettyPrint());
File zipFIle = new File(dir, indexRequest.getIndexName() + ".zip");
zipFile(file, zipFIle);
log.info("index done, total time : {} {}", Duration.ofNanos(stopWatch.getTotalTimeNanos()), stopWatch.prettyPrint());
log.info("index file: {}", file.getAbsolutePath());
}

private void zipFile(File file, File output) throws IOException {
Expand All @@ -81,46 +76,51 @@ private void index(IndexContext context, String path, int depth) throws IOExcept
}

FsResponse fsResponse = aListService.listFiles(context.getSite(), path, 1, 0);
if (fsResponse == null || (context.isExcludeExternal() && fsResponse.getProvider().contains("AList"))) {
if (fsResponse == null) {
context.stats.errors++;
return;
}
if (context.isExcludeExternal() && fsResponse.getProvider().contains("AList")) {
return;
}


List<String> files = new ArrayList<>();
for (FsInfo fsInfo : fsResponse.getFiles()) {
if (fsInfo.getType() == 1) {
String newPath = fixPath(path + "/" + fsInfo.getName());
if (exclude(context.getExcludes(), newPath)) {
context.stats.excluded++;
continue;
}

index(context, newPath, depth + 1);
} else if (isMediaFormat(fsInfo.getName())) {
String newPath = fixPath(path + "/" + fsInfo.getName());
if (exclude(context.getExcludes(), newPath)) {
context.stats.excluded++;
continue;
}

files.add(newPath);
context.stats.files++;
files.add(fsInfo.getName());
}
}

if (files.size() > 0 && !context.contains(path)) {
context.write(path);
if (context.isWriteFull()) {
context.writeFull(path);
}
}

for (String line : files) {
if (context.contains(line)) {
if (isSimilar(path, files, context.getStopWords())) {
return;
}

for (String name : files) {
String newPath = fixPath(path + "/" + name);
if (context.contains(newPath)) {
continue;
}
if (context.isIncludeFile()) {
context.write(line);
}
if (context.isWriteFull()) {
context.writeFull(line);
}
context.write(newPath);
}
}

Expand Down Expand Up @@ -155,4 +155,57 @@ private String fixPath(String path) {
return path.replaceAll("/+", "/").replaceAll("\n", "%20");
}

/**
 * Returns the last segment of a slash-separated path.
 *
 * <p>Everything after the final {@code '/'} is returned. A slash in the very
 * first position counts as a separator too, so {@code "/movies"} yields
 * {@code "movies"} (previously the leading slash was kept because only
 * indexes greater than zero were accepted). A path without any slash is
 * returned unchanged.
 *
 * @param path slash-separated path; must not be {@code null}
 * @return the text following the last {@code '/'}, or {@code path} itself
 *         when it contains no slash
 */
private String getFolderName(String path) {
    int index = path.lastIndexOf('/');
    // lastIndexOf returns -1 when there is no slash; 0 is a valid match
    // (top-level folder) and must be stripped as well.
    if (index >= 0) {
        return path.substring(index + 1);
    }
    return path;
}

/**
 * Decides whether a list of names is similar enough to be treated as one group.
 *
 * <p>Each name is turned into a term vector via {@code getVector}; the cosine
 * similarity of every pair of neighbouring names is averaged, and the list is
 * considered similar when that average is strictly greater than {@code 0.9}.
 * An empty list is reported as similar. A single name is compared against the
 * folder name derived from {@code path}.
 *
 * @param path      path the names belong to; used for the single-name case and for logging
 * @param sentences names to compare, in order
 * @param stopWords patterns removed from each name before it is vectorised
 * @return {@code true} when the list is empty or the average adjacent similarity exceeds 0.9
 */
public boolean isSimilar(String path, List<String> sentences, Set<String> stopWords) {
    if (sentences.isEmpty()) {
        return true;
    }

    // With only one name there is nothing to pair it with, so the folder
    // name is appended as the second element of the comparison.
    List<String> candidates = sentences;
    if (sentences.size() == 1) {
        candidates = new ArrayList<>(sentences);
        candidates.add(getFolderName(path));
    }

    CosineSimilarity metric = new CosineSimilarity();
    double total = 0.0;
    Map<CharSequence, Integer> previous = getVector(stopWords, candidates.get(0));
    for (String sentence : candidates.subList(1, candidates.size())) {
        Map<CharSequence, Integer> current = getVector(stopWords, sentence);
        total += metric.cosineSimilarity(previous, current);
        previous = current;
    }
    double average = total / (candidates.size() - 1);

    log.debug("cosineSimilarity {} : {}", path, average);
    return average > 0.9;
}

/**
 * Converts a piece of text into a weighted term vector.
 *
 * <p>The text is first cleaned: every stop word is removed (each one is
 * applied through {@code String.replaceAll}, i.e. as a regular expression),
 * digit runs are replaced by a space and whitespace runs are collapsed. The
 * cleaned text is then split with {@code HanLP.segment}, and each term adds
 * its weight to the entry keyed by the term's word.
 *
 * @param stopWords patterns to strip from the text before segmentation
 * @param text      text to vectorise
 * @return map from term word to accumulated weight
 */
private Map<CharSequence, Integer> getVector(Set<String> stopWords, String text) {
    String cleaned = text;
    for (String stopWord : stopWords) {
        cleaned = cleaned.replaceAll(stopWord, "");
    }
    cleaned = cleaned.replaceAll("\\d+", " ");
    cleaned = cleaned.replaceAll("\\s+", " ");

    Map<CharSequence, Integer> vector = new HashMap<>();
    for (Term term : HanLP.segment(cleaned)) {
        // NOTE(review): the weight is whatever Term.getFrequency() reports,
        // which looks like a dictionary frequency rather than a count within
        // this text - confirm. Zero is bumped to 1 so the term still counts.
        int reported = term.getFrequency();
        int weight = (reported == 0) ? 1 : reported;
        vector.merge(term.word, weight, Integer::sum);
    }
    return vector;
}
}
23 changes: 12 additions & 11 deletions src/main/java/cn/har01d/alist_tvbox/tvbox/IndexContext.java
Original file line number Diff line number Diff line change
Expand Up @@ -9,20 +9,15 @@

@Data
public class IndexContext {
public Stats stats = new Stats();
private final IndexRequest indexRequest;
private final FileWriter writer;
private final FileWriter fullWriter;
private boolean includeFile;
private Set<String> set = new HashSet<>();

public String getSite() {
return indexRequest.getSite();
}

public boolean isWriteFull() {
return indexRequest.isWriteFull();
}

public boolean isExcludeExternal() {
return indexRequest.isExcludeExternal();
}
Expand All @@ -35,19 +30,25 @@ public Set<String> getExcludes() {
return indexRequest.getExcludes();
}

public Set<String> getStopWords() {
return indexRequest.getStopWords();
}

public boolean contains(String key) {
return set.contains(key);
}

public void write(String path) throws IOException {
set.add(path);
stats.indexed++;
writer.write(path + "\n");
}

public void writeFull(String path) throws IOException {
if (indexRequest.isWriteFull()) {
set.add(path);
fullWriter.write(path + "\n");
}
/**
 * Counters collected during one indexing run and logged when the run finishes.
 */
@Data
public static class Stats {
// Media files found while walking the site that were not excluded.
public int files;
// Paths written to the index file; incremented on every write.
public int indexed;
// Directory listings that returned no response.
public int errors;
// Directories or files skipped because they matched an exclude entry.
public int excluded;
}
}
5 changes: 2 additions & 3 deletions src/main/java/cn/har01d/alist_tvbox/tvbox/IndexRequest.java
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,9 @@
public class IndexRequest {
private String site;
private String indexName = "index";
private boolean writeFull;
private boolean excludeExternal;
private int maxDepth = 10;
private Set<String> collection = new HashSet<>();
private Set<String> single = new HashSet<>();
private Set<String> paths = new HashSet<>();
private Set<String> stopWords = new HashSet<>();
private Set<String> excludes = new HashSet<>();
}

0 comments on commit 1040fc0

Please sign in to comment.