请教方法
1
wolfie 2023-12-07 18:09:33 +08:00
cn.hutool.core.text.TextSimilarity
|
2
28Sv0ngQfIE7Yloe 2023-12-07 18:11:20 +08:00
相似度的定义是什么
|
3
yyf1234 2023-12-07 18:12:28 +08:00 via iPhone
关键字:编辑距离
|
4
xuld 2023-12-07 18:17:33 +08:00
public class LevenshteinDistance {
public static int distance(String s1, String s2) { int l1 = s1.length(); int l2 = s2.length(); // base cases if (l1 == 0) return l2; if (l2 == 0) return l1; int[][] distance = new int[l1+1][l2+1]; for (int i = 0; i <= l1; i++) { distance[i][0] = i; } for (int j = 0; j <= l2; j++) { distance[0][j] = j; } for (int i = 1; i <= l1; i++) { for (int j = 1; j <= l2; j++) { int cost = (s1.charAt(i-1) == s2.charAt(j-1)) ? 0 : 1; distance[i][j] = Math.min( distance[i-1][j] + 1, // deletion distance[i][j-1] + 1, // insertion distance[i-1][j-1] + cost // substitution ); } } return distance[l1][l2]; } } |
5
chippai 2023-12-07 19:15:08 +08:00
编辑距离,曾经给一个账号注册做优化,前人暴力比较,千万+的账号比完一个请求跑 3s 。
|
6
ldyisbest 2023-12-07 19:17:09 +08:00
先定义相似度,问题要明确
|
7
forgottencoast 2023-12-07 22:20:21 +08:00
|
8
crazyweeds 2023-12-07 22:21:34 +08:00
相似度算法。。
|
9
knightdf 2023-12-07 22:43:03 +08:00
相似度的算法有好多,这个就要看你的数据类型了
|
10
Aresxue 2023-12-08 11:07:14 +08:00
https://github.com/tdebatty/java-string-similarity
推荐标准莱茵斯坦算法和余弦算法,字符串较大就用余弦,对准确率要求高就莱茵斯坦算法。 附上测试一个测试类自己修改下,用自己的实际数据多跑跑 public class SimilarityAlgorithmTest { @Test public void test() { String str1 = "ares"; String str2 = "kele"; LevenshteinDistance levenshtein = LevenshteinDistance.getDefaultInstance(); System.out.println("Levenshtein distance: " + levenshtein.apply(str1, str2)); System.out.println("Levenshtein distance: " + levenshtein.apply(str2, str2)); LevenshteinDistance levenshteinWithThreshold = new LevenshteinDistance(3); // Returns -1 since the actual distance, 4, is higher than the threshold System.out.println("Levenshtein distance: " + levenshteinWithThreshold.apply(str1, str2)); LevenshteinDetailedDistance levenshteinDetailed = LevenshteinDetailedDistance.getDefaultInstance(); System.out.println("Levenshtein detailed distance: " + levenshteinDetailed.apply(str1, str2)); } @Test public void testLevenshteinDistance() { int sLength = 13_162; int strLength = 4_000; String s = StringUtil.random(sLength); String s1 = StringUtil.random(strLength) + s; String s2 = StringUtil.random(strLength) + s; int threshold = 4_096; LevenshteinDistance levenshtein = new LevenshteinDistance(threshold); long startTime = System.currentTimeMillis(); /* threshold 为 32766 时花费 7022ms threshold 为 16384 时花费 2164ms threshold 为 8192 时花费 463ms threshold 为 4096 时花费 146ms threshold 为 2048 时花费 62ms threshold 为 1024 时花费 41ms threshold 为 512 时花费 29ms */ Integer result = levenshtein.apply(s1, s2); System.out.println(System.currentTimeMillis() - startTime); System.out.println("Levenshtein distance: " + result); /* 这种是截取模式比较靠前的指定字符 threshold 为 4096 时花费 67ms */ levenshtein = LevenshteinDistance.getDefaultInstance(); startTime = System.currentTimeMillis(); result = levenshtein.apply(s1.substring(0, threshold), s2.substring(0, threshold)); System.out.println(System.currentTimeMillis() - startTime); System.out.println("Levenshtein distance: " + result); System.out.println(); result = levenshtein.apply(s1, s2); System.out.println("Levenshtein: " + (1.0 - (double) result / (strLength + sLength))); NormalizedLevenshtein normalizedLevenshtein = new NormalizedLevenshtein(); double similarity = normalizedLevenshtein.similarity(s1, s2); System.out.println("NormalizedLevenshtein: " + similarity); Jaccard jaccard = new Jaccard(); similarity = jaccard.similarity(s1, s2); System.out.println("Jaccard: " + similarity); Cosine cosine = new Cosine(); similarity = cosine.similarity(s1, s2); System.out.println("Cosine: " + similarity); QGram qGram = new QGram(); similarity = qGram.distance(s1, s2); System.out.println("QGram: " + (1.0 - similarity / (strLength + sLength))); } @Test public void testAlgorithmPerformance() { String s1 = StringUtil.random(13_162); String s2 = StringUtil.random(13_162); LevenshteinDistance levenshtein = LevenshteinDistance.getDefaultInstance(); long startTime = System.currentTimeMillis(); levenshtein.apply(s1, s2); System.out.println("LevenshteinDistance:" + (System.currentTimeMillis() - startTime)); NormalizedLevenshtein normalizedLevenshtein = new NormalizedLevenshtein(); startTime = System.currentTimeMillis(); normalizedLevenshtein.distance(s1, s2); System.out.println("NormalizedLevenshtein: " + (System.currentTimeMillis() - startTime)); JaroWinkler jaroWinkler = new JaroWinkler(); startTime = System.currentTimeMillis(); jaroWinkler.distance(s1, s2); System.out.println("JaroWinkler: " + (System.currentTimeMillis() - startTime)); LongestCommonSubsequence longestCommonSubsequence = new LongestCommonSubsequence(); startTime = System.currentTimeMillis(); longestCommonSubsequence.apply(s1, s2); System.out.println("LongestCommonSubsequence: " + (System.currentTimeMillis() - startTime)); info.debatty.java.stringsimilarity.MetricLCS metricLCS = new info.debatty.java.stringsimilarity.MetricLCS(); startTime = System.currentTimeMillis(); metricLCS.distance(s1, s2); System.out.println("MetricLCS: " + (System.currentTimeMillis() - startTime)); QGram qGram = new QGram(); startTime = System.currentTimeMillis(); qGram.distance(s1, s2); System.out.println("QGram: " + (System.currentTimeMillis() - startTime)); Cosine cosine = new Cosine(); startTime = System.currentTimeMillis(); cosine.distance(s1, s2); System.out.println("Cosine: " + (System.currentTimeMillis() - startTime)); Jaccard jaccard = new Jaccard(); startTime = System.currentTimeMillis(); jaccard.distance(s1, s2); System.out.println("Jaccard: " + (System.currentTimeMillis() - startTime)); } @Test public void testCosine() { String s1 = StringUtil.random(1_000_000); String s2 = StringUtil.random(1_000_000); Cosine cosine = new Cosine(); long startTime = System.currentTimeMillis(); cosine.distance(s1, s2); System.out.println("Cosine: " + (System.currentTimeMillis() - startTime)); } } |
11
matepi 2023-12-08 16:48:46 +08:00
考虑集合效率的话,还可以用 SimHash
|