feat: take related datastructure by related
phodal committed Dec 25, 2023
1 parent ba61736 commit 70a4c03
Showing 4 changed files with 92 additions and 78 deletions.
140 changes: 70 additions & 70 deletions unit-core/src/main/kotlin/cc/unitmesh/core/intelli/SimilarChunker.kt
@@ -36,7 +36,7 @@ abstract class SimilarChunker(
return chunks.map { list ->
list.map {
val tokenizedFile: Set<String> = tokenize(it).toSet()
similarityScore(currentFileTokens, tokenizedFile)
Companion.similarityScore(currentFileTokens, tokenizedFile)
}
}
}
@@ -50,75 +50,6 @@ abstract class SimilarChunker(
return chunk.split(Regex("[^a-zA-Z0-9]")).filter { it.isNotBlank() }
}

/**
* Calculates the similarity score between two sets of strings.
*
* The similarity score is calculated as the size of the intersection of the two sets divided by the size of the union of the two sets.
*
* @param set1 the first set of strings
* @param set2 the second set of strings
* @return the similarity score between the two sets, represented as a double value
*/
fun similarityScore(set1: Set<String>, set2: Set<String>): Double {
val intersectionSize: Int = (set1 intersect set2).size
val unionSize: Int = (set1 union set2).size
return intersectionSize.toDouble() / unionSize.toDouble()
}

/**
* Tokenizes a given path string into a list of separate words.
*
* The path string represents a file path and is tokenized as follows:
* 1. The file extension is removed.
* 2. The path is split by forward slash (/) or hyphen (-) characters.
* 3. Empty strings are removed from the resulting list.
* 4. Numeric values are removed from the list.
* 5. Common words such as "src", "main", "kotlin", and "java" are removed.
* 6. Camel-case words are split into their component words.
*
* @param path The path string to be tokenized.
* @return A list of individual words extracted from the given path string.
*/
fun pathTokenize(path: String): List<String> {
return path
.substringBeforeLast(".")
.split(Regex("[/\\-]"))
.flatMap { it.split(File.separatorChar) }
.asSequence()
.filter { it.isNotBlank() && !it.matches(Regex(".*\\d.*")) && !COMMON_WORDS.contains(it.lowercase()) }
.flatMap { it.split("(?=[A-Z])".toRegex()) } // split by camel case
.filter { it.isNotBlank() }
.toList()
}

/**
* Calculates the path-level Jaccard similarity between a list of path chunks and a given text.
* Removes common path prefixes such as "src/main/kotlin", "src/main/java", "src/main/resources",
* "src/test/kotlin", "src/test/java", and "src/test/resources" from the path chunks.
* Then tokenizes the cleaned chunks and the given text.
* Computes the Jaccard similarity score between the tokenized text and each tokenized chunk.
*
* @param chunks the list of path chunks to compare with the text
* @param text the text to be compared with the path chunks
* @return a list of Jaccard similarity scores, one for each path chunk
*/
fun pathLevelJaccardSimilarity(chunks: List<String>, text: String): List<Double> {
val cleanedChunks = chunks.map {
it.replace("src/main/kotlin", "")
.replace("src/main/java", "")
.replace("src/main/resources", "")
.replace("src/test/kotlin", "")
.replace("src/test/java", "")
.replace("src/test/resources", "")
}

val textTokens = pathTokenize(text)
return cleanedChunks.map {
val chunkTokens = pathTokenize(it)
similarityScore(textTokens.toSet(), chunkTokens.toSet())
}
}

fun packageNameTokenize(packageName: String): List<String> {
return packageName
.split(".")
@@ -132,5 +63,74 @@ abstract class SimilarChunker(

companion object {
val COMMON_WORDS = setOf("src", "main", "kotlin", "java")

/**
* Calculates the similarity score between two sets of strings.
*
* The similarity score is calculated as the size of the intersection of the two sets divided by the size of the union of the two sets.
*
* @param set1 the first set of strings
* @param set2 the second set of strings
* @return the similarity score between the two sets, represented as a double value
*/
fun similarityScore(set1: Set<String>, set2: Set<String>): Double {
val intersectionSize: Int = (set1 intersect set2).size
val unionSize: Int = (set1 union set2).size
return intersectionSize.toDouble() / unionSize.toDouble()
}

/**
* Tokenizes a given path string into a list of separate words.
*
* The path string represents a file path and is tokenized as follows:
* 1. The file extension is removed.
* 2. The path is split by forward slash (/) or hyphen (-) characters.
* 3. Empty strings are removed from the resulting list.
* 4. Numeric values are removed from the list.
* 5. Common words such as "src", "main", "kotlin", and "java" are removed.
* 6. Camel-case words are split into their component words.
*
* @param path The path string to be tokenized.
* @return A list of individual words extracted from the given path string.
*/
fun pathTokenize(path: String): List<String> {
return path
.substringBeforeLast(".")
.split(Regex("[/\\-]"))
.flatMap { it.split(File.separatorChar) }
.asSequence()
.filter { it.isNotBlank() && !it.matches(Regex(".*\\d.*")) && !COMMON_WORDS.contains(it.lowercase()) }
.flatMap { it.split("(?=[A-Z])".toRegex()) } // split by camel case
.filter { it.isNotBlank() }
.toList()
}

/**
* Calculates the path-level Jaccard similarity between a list of path chunks and a given text.
* Removes common path prefixes such as "src/main/kotlin", "src/main/java", "src/main/resources",
* "src/test/kotlin", "src/test/java", and "src/test/resources" from the path chunks.
* Then tokenizes the cleaned chunks and the given text.
* Computes the Jaccard similarity score between the tokenized text and each tokenized chunk.
*
* @param chunks the list of path chunks to compare with the text
* @param text the text to be compared with the path chunks
* @return a list of Jaccard similarity scores, one for each path chunk
*/
fun pathLevelJaccardSimilarity(chunks: List<String>, text: String): List<Double> {
val cleanedChunks = chunks.map {
it.replace("src/main/kotlin", "")
.replace("src/main/java", "")
.replace("src/main/resources", "")
.replace("src/test/kotlin", "")
.replace("src/test/java", "")
.replace("src/test/resources", "")
}

val textTokens = pathTokenize(text)
return cleanedChunks.map {
val chunkTokens = pathTokenize(it)
similarityScore(textTokens.toSet(), chunkTokens.toSet())
}
}
}
}
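
The three helpers above are pure functions of their inputs, which is why this commit can lift them into the companion object and let call sites reference them statically. A minimal usage sketch, assuming unit-core is on the classpath and the signatures shown in this diff; the sample path and the printed values are illustrative only:

import cc.unitmesh.core.intelli.SimilarChunker

fun main() {
    // Jaccard score: |intersection| / |union| = 2 / 4 = 0.5 for these two sets.
    val score = SimilarChunker.similarityScore(
        setOf("word1", "word2", "word3"),
        setOf("word2", "word3", "word4"),
    )
    println(score) // 0.5

    // Path tokenization drops the extension, numeric segments, and COMMON_WORDS,
    // then splits the remaining segments on camel case.
    val tokens = SimilarChunker.pathTokenize(
        "unit-picker/src/main/kotlin/cc/unitmesh/pick/similar/JavaSimilarChunks.kt"
    )
    println(tokens) // roughly [unit, picker, cc, unitmesh, pick, similar, Java, Similar, Chunks]

    // Path-level Jaccard similarity returns one score per candidate path.
    val scores = SimilarChunker.pathLevelJaccardSimilarity(
        chunks = listOf("src/main/java/com/example/BlogService.java"),
        text = "src/main/java/com/example/BlogController.java",
    )
    println(scores) // a single value between 0.0 and 1.0
}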
@@ -56,7 +56,7 @@ class SimilarChunkerTest {
val set2 = setOf("word2", "word3", "word4")

// When
val result = similarChunker.similarityScore(set1, set2)
val result = SimilarChunker.similarityScore(set1, set2)

// Then
val expected = 0.5
@@ -69,7 +69,7 @@ class SimilarChunkerTest {
val path = "unit-picker/src/main/kotlin/cc/unitmesh/pick/similar/JavaSimilarChunks.kt"

// When
val result = similarChunker.pathTokenize(path)
val result = SimilarChunker.pathTokenize(path)

// Then
val expected = listOf(
@@ -97,7 +97,7 @@ class SimilarChunkerTest {
val text = "src/main/java/com/example/BlogService.java"

// when
val result = similarChunker.pathLevelJaccardSimilarity(chunks, text)
val result = SimilarChunker.pathLevelJaccardSimilarity(chunks, text)

// then
// assert the size of the result is equal to the size of chunks
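
The pathLevelJaccardSimilarity test above is truncated by the diff view. A hedged sketch of how such an assertion could look; the class name, sample chunks, and the use of kotlin.test are assumptions, and the project's actual test framework and expected scores may differ:

import cc.unitmesh.core.intelli.SimilarChunker
import kotlin.test.Test
import kotlin.test.assertEquals
import kotlin.test.assertTrue

class PathLevelJaccardSimilaritySketch {
    @Test
    fun shouldReturnOneScorePerChunkWithinZeroAndOne() {
        // given
        val chunks = listOf(
            "src/main/java/com/example/BlogController.java",
            "src/main/java/com/example/BlogRepository.java",
        )
        val text = "src/main/java/com/example/BlogService.java"

        // when
        val result = SimilarChunker.pathLevelJaccardSimilarity(chunks, text)

        // then
        assertEquals(chunks.size, result.size)
        assertTrue(result.all { it in 0.0..1.0 })
    }
}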
@@ -38,7 +38,7 @@ class JavaSimilarChunker(private val fileTree: HashMap<String, InstructionFileJo

val similarChunks: List<Pair<Double, String>> = allRelatedChunks.map {
val tokenize = tokenize(it)
val score = similarityScore(tokenize.toSet(), tokenize(beforeCursor).toSet())
val score = Companion.similarityScore(tokenize.toSet(), tokenize(beforeCursor).toSet())
score to it
}.sortedByDescending { it.first }
.filter { it.first > codeScoreThreshold }
@@ -61,7 +61,7 @@ class JavaSimilarChunker(private val fileTree: HashMap<String, InstructionFileJo
val packageName = packageNameTokenize(text)
return chunks.mapNotNull { chunk ->
val chunkPackageName = packageNameTokenize(chunk)
val score = similarityScore(packageName.toSet(), chunkPackageName.toSet())
val score = Companion.similarityScore(packageName.toSet(), chunkPackageName.toSet())
if (score >= packageScoreThreshold) {
score to chunk
} else {
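
Both hunks in JavaSimilarChunker.kt follow the same rank-and-threshold pattern: score every candidate against the current context, sort by score descending, and keep only candidates above a threshold. A self-contained sketch of that pattern; jaccard, naiveTokenize, rankAboveThreshold, the 0.1 threshold, and the sample strings are all illustrative, not the project's own names or defaults:

// Jaccard similarity over token sets, mirroring similarityScore above.
fun jaccard(a: Set<String>, b: Set<String>): Double {
    val union = a union b
    return if (union.isEmpty()) 0.0 else (a intersect b).size.toDouble() / union.size
}

// Split on anything that is not a letter or digit, like the tokenize body shown in SimilarChunker.
fun naiveTokenize(chunk: String): List<String> =
    chunk.split(Regex("[^a-zA-Z0-9]")).filter { it.isNotBlank() }

// Score each candidate against the code before the cursor, sort best-first,
// and keep only candidates above the threshold. 0.1 is an illustrative value,
// not the project's codeScoreThreshold.
fun rankAboveThreshold(
    candidates: List<String>,
    beforeCursor: String,
    threshold: Double = 0.1,
): List<String> {
    val contextTokens = naiveTokenize(beforeCursor).toSet()
    return candidates
        .map { jaccard(naiveTokenize(it).toSet(), contextTokens) to it }
        .sortedByDescending { it.first }
        .filter { it.first > threshold }
        .map { it.second }
}

fun main() {
    val beforeCursor = "val blog = blogService.createBlog(dto)"
    val candidates = listOf(
        "fun createBlog(dto: CreateBlogDto): Blog",
        "fun deleteUser(id: Long)",
    )
    println(rankAboveThreshold(candidates, beforeCursor)) // only the createBlog chunk passes the threshold
}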
@@ -1,9 +1,9 @@
package cc.unitmesh.pick.strategy.bizcode

import cc.unitmesh.core.completion.TypedIns
import cc.unitmesh.core.intelli.SimilarChunker
import cc.unitmesh.pick.builder.completionBuilders
import cc.unitmesh.pick.builder.ins.RelatedCodeIns
import cc.unitmesh.pick.builder.unittest.lang.UnitTestService
import cc.unitmesh.pick.strategy.base.CodeStrategyBuilder
import cc.unitmesh.pick.worker.job.JobContext
import chapi.domain.core.CodeContainer
@@ -14,9 +14,8 @@ class RelatedCodeStrategyBuilder(private val context: JobContext) : CodeStrategy
override fun build(): List<TypedIns> {
val language = context.job.fileSummary.language.lowercase()
val container = context.job.container ?: return emptyList()
val relatedCode = findRelatedCode(container)

// 3. checks with rule specified in config
// 1. checks with rule specified in config
val dataStructs = container.DataStructures.filter {
hasIssue(it, context.qualityTypes)
}
@@ -25,6 +24,21 @@ class RelatedCodeStrategyBuilder(private val context: JobContext) : CodeStrategy
return emptyList()
}

val currentPath = container.DataStructures[0].FilePath

// 2. rank related code by file-path similarity and take the 3 most similar
val findRelatedCodeDs = findRelatedCode(container)
val relatedCodePath = findRelatedCodeDs.map { it.FilePath }
val jaccardSimilarity = SimilarChunker.pathLevelJaccardSimilarity(relatedCodePath, currentPath)
val relatedCode = jaccardSimilarity.mapIndexed { index, d ->
findRelatedCodeDs[index] to d
}.sortedByDescending {
it.second
}.take(3).map {
it.first
}

// 3. build completion instruction
val builders = completionBuilders(context.completionBuilderTypes, context)

val codeCompletionIns = dataStructs.map { ds ->
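
The new step 2 above is the heart of this commit: related data structures are no longer passed through wholesale, but ranked by how similar their file paths are to the current file, and only the top three are kept. A sketch of that selection over plain path strings, assuming unit-core is on the classpath; takeMostSimilarPaths, the limit parameter, and the sample paths are illustrative, while the real code pairs scores back to the container's data structures via their FilePath:

import cc.unitmesh.core.intelli.SimilarChunker

// Rank candidate paths by Jaccard similarity to the current file path and keep the top N.
fun takeMostSimilarPaths(relatedPaths: List<String>, currentPath: String, limit: Int = 3): List<String> {
    val scores = SimilarChunker.pathLevelJaccardSimilarity(relatedPaths, currentPath)
    return scores
        .mapIndexed { index, score -> relatedPaths[index] to score } // pair each path with its score
        .sortedByDescending { it.second }                            // most similar first
        .take(limit)                                                 // the commit keeps the top 3
        .map { it.first }
}

fun main() {
    val current = "src/main/java/com/example/blog/BlogService.java"
    val related = listOf(
        "src/main/java/com/example/blog/BlogRepository.java",
        "src/main/java/com/example/user/UserService.java",
        "src/main/java/com/example/blog/BlogController.java",
        "src/main/java/com/example/payment/PaymentGateway.java",
    )
    // Expected to print the two Blog* paths and UserService; PaymentGateway ranks lowest and is dropped.
    println(takeMostSimilarPaths(related, current))
}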
