feat: take related datastructure by related
phodal committed Dec 25, 2023
1 parent ba61736 commit 70a4c03
Showing 4 changed files with 92 additions and 78 deletions.
140 changes: 70 additions & 70 deletions unit-core/src/main/kotlin/cc/unitmesh/core/intelli/SimilarChunker.kt
@@ -36,7 +36,7 @@ abstract class SimilarChunker(
return chunks.map { list ->
list.map {
val tokenizedFile: Set<String> = tokenize(it).toSet()
similarityScore(currentFileTokens, tokenizedFile)
Companion.similarityScore(currentFileTokens, tokenizedFile)
}
}
}
@@ -50,75 +50,6 @@ abstract class SimilarChunker(
return chunk.split(Regex("[^a-zA-Z0-9]")).filter { it.isNotBlank() }
}

/**
* Calculates the similarity score between two sets of strings.
*
* The similarity score is calculated as the size of the intersection of the two sets divided by the size of the union of the two sets.
*
* @param set1 the first set of strings
* @param set2 the second set of strings
* @return the similarity score between the two sets, represented as a double value
*/
fun similarityScore(set1: Set<String>, set2: Set<String>): Double {
val intersectionSize: Int = (set1 intersect set2).size
val unionSize: Int = (set1 union set2).size
return intersectionSize.toDouble() / unionSize.toDouble()
}

/**
* Tokenizes a given path string into a list of separate words.
*
* The path string represents a file path and is tokenized as follows:
* 1. The file extension is removed.
* 2. The path is split by forward slash (/) or hyphen (-) characters.
* 3. Empty strings are removed from the resulting list.
* 4. Numeric values are removed from the list.
* 5. Common words such as "src", "main", "kotlin", and "java" are removed.
* 6. Camel-case words are split into their component words.
*
* @param path The path string to be tokenized.
* @return A list of individual words extracted from the given path string.
*/
fun pathTokenize(path: String): List<String> {
return path
.substringBeforeLast(".")
.split(Regex("[/\\-]"))
.flatMap { it.split(File.separatorChar) }
.asSequence()
.filter { it.isNotBlank() && !it.matches(Regex(".*\\d.*")) && !COMMON_WORDS.contains(it.lowercase()) }
.flatMap { it.split("(?=[A-Z])".toRegex()) } // split by camel case
.filter { it.isNotBlank() }
.toList()
}

/**
* Calculates the path-level Jaccard similarity between a list of path chunks and a given text.
* Removes common path prefixes such as "src/main/kotlin", "src/main/java", "src/main/resources",
* "src/test/kotlin", "src/test/java", and "src/test/resources" from the path chunks.
* Then tokenizes the cleaned chunks and the given text.
* Computes the Jaccard similarity score between the tokenized text and each tokenized chunk.
*
* @param chunks the list of path chunks to compare with the text
* @param text the text to be compared with the path chunks
* @return a list of Jaccard similarity scores, one for each path chunk
*/
fun pathLevelJaccardSimilarity(chunks: List<String>, text: String): List<Double> {
val cleanedChunks = chunks.map {
it.replace("src/main/kotlin", "")
.replace("src/main/java", "")
.replace("src/main/resources", "")
.replace("src/test/kotlin", "")
.replace("src/test/java", "")
.replace("src/test/resources", "")
}

val textTokens = pathTokenize(text)
return cleanedChunks.map {
val chunkTokens = pathTokenize(it)
similarityScore(textTokens.toSet(), chunkTokens.toSet())
}
}

fun packageNameTokenize(packageName: String): List<String> {
return packageName
.split(".")
@@ -132,5 +63,74 @@ abstract class SimilarChunker(

companion object {
val COMMON_WORDS = setOf("src", "main", "kotlin", "java")

/**
* Calculates the similarity score between two sets of strings.
*
* The similarity score is calculated as the size of the intersection of the two sets divided by the size of the union of the two sets.
*
* @param set1 the first set of strings
* @param set2 the second set of strings
* @return the similarity score between the two sets, represented as a double value
*/
fun similarityScore(set1: Set<String>, set2: Set<String>): Double {
val intersectionSize: Int = (set1 intersect set2).size
val unionSize: Int = (set1 union set2).size
return intersectionSize.toDouble() / unionSize.toDouble()
}

/**
* Tokenizes a given path string into a list of separate words.
*
* The path string represents a file path and is tokenized as follows:
* 1. The file extension is removed.
* 2. The path is split by forward slash (/) or hyphen (-) characters.
* 3. Empty strings are removed from the resulting list.
* 4. Numeric values are removed from the list.
* 5. Common words such as "src", "main", "kotlin", and "java" are removed.
* 6. Camel-case words are split into their component words.
*
* @param path The path string to be tokenized.
* @return A list of individual words extracted from the given path string.
*/
fun pathTokenize(path: String): List<String> {
return path
.substringBeforeLast(".")
.split(Regex("[/\\-]"))
.flatMap { it.split(File.separatorChar) }
.asSequence()
.filter { it.isNotBlank() && !it.matches(Regex(".*\\d.*")) && !COMMON_WORDS.contains(it.lowercase()) }
.flatMap { it.split("(?=[A-Z])".toRegex()) } // split by camel case
.filter { it.isNotBlank() }
.toList()
}

/**
* Calculates the path-level Jaccard similarity between a list of path chunks and a given text.
* Removes common path prefixes such as "src/main/kotlin", "src/main/java", "src/main/resources",
* "src/test/kotlin", "src/test/java", and "src/test/resources" from the path chunks.
* Then tokenizes the cleaned chunks and the given text.
* Computes the Jaccard similarity score between the tokenized text and each tokenized chunk.
*
* @param chunks the list of path chunks to compare with the text
* @param text the text to be compared with the path chunks
* @return a list of Jaccard similarity scores, one for each path chunk
*/
fun pathLevelJaccardSimilarity(chunks: List<String>, text: String): List<Double> {
val cleanedChunks = chunks.map {
it.replace("src/main/kotlin", "")
.replace("src/main/java", "")
.replace("src/main/resources", "")
.replace("src/test/kotlin", "")
.replace("src/test/java", "")
.replace("src/test/resources", "")
}

val textTokens = pathTokenize(text)
return cleanedChunks.map {
val chunkTokens = pathTokenize(it)
similarityScore(textTokens.toSet(), chunkTokens.toSet())
}
}
}
}
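
The three helpers above are pure functions of their inputs, which is why this commit can lift them into the companion object and let call sites reference them statically. A minimal usage sketch, assuming unit-core is on the classpath and the signatures shown in this diff; the sample path and the printed values are illustrative only:

import cc.unitmesh.core.intelli.SimilarChunker

fun main() {
    // Jaccard score: |intersection| / |union| = 2 / 4 = 0.5 for these two sets.
    val score = SimilarChunker.similarityScore(
        setOf("word1", "word2", "word3"),
        setOf("word2", "word3", "word4"),
    )
    println(score) // 0.5

    // Path tokenization drops the extension, numeric segments, and COMMON_WORDS,
    // then splits the remaining segments on camel case.
    val tokens = SimilarChunker.pathTokenize(
        "unit-picker/src/main/kotlin/cc/unitmesh/pick/similar/JavaSimilarChunks.kt"
    )
    println(tokens) // roughly [unit, picker, cc, unitmesh, pick, similar, Java, Similar, Chunks]

    // Path-level Jaccard similarity returns one score per candidate path.
    val scores = SimilarChunker.pathLevelJaccardSimilarity(
        chunks = listOf("src/main/java/com/example/BlogService.java"),
        text = "src/main/java/com/example/BlogController.java",
    )
    println(scores) // a single value between 0.0 and 1.0
}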
@@ -56,7 +56,7 @@ class SimilarChunkerTest {
val set2 = setOf("word2", "word3", "word4")

// When
val result = similarChunker.similarityScore(set1, set2)
val result = SimilarChunker.similarityScore(set1, set2)

// Then
val expected = 0.5
@@ -69,7 +69,7 @@ class SimilarChunkerTest {
val path = "unit-picker/src/main/kotlin/cc/unitmesh/pick/similar/JavaSimilarChunks.kt"

// When
val result = similarChunker.pathTokenize(path)
val result = SimilarChunker.pathTokenize(path)

// Then
val expected = listOf(
@@ -97,7 +97,7 @@ class SimilarChunkerTest {
val text = "src/main/java/com/example/BlogService.java"

// when
val result = similarChunker.pathLevelJaccardSimilarity(chunks, text)
val result = SimilarChunker.pathLevelJaccardSimilarity(chunks, text)

// then
// assert the size of the result is equal to the size of chunks
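
The pathLevelJaccardSimilarity test above is truncated by the diff view. A hedged sketch of how such an assertion could look; the class name, sample chunks, and the use of kotlin.test are assumptions, and the project's actual test framework and expected scores may differ:

import cc.unitmesh.core.intelli.SimilarChunker
import kotlin.test.Test
import kotlin.test.assertEquals
import kotlin.test.assertTrue

class PathLevelJaccardSimilaritySketch {
    @Test
    fun shouldReturnOneScorePerChunkWithinZeroAndOne() {
        // given
        val chunks = listOf(
            "src/main/java/com/example/BlogController.java",
            "src/main/java/com/example/BlogRepository.java",
        )
        val text = "src/main/java/com/example/BlogService.java"

        // when
        val result = SimilarChunker.pathLevelJaccardSimilarity(chunks, text)

        // then
        assertEquals(chunks.size, result.size)
        assertTrue(result.all { it in 0.0..1.0 })
    }
}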
@@ -38,7 +38,7 @@ class JavaSimilarChunker(private val fileTree: HashMap<String, InstructionFileJo

val similarChunks: List<Pair<Double, String>> = allRelatedChunks.map {
val tokenize = tokenize(it)
val score = similarityScore(tokenize.toSet(), tokenize(beforeCursor).toSet())
val score = Companion.similarityScore(tokenize.toSet(), tokenize(beforeCursor).toSet())
score to it
}.sortedByDescending { it.first }
.filter { it.first > codeScoreThreshold }
@@ -61,7 +61,7 @@ class JavaSimilarChunker(private val fileTree: HashMap<String, InstructionFileJo
val packageName = packageNameTokenize(text)
return chunks.mapNotNull { chunk ->
val chunkPackageName = packageNameTokenize(chunk)
val score = similarityScore(packageName.toSet(), chunkPackageName.toSet())
val score = Companion.similarityScore(packageName.toSet(), chunkPackageName.toSet())
if (score >= packageScoreThreshold) {
score to chunk
} else {
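
Both hunks in JavaSimilarChunker.kt follow the same rank-and-threshold pattern: score every candidate against the current context, sort by score descending, and keep only candidates above a threshold. A self-contained sketch of that pattern; jaccard, naiveTokenize, rankAboveThreshold, the 0.1 threshold, and the sample strings are all illustrative, not the project's own names or defaults:

// Jaccard similarity over token sets, mirroring similarityScore above.
fun jaccard(a: Set<String>, b: Set<String>): Double {
    val union = a union b
    return if (union.isEmpty()) 0.0 else (a intersect b).size.toDouble() / union.size
}

// Split on anything that is not a letter or digit, like the tokenize body shown in SimilarChunker.
fun naiveTokenize(chunk: String): List<String> =
    chunk.split(Regex("[^a-zA-Z0-9]")).filter { it.isNotBlank() }

// Score each candidate against the code before the cursor, sort best-first,
// and keep only candidates above the threshold. 0.1 is an illustrative value,
// not the project's codeScoreThreshold.
fun rankAboveThreshold(
    candidates: List<String>,
    beforeCursor: String,
    threshold: Double = 0.1,
): List<String> {
    val contextTokens = naiveTokenize(beforeCursor).toSet()
    return candidates
        .map { jaccard(naiveTokenize(it).toSet(), contextTokens) to it }
        .sortedByDescending { it.first }
        .filter { it.first > threshold }
        .map { it.second }
}

fun main() {
    val beforeCursor = "val blog = blogService.createBlog(dto)"
    val candidates = listOf(
        "fun createBlog(dto: CreateBlogDto): Blog",
        "fun deleteUser(id: Long)",
    )
    println(rankAboveThreshold(candidates, beforeCursor)) // only the createBlog chunk passes the threshold
}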
@@ -1,9 +1,9 @@
package cc.unitmesh.pick.strategy.bizcode

import cc.unitmesh.core.completion.TypedIns
import cc.unitmesh.core.intelli.SimilarChunker
import cc.unitmesh.pick.builder.completionBuilders
import cc.unitmesh.pick.builder.ins.RelatedCodeIns
import cc.unitmesh.pick.builder.unittest.lang.UnitTestService
import cc.unitmesh.pick.strategy.base.CodeStrategyBuilder
import cc.unitmesh.pick.worker.job.JobContext
import chapi.domain.core.CodeContainer
@@ -14,9 +14,8 @@ class RelatedCodeStrategyBuilder(private val context: JobContext) : CodeStrategy
override fun build(): List<TypedIns> {
val language = context.job.fileSummary.language.lowercase()
val container = context.job.container ?: return emptyList()
val relatedCode = findRelatedCode(container)

// 3. checks with rule specified in config
// 1. checks with rule specified in config
val dataStructs = container.DataStructures.filter {
hasIssue(it, context.qualityTypes)
}
@@ -25,6 +24,21 @@ class RelatedCodeStrategyBuilder(private val context: JobContext) : CodeStrategy
return emptyList()
}

val currentPath = container.DataStructures[0].FilePath

// 2. rank related code by file-path similarity and take the 3 most similar
val findRelatedCodeDs = findRelatedCode(container)
val relatedCodePath = findRelatedCodeDs.map { it.FilePath }
val jaccardSimilarity = SimilarChunker.pathLevelJaccardSimilarity(relatedCodePath, currentPath)
val relatedCode = jaccardSimilarity.mapIndexed { index, d ->
findRelatedCodeDs[index] to d
}.sortedByDescending {
it.second
}.take(3).map {
it.first
}

// 3. build completion instruction
val builders = completionBuilders(context.completionBuilderTypes, context)

val codeCompletionIns = dataStructs.map { ds ->
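
The new step 2 above is the heart of this commit: related data structures are no longer passed through wholesale, but ranked by how similar their file paths are to the current file, and only the top three are kept. A sketch of that selection over plain path strings, assuming unit-core is on the classpath; takeMostSimilarPaths, the limit parameter, and the sample paths are illustrative, while the real code pairs scores back to the container's data structures via their FilePath:

import cc.unitmesh.core.intelli.SimilarChunker

// Rank candidate paths by Jaccard similarity to the current file path and keep the top N.
fun takeMostSimilarPaths(relatedPaths: List<String>, currentPath: String, limit: Int = 3): List<String> {
    val scores = SimilarChunker.pathLevelJaccardSimilarity(relatedPaths, currentPath)
    return scores
        .mapIndexed { index, score -> relatedPaths[index] to score } // pair each path with its score
        .sortedByDescending { it.second }                            // most similar first
        .take(limit)                                                 // the commit keeps the top 3
        .map { it.first }
}

fun main() {
    val current = "src/main/java/com/example/blog/BlogService.java"
    val related = listOf(
        "src/main/java/com/example/blog/BlogRepository.java",
        "src/main/java/com/example/user/UserService.java",
        "src/main/java/com/example/blog/BlogController.java",
        "src/main/java/com/example/payment/PaymentGateway.java",
    )
    // Expected to print the two Blog* paths and UserService; PaymentGateway ranks lowest and is dropped.
    println(takeMostSimilarPaths(related, current))
}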
