Skip to content

Commit

Permalink
feat: make for sample
Browse files Browse the repository at this point in the history
  • Loading branch information
phodal committed Dec 15, 2023
1 parent ffd6930 commit 684cc43
Show file tree
Hide file tree
Showing 4 changed files with 39 additions and 14 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,15 @@ package cc.unitmesh.core.intelli

import java.io.File

val SNIPPET_LENGTH = 60

val MAX_RELEVANT_FILES = 20

/**
* Finds similar chunks together with their paths.
* Each supported language should provide its own implementation.
*/
abstract class SimilarChunker(var snippetLength: Int = SNIPPET_LENGTH, var maxRelevantFiles: Int = MAX_RELEVANT_FILES) {
abstract class SimilarChunker(
var snippetLength: Int = 60,
var maxRelevantFiles: Int = 20,
val scoreThreshold: Double = 0.5,
) {
/**
* Returns a list of the most recently edited files in the project.
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ data class PickerOption(
val language: String = "java",
val baseDir: String = "datasets",
val builderTypes: List<InstructionType> = listOf(
InstructionType.RELATED_CODE_COMPLETION
InstructionType.SIMILAR_CHUNKS_COMPLETION
),
val codeQualityTypes: List<CodeQualityType> = listOf(),
val builderConfig: BuilderConfig = BuilderConfig(),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ class SimilarChunksCompletionBuilder(private val context: InstructionContext) :
// collect all similar chunk structures by imports, if they exist in the file tree
val similarChunks: List<String> = similarChunker.calculate(
beforeCursor,
it.Package + "." + ds.NodeName,
ds.Package + "." + ds.NodeName,
).chunks ?: emptyList()

SimilarChunkCompletionIns(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,23 +12,37 @@ import cc.unitmesh.pick.config.InstructionFileJob
* This class extends the SimilarChunker class, providing additional functionality for handling Java code.
*/
class JavaSimilarChunker(private val fileTree: HashMap<String, InstructionFileJob>) : SimilarChunker() {
companion object {
val logger = org.slf4j.LoggerFactory.getLogger(JavaSimilarChunker::class.java)
}

/**
* Calculates the similar code chunks based on the given text and canonical name.
*
* @param text The text in which the code chunks are present.
* @param canonicalName The canonical name of the code snippet.
* @return A SimilarChunkContext object containing information about related code paths and similar chunks of code.
*/
override fun calculate(text: String, canonicalName: String): SimilarChunkContext {
val lines = text.split("\n")
// take the last `snippetLength` lines of the text, i.e. the lines closest to the cursor
val beforeCursor = lines.takeLast(snippetLength).joinToString("\n")

val canonicalNames = fileTree.keys.toList()
val canonicalNames = fileTree.keys.filter { it != canonicalName }
val relatedCodePath = packageNameLevelJaccardSimilarity(canonicalNames, canonicalName)
.toList()
.sortedByDescending { it.first }
.take(maxRelevantFiles)
.map { it.second }

logger.info("relatedCodePath: $relatedCodePath")

val chunks = chunkedCode(beforeCursor)
val allRelatedChunks = relatedCodePath
.mapNotNull { fileTree[it] }
.map { chunkedCode(it.code) }
.flatten()
.map { chunkedCode(it.code).joinToString("\n") }

logger.info("allRelatedChunks: $allRelatedChunks")

val similarChunks = allRelatedChunks.map {
val score = similarityScore(tokenize(it).toSet(), chunks.toSet())
Expand All @@ -37,15 +51,26 @@ class JavaSimilarChunker(private val fileTree: HashMap<String, InstructionFileJo
.take(maxRelevantFiles)
.map { it.second }

return SimilarChunkContext("java", relatedCodePath, similarChunks.take(3))
// take at most the first 3 similar chunks (the list may be empty)
val similarChunksText = if (similarChunks.size > 3) {
similarChunks.take(3)
} else {
similarChunks
}

return SimilarChunkContext("java", relatedCodePath, similarChunksText)
}

private fun packageNameLevelJaccardSimilarity(chunks: List<String>, text: String): Map<Double, String> {
val packageName = packageNameTokenize(text)
return chunks.map { chunk ->
return chunks.mapNotNull { chunk ->
val chunkPackageName = packageNameTokenize(chunk)
val score = similarityScore(packageName.toSet(), chunkPackageName.toSet())
score to chunk
if (score > scoreThreshold) {
score to chunk
} else {
null
}
}.toMap()
}

Expand Down Expand Up @@ -82,7 +107,7 @@ class JavaSimilarChunker(private val fileTree: HashMap<String, InstructionFileJo

fun chunkedCode(code: String): List<String> {
return code
.split("\n", limit = snippetLength)
.split("\n")
.filter {
val trim = it.trim()
!(trim.startsWith("import ") || trim.startsWith("package "))
Expand Down

0 comments on commit 684cc43

Please sign in to comment.