Skip to content

Commit

Permalink
feat: add line as quality threshold
Browse files Browse the repository at this point in the history
  • Loading branch information
phodal committed Dec 21, 2023
1 parent 6fa98c2 commit 88898ab
Show file tree
Hide file tree
Showing 7 changed files with 51 additions and 19 deletions.
7 changes: 5 additions & 2 deletions unit-cli/src/main/kotlin/cc/unitmesh/runner/Picker.kt
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,11 @@ private val logger = org.slf4j.LoggerFactory.getLogger(PickerCommand::class.java
fun main(args: Array<String>) = PickerCommand().main(args)

class PickerCommand : CliktCommand() {
// todo: find a way to make this configurable
private val projectTypedCompletionSize by option(help = "Limit each CompletionType size").int().default(100)
private val gitDepth by option(help = "Git depth").int().default(1)
private val maxCompletionInOneFile by option(help = "Max completion in one file").int().default(3)
private val maxCharInCode by option(help = "Max char in code").int().default(1500)
private val maxLineInCode by option(help = "Max line in code").int().default(320)

override fun run() {
val outputDir = File("datasets" + File.separator + "origin")
Expand All @@ -45,7 +46,9 @@ class PickerCommand : CliktCommand() {
language = code.language,
maxCompletionInOneFile = maxCompletionInOneFile,
gitDepth = gitDepth,
completionTypeSize = projectTypedCompletionSize
completionTypeSize = projectTypedCompletionSize,
maxCharInCode = maxCharInCode,
maxLineInCode = maxLineInCode,
)

val content = SimpleCodePicker(pickerOption).execute()
Expand Down
11 changes: 9 additions & 2 deletions unit-picker/src/main/kotlin/cc/unitmesh/pick/SimpleCodePicker.kt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import cc.unitmesh.pick.builder.InstructionFileJob
import cc.unitmesh.pick.builder.PickerOption
import cc.unitmesh.pick.prompt.Instruction
import cc.unitmesh.pick.walker.PickDirectoryWalker
import cc.unitmesh.pick.worker.QualityThreshold
import cc.unitmesh.pick.worker.WorkerContext
import cc.unitmesh.pick.worker.WorkerManager
import kotlinx.coroutines.channels.Channel
Expand Down Expand Up @@ -81,7 +82,13 @@ class SimpleCodePicker(private val config: PickerOption) : CodePicker {
pureDataFileName = config.pureDataFileName(),
config.completionTypes,
config.maxCompletionInOneFile,
config.completionTypeSize
config.completionTypeSize,
qualityThreshold = QualityThreshold(
complexity = QualityThreshold.MAX_COMPLEXITY,
fileSize = QualityThreshold.MAX_FILE_SIZE,
maxLineInCode = config.maxLineInCode,
maxCharInCode = config.maxCharInCode,
)
)
)
val walkdirChannel = Channel<FileJob>()
Expand All @@ -95,7 +102,7 @@ class SimpleCodePicker(private val config: PickerOption) : CodePicker {
launch {
for (fileJob in walkdirChannel) {
languageWorker.processFile(fileJob)?.let {
workerManager.addJob(InstructionFileJob.from(it))
workerManager.addJobByThreshold(InstructionFileJob.from(it))
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,19 @@ data class PickerOption(
val maxCompletionInOneFile: Int = 3,
val gitDepth: Int = 1,
val completionTypeSize: Int,
/**
* https://docs.sweep.dev/blogs/chunking-2m-files
* The average token-to-character ratio for code is ~1:5, so 1500 characters is about 300 tokens, which fits within
* the 512-token cap of embedding models. Further, 1500 characters correspond approximately to 40 lines, roughly
* equivalent to a small to medium-sized function or class.
*
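* Our token length is 2048, but this limit is scaled from a 1024-token budget: 1500 * 1024 / 512 = 3000 characters
* (scaling by the full 2048 tokens would give 6000).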
*/
val maxCharInCode: Int = 3000,
/**
* Our token length is 2048, so we can use 40 * 2048 / 512 = 160 lines; Java code has a lot of newlines, so we double it to 320.
*/
val maxLineInCode: Int = 320,
) {
fun pureDataFileName(): String {
return baseDir + File.separator + repoFileName() + ".jsonl"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,13 @@ data class WorkerContext(

/**
 * Upper bounds a file has to stay within before it is accepted as a job.
 *
 * In the visible part of `WorkerManager.addJobByThreshold`, a file is skipped when its summary's
 * line count exceeds [maxLineInCode], its complexity exceeds [complexity], or its byte size exceeds [fileSize].
 * How [maxCharInCode] is enforced is not visible in this view.
 *
 * @property complexity maximum accepted complexity of a file; defaults to [MAX_COMPLEXITY].
 * @property fileSize maximum accepted file size, compared against the summary's byte count; defaults to
 * [MAX_FILE_SIZE].
 * @property maxLineInCode maximum accepted number of lines.
 * NOTE(review): the default here is 160, while `PickerOption.maxLineInCode` and the CLI option both default to
 * 320 — confirm the difference is intended.
 * @property maxCharInCode maximum number of characters in a piece of code; the default (3000) matches the default of
 * `PickerOption.maxCharInCode`.
 */
@Serializable
data class QualityThreshold(
val complexity: Long = 100,
val fileSize: Long = 1024 * 64,
)
val complexity: Long = MAX_COMPLEXITY,
val fileSize: Long = MAX_FILE_SIZE,
val maxLineInCode: Int = 160,
val maxCharInCode: Int = 3000,
) {
companion object {
// Default ceiling for a file's complexity score.
const val MAX_COMPLEXITY: Long = 100
// Default ceiling for a file's size: 64 * 1024 = 65536 bytes.
const val MAX_FILE_SIZE: Long = 1024 * 64
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -23,31 +23,29 @@ class WorkerManager(private val workerContext: WorkerContext) {

private val logger: Logger = org.slf4j.LoggerFactory.getLogger(WorkerManager::class.java)

fun addJob(job: InstructionFileJob) {
fun addJobByThreshold(job: InstructionFileJob) {
val summary = job.fileSummary
if (!supportedExtensions.contains(summary.extension)) {
return
}

if (summary.lines > workerContext.qualityThreshold.maxLineInCode) {
logger.info("skip file ${summary.location} for lines ${summary.lines}")
return
}

if (summary.complexity > workerContext.qualityThreshold.complexity) {
logger.info("skip file ${summary.location} for complexity ${summary.complexity}")
// TODO: add debugging option
// if (summary.filename.endsWith(".java")) {
// println("| filename: ${summary.filename} | complexity: ${summary.complexity} | code: ${summary.lines} | size: ${summary.bytes} | location: ${summary.location} |")
// }
return;
}

if (summary.binary || summary.generated || summary.minified) {
return
}

// if the file size is too large, we just try 64k
if (summary.bytes > workerContext.qualityThreshold.fileSize) {
logger.info("skip file ${summary.location} for size ${summary.bytes}")
// TODO: add debugging option
// if (summary.filename.endsWith(".java")) {
// println("| filename: ${summary.filename} | complexity: ${summary.complexity} | code: ${summary.lines} | size: ${summary.bytes} | location: ${summary.location} |")
// }
return
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ class PickerOptionTest {
url = "https://github.com/functionaljava/functionaljava",
branch = "series/5.x",
completionTypeSize = 10,
maxCharInCode = maxCharInCode,
).pureDataFileName()

// Then
Expand All @@ -24,7 +25,8 @@ class PickerOptionTest {
val pickerOption = PickerOption(
url = "https://github.com/example/repo.git",
branch = "main",
completionTypeSize = 10
completionTypeSize = 10,
maxCharInCode = maxCharInCode
)

// When
Expand All @@ -39,7 +41,8 @@ class PickerOptionTest {
// Given
val pickerOption = PickerOption(
url = "https://github.com/example/repo*:?<>.|",
completionTypeSize = 10
completionTypeSize = 10,
maxCharInCode = maxCharInCode
)

// When
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,8 @@ class SimpleCodePickerTest {
val picker = SimpleCodePicker(
PickerOption(
url = "https://github.com/unit-mesh/unit-eval-testing",
completionTypeSize = 10
completionTypeSize = 10,
maxCharInCode = maxCharInCode
)
)

Expand Down

0 comments on commit 88898ab

Please sign in to comment.