Skip to content

Commit

Permalink
Added random shuffling of training data for better training results (#48)
Browse files Browse the repository at this point in the history
  • Loading branch information
Szer committed Aug 7, 2024
1 parent 2983943 commit 3501132
Show file tree
Hide file tree
Showing 5 changed files with 9 additions and 1 deletion.
1 change: 1 addition & 0 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ ML_SPAM_DELETION_ENABLED=false
ML_SPAM_AUTOBAN_ENABLED=true
ML_SPAM_AUTOBAN_SCORE_THRESHOLD=-5.0
ML_SPAM_AUTOBAN_CHECK_LAST_MSG_COUNT=10
ML_TRAIN_RANDOM_SORT_DATA=true
ML_TRAIN_INTERVAL_DAYS=30
ML_TRAIN_CRITICAL_MSG_COUNT=5
ML_TRAINING_SET_FRACTION=0.2
Expand Down
1 change: 1 addition & 0 deletions src/VahterBanBot.Tests/ContainerTestBase.fs
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ type VahterTestContainers() =
.WithEnvironment("CLEANUP_OLD_MESSAGES", "false")
.WithEnvironment("ML_ENABLED", "true")
.WithEnvironment("ML_SEED", "42")
.WithEnvironment("ML_TRAIN_RANDOM_SORT_DATA", "false")
.WithEnvironment("ML_SPAM_DELETION_ENABLED", "true")
.WithEnvironment("ML_SPAM_THRESHOLD", "1.0")
.WithEnvironment("ML_STOP_WORDS_IN_CHATS", """{"-42":["2"]}""")
Expand Down
6 changes: 5 additions & 1 deletion src/VahterBanBot/ML.fs
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,11 @@ type MachineLearning(
createdAt = x.created_at
lessThanNMessagesF = if x.less_than_n_messages then 1.0f else 0.0f }
)

|> fun x ->
if botConf.MlTrainRandomSortData then
Array.sortInPlaceBy (fun _ -> Guid.NewGuid()) x
x

let dataView = mlContext.Data.LoadFromEnumerable data
let trainTestSplit = mlContext.Data.TrainTestSplit(dataView, testFraction = botConf.MlTrainingSetFraction)
let trainingData = trainTestSplit.TrainSet
Expand Down
1 change: 1 addition & 0 deletions src/VahterBanBot/Program.fs
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ let botConf =
MlSpamAutobanEnabled = getEnvOr "ML_SPAM_AUTOBAN_ENABLED" "false" |> bool.Parse
MlSpamAutobanCheckLastMsgCount = getEnvOr "ML_SPAM_AUTOBAN_CHECK_LAST_MSG_COUNT" "10" |> int
MlSpamAutobanScoreThreshold = getEnvOr "ML_SPAM_AUTOBAN_SCORE_THRESHOLD" "-5.0" |> double
MlTrainRandomSortData = getEnvOr "ML_TRAIN_RANDOM_SORT_DATA" "true" |> bool.Parse
MlTrainInterval = getEnvOr "ML_TRAIN_INTERVAL_DAYS" "30" |> int |> TimeSpan.FromDays
MlTrainCriticalMsgCount = getEnvOr "ML_TRAIN_CRITICAL_MSG_COUNT" "5" |> int
MlTrainingSetFraction = getEnvOr "ML_TRAINING_SET_FRACTION" "0.2" |> float
Expand Down
1 change: 1 addition & 0 deletions src/VahterBanBot/Types.fs
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ type BotConfiguration =
MlSpamAutobanEnabled: bool
MlSpamAutobanCheckLastMsgCount: int
MlSpamAutobanScoreThreshold: double
MlTrainRandomSortData: bool
MlTrainInterval: TimeSpan
MlTrainCriticalMsgCount: int
MlTrainingSetFraction: float
Expand Down

0 comments on commit 3501132

Please sign in to comment.