Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added random sort of training data for better training results #48

Merged
merged 1 commit into from
Aug 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ ML_SPAM_DELETION_ENABLED=false
ML_SPAM_AUTOBAN_ENABLED=true
ML_SPAM_AUTOBAN_SCORE_THRESHOLD=-5.0
ML_SPAM_AUTOBAN_CHECK_LAST_MSG_COUNT=10
ML_TRAIN_RANDOM_SORT_DATA=true
ML_TRAIN_INTERVAL_DAYS=30
ML_TRAIN_CRITICAL_MSG_COUNT=5
ML_TRAINING_SET_FRACTION=0.2
Expand Down
1 change: 1 addition & 0 deletions src/VahterBanBot.Tests/ContainerTestBase.fs
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ type VahterTestContainers() =
.WithEnvironment("CLEANUP_OLD_MESSAGES", "false")
.WithEnvironment("ML_ENABLED", "true")
.WithEnvironment("ML_SEED", "42")
.WithEnvironment("ML_TRAIN_RANDOM_SORT_DATA", "false")
.WithEnvironment("ML_SPAM_DELETION_ENABLED", "true")
.WithEnvironment("ML_SPAM_THRESHOLD", "1.0")
.WithEnvironment("ML_STOP_WORDS_IN_CHATS", """{"-42":["2"]}""")
Expand Down
6 changes: 5 additions & 1 deletion src/VahterBanBot/ML.fs
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,11 @@ type MachineLearning(
createdAt = x.created_at
lessThanNMessagesF = if x.less_than_n_messages then 1.0f else 0.0f }
)

|> fun x ->
if botConf.MlTrainRandomSortData then
Array.sortInPlaceBy (fun _ -> Guid.NewGuid()) x
x

let dataView = mlContext.Data.LoadFromEnumerable data
let trainTestSplit = mlContext.Data.TrainTestSplit(dataView, testFraction = botConf.MlTrainingSetFraction)
let trainingData = trainTestSplit.TrainSet
Expand Down
1 change: 1 addition & 0 deletions src/VahterBanBot/Program.fs
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ let botConf =
MlSpamAutobanEnabled = getEnvOr "ML_SPAM_AUTOBAN_ENABLED" "false" |> bool.Parse
MlSpamAutobanCheckLastMsgCount = getEnvOr "ML_SPAM_AUTOBAN_CHECK_LAST_MSG_COUNT" "10" |> int
MlSpamAutobanScoreThreshold = getEnvOr "ML_SPAM_AUTOBAN_SCORE_THRESHOLD" "-5.0" |> double
MlTrainRandomSortData = getEnvOr "ML_TRAIN_RANDOM_SORT_DATA" "true" |> bool.Parse
MlTrainInterval = getEnvOr "ML_TRAIN_INTERVAL_DAYS" "30" |> int |> TimeSpan.FromDays
MlTrainCriticalMsgCount = getEnvOr "ML_TRAIN_CRITICAL_MSG_COUNT" "5" |> int
MlTrainingSetFraction = getEnvOr "ML_TRAINING_SET_FRACTION" "0.2" |> float
Expand Down
1 change: 1 addition & 0 deletions src/VahterBanBot/Types.fs
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ type BotConfiguration =
MlSpamAutobanEnabled: bool
MlSpamAutobanCheckLastMsgCount: int
MlSpamAutobanScoreThreshold: double
MlTrainRandomSortData: bool
MlTrainInterval: TimeSpan
MlTrainCriticalMsgCount: int
MlTrainingSetFraction: float
Expand Down