Add equality to partitioned data set

Hernán Morales Durand · Hernán Morales Durand · commit de2790e3b2d7 · 2023-12-14T08:52:48.000+01:00
Add a test method for the train/test split
diff --git a/src/AI-DataPartitioners-Tests/AIRandomPartitionerTest.class.st b/src/AI-DataPartitioners-Tests/AIRandomPartitionerTest.class.st
@@ -1,10 +1,24 @@
 Class {
 	#name : 'AIRandomPartitionerTest',
 	#superclass : 'TestCase',
+	#instVars : [
+		'partitioner',
+		'df'
+	],
 	#category : 'AI-DataPartitioners-Tests',
 	#package : 'AI-DataPartitioners-Tests'
 }
 
+{ #category : 'running' }
+AIRandomPartitionerTest >> setUp [
+	"Create the fixture df: three rows named A, B, C and three named columns."
+
+	super setUp.
+	df := DataFrame
+		withRows: #( #( 'Barcelona' 1.609 true ) #( 'Dubai' 2.789 true ) #( 'London' 8.788 false ) )
+		rowNames: #( 'A' 'B' 'C' ) columnNames: #( 'City' 'Population' 'BeenThere' )
+]
+
 { #category : 'tests' }
 AIRandomPartitionerTest >> testSplitDataWithProportionsCase1 [
 
@@ -68,10 +82,36 @@ AIRandomPartitionerTest >> testSplitDataWithSizesCase2 [
 ]
 
 { #category : 'tests' }
-AIRandomPartitionerTest >> testSplitTrainTestFromFeaturesUsingTargetWithProportionsShuffle [
+AIRandomPartitionerTest >> testSplitTrainTestFromUsingTargetColumnWithProportionsShuffle [
 
-	| expected |
+	| expectedPartition partitionedDataSet |
 	
-	expected := self.
-	self assert: false
+	expectedPartition := AIPartitionedDataSet new
+		xTrain: (DataFrame 
+			withRows: #( #( 'Barcelona' 1.609 ) #( 'London' 8.788 )) 
+			rowNames: #('A' 'C')
+			columnNames: #( 'City' 'Population' ));
+		xTest: (DataFrame 
+			withRows: #( #( 'Dubai' 2.789 )) 
+			rowNames: #('B')
+			columnNames: #( 'City' 'Population' ));
+		yTrain: (DataFrame 
+			withRows: #( #( true ) #( false )) 
+			rowNames: #('A' 'C')
+			columnNames:  #( 'BeenThere' ));
+		yTest: (DataFrame 
+			withRows: #( #( true )) 
+			rowNames: #('B')
+			columnNames:  #( 'BeenThere' ));
+		yourself.
+	"Split the setUp fixture df on target column 'BeenThere' with proportions 0.7 / 0.3 and seed 1."
+	partitionedDataSet := (AIRandomPartitioner new 
+		splitTrainTestFrom: df 
+		usingTargetColumn: #('BeenThere') 
+		withProportions: #(0.7 0.3)  
+		seed: 1).
+	"NOTE(review): the selector says Shuffle but the call above uses the seed: variant - confirm the intended name."
+	self
+		assert: partitionedDataSet
+		equals: expectedPartition
 ]
diff --git a/src/AI-DataPartitioners/AIPartitionedDataSet.class.st b/src/AI-DataPartitioners/AIPartitionedDataSet.class.st
@@ -24,6 +24,34 @@ Class {
 	#package : 'AI-DataPartitioners'
 }
 
+{ #category : 'comparing' }
+AIPartitionedDataSet >> = aPartitionedDataSet [
+	"Answer true when aPartitionedDataSet is the receiver itself, or is an
+	instance of the receiver's class whose xTest, yTest, xTrain and yTrain
+	are all equal to the receiver's. Answer false otherwise."
+
+	self == aPartitionedDataSet ifTrue: [ ^ true ].
+	self class = aPartitionedDataSet class ifFalse: [ ^ false ].
+	"The nested and: blocks stop at the first partition that differs."
+	^ self xTest = aPartitionedDataSet xTest and: [
+		self yTest = aPartitionedDataSet yTest and: [
+			self xTrain = aPartitionedDataSet xTrain and: [
+				self yTrain = aPartitionedDataSet yTrain
+			]
+		]
+	]
+]
+
+{ #category : 'comparing' }
+AIPartitionedDataSet >> hash [
+	"Answer the species hash bitXor-ed with the hashes of xTest, xTrain, yTest
+	and yTrain, i.e. the same four partitions that #= compares."
+
+	^ { self xTest. self xTrain. self yTest. self yTrain }
+		inject: self species hash
+		into: [ :combined :each | combined bitXor: each hash ]
+]
+
 { #category : 'accessing' }
 AIPartitionedDataSet >> xTest [
 
diff --git a/src/AI-DataPartitioners/AIRandomPartitioner.class.st b/src/AI-DataPartitioners/AIRandomPartitioner.class.st
@@ -141,7 +141,30 @@ AIRandomPartitioner >> split: aCollection withSizes: aCollectionOfSizes [
 ]
 
 { #category : 'api' }
-AIRandomPartitioner >> splitTrainTestFromFeatures: aDataFrame usingTarget: targetCollection withProportions: aTwoElementCollectionOfProportions shuffle: aBoolean [ 
+AIRandomPartitioner >> splitTrainTestFrom: aDataFrame usingTargetColumn: targetCollection withProportions: aTwoElementCollectionOfProportions seed: aNumber [ 
+	"Answer an <AIPartitionedDataSet> built from aDataFrame.
+
+	aDataFrame is split with #split:withProportions:seed:. The first subset
+	answered becomes the train side and the second one the test side. On each
+	side, the columns selected by targetCollection go to yTrain / yTest and
+	the result of #columnsAllBut: goes to xTrain / xTest."
+
+	| subsets trainSet testSet |
+
+	subsets := self split: aDataFrame withProportions: aTwoElementCollectionOfProportions seed: aNumber.
+	trainSet := subsets first.
+	testSet := subsets second.
+
+	^ AIPartitionedDataSet new
+		xTrain: (trainSet columnsAllBut: targetCollection);
+		yTrain: (trainSet columns: targetCollection);
+		xTest: (testSet columnsAllBut: targetCollection);
+		yTest: (testSet columns: targetCollection);
+		yourself
+]
+
+{ #category : 'api' }
+AIRandomPartitioner >> splitTrainTestFrom: aDataFrame usingTargetColumn: targetCollection withProportions: aTwoElementCollectionOfProportions shuffle: aBoolean [ 
 	"Answer a <AIPartitionedDataSet>. 	Split the receiver's data into two sets: train and test.
 	
 	xTrain and yTrain sets are used for training and fitting the model.