diff --git a/src/com/jgaap/canonicizers/SmashI.java b/src/com/jgaap/canonicizers/SmashI.java
new file mode 100644
index 000000000..a1c4b3e35
--- /dev/null
+++ b/src/com/jgaap/canonicizers/SmashI.java
@@ -0,0 +1,90 @@
+/*
+ * JGAAP -- a graphical program for stylometric authorship attribution
+ * Copyright (C) 2009,2011 by Patrick Juola
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+package com.jgaap.canonicizers;
+
+import com.jgaap.generics.CanonicizationException;
+import com.jgaap.generics.Canonicizer;
+
+/**
+ * Canonicizer that "smashes" (lowercases) every capital "I" that is used as
+ * a standalone word.
+ * <p>
+ * An "I" counts as a word only when each side of it is either whitespace or
+ * an end of the text, so an "I" inside a longer token ("IPad") or directly
+ * next to punctuation ("I'm", "I,") is left untouched.
+ *
+ * @author David
+ * @since 8.0.0
+ */
+public class SmashI extends Canonicizer {
+
+    @Override
+    public String displayName() {
+        return "Smash I";
+    }
+
+    @Override
+    public String tooltipText() {
+        return "Converts all uses of \"I\" to lowercase.";
+    }
+
+    @Override
+    public String longDescription() {
+        return "Converts all uses of \"I\" as a word to lowercase.";
+    }
+
+    @Override
+    public boolean showInGUI() {
+        return true;
+    }
+
+    /**
+     * Lowercases every standalone capital "I" in the given text.
+     * <p>
+     * The array is modified in place and the same array is returned.
+     *
+     * @param procText the characters to canonicize
+     * @return {@code procText}, with each standalone "I" replaced by "i"
+     * @throws CanonicizationException never thrown by this implementation
+     */
+    @Override
+    public char[] process(char[] procText) throws CanonicizationException {
+        for (int x = 0; x < procText.length; x++) {
+            // Only a capital I that stands alone as a word is smashed.
+            if (procText[x] == 'I' && isStandalone(procText, x)) {
+                procText[x] = Character.toLowerCase(procText[x]);
+            }
+        }
+        return procText;
+    }
+
+    /**
+     * Reports whether the character at {@code index} is delimited on both
+     * sides by whitespace or by an end of the array.
+     */
+    private static boolean isStandalone(char[] text, int index) {
+        // The first and last positions are checked explicitly instead of
+        // reading out of range and catching ArrayIndexOutOfBoundsException.
+        boolean leftBoundary = index == 0
+                || Character.isWhitespace(text[index - 1]);
+        boolean rightBoundary = index == text.length - 1
+                || Character.isWhitespace(text[index + 1]);
+        return leftBoundary && rightBoundary;
+    }
+
+}
diff --git a/unittests/com/jgaap/canonicizers/SmashITest.java b/unittests/com/jgaap/canonicizers/SmashITest.java
new file mode 100644
index 000000000..02bce2dc0
--- /dev/null
+++ b/unittests/com/jgaap/canonicizers/SmashITest.java
@@ -0,0 +1,64 @@
+/*
+ * JGAAP -- a graphical program for stylometric authorship attribution
+ * Copyright (C) 2009,2011 by Patrick Juola
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+package com.jgaap.canonicizers;
+
+import static org.junit.Assert.assertTrue;
+
+import java.util.Arrays;
+
+import org.junit.Test;
+
+import com.jgaap.generics.CanonicizationException;
+
+/**
+ * Unit test for the Smash I canonicizer.
+ *
+ * @author David
+ * @since 8.0.0
+ */
+public class SmashITest {
+    /**
+     * Runs {@code input} through a fresh SmashI and checks the result.
+     */
+    private static void assertSmashed(String expected, String input) throws CanonicizationException {
+        char[] actual = new SmashI().process(input.toCharArray());
+        assertTrue(Arrays.equals(expected.toCharArray(), actual));
+    }
+
+    @Test
+    public void testProcess() throws CanonicizationException {
+        // Test 1 - I on ends: the ends of the text count as word boundaries,
+        // while the I inside "IPad" is part of a longer word and must survive.
+        assertSmashed("i don't care if IPad is supposed to be spelled with a lowercase i",
+                "I don't care if IPad is supposed to be spelled with a lowercase I");
+
+        // Test 2 - I in middle surrounded by spaces
+        assertSmashed("Sometimes i cannot think of creative things to write for unit tests.",
+                "Sometimes I cannot think of creative things to write for unit tests.");
+
+        // Test 3 - I in middle surrounded by tabs. The tabs are written as
+        // escapes so they stay visible and cannot silently become spaces.
+        assertSmashed("Sometimes\ti\tcannot think of creative things to write for unit tests.",
+                "Sometimes\tI\tcannot think of creative things to write for unit tests.");
+
+        // Test 4 - Bunch of I's next to each other with varying case: no I is
+        // delimited by whitespace, so the text must come back unchanged.
+        String run = "iIiIiiIIiiiiiiIIiiIiIiIIiiIIiiIiiIiiIIiiiIiiiIiI";
+        assertSmashed(run, run);
+    }
+}