Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 69 additions & 19 deletions src/iitb/Segment/DataCruncher.java
Original file line number Diff line number Diff line change
Expand Up @@ -140,8 +140,11 @@ class TestData {
String seq[];
String fname;
String delimit, impDelimit;
TestData(String file,String delimitP,String impDelimitP, String grpDelimit) {
try {
final boolean lowerCase;
TestData(String file, String delimitP, String impDelimitP, String grpDelimit,
final boolean lowerCase) {
this.lowerCase = lowerCase;
try {
fname = file;
rin =new BufferedReader(new FileReader(file+".raw"));
delimit = delimitP;
Expand Down Expand Up @@ -170,7 +173,8 @@ int[] groupedTokens() {
String[] nextRecord() {
try {
if ((line=rin.readLine())!=null) {
StringTokenizer tok=new StringTokenizer(line.toLowerCase(),delimit,true);
final String correctedLine = lowerCase ? line.toLowerCase() : line;
StringTokenizer tok=new StringTokenizer(correctedLine,delimit,true);
int len = tok.countTokens();
if ((seq == null) || (seq.length < len))
seq =new String[len];
Expand Down Expand Up @@ -298,16 +302,29 @@ void close() {

public class DataCruncher {

/**
 * Legacy three-argument entry point, retained so existing callers keep
 * working. It delegates to the four-argument overload with lower-casing
 * switched on.
 *
 * @param text the text to split into tokens
 * @param delimit A set of delimiters used by the Tokenizer.
 * @param impDelimit Delimiters to be retained for tagging.
 * @return an Array of tokens.
 */
protected static String[] getTokenList(String text, String delimit,
        String impDelimit) {
    // The old interface always converts the text to lower case.
    final boolean alwaysLowerCase = true;
    return getTokenList(text, delimit, impDelimit, alwaysLowerCase);
}

/**
*
* @param text
* @param delimit A set of delimiters used by the Tokenizer.
* @param impDelimit Delimiters to be retained for tagging.
* @param impDelimit Delimiters to be retained for tagging.
* @param lowerCase convert tokens to lower case
* @return an Array of tokens.
*/
protected static String[] getTokenList(String text, String delimit,
String impDelimit) {
text = text.toLowerCase();
String impDelimit,boolean lowerCase) {
text = lowerCase ? text.toLowerCase() : text;
StringTokenizer textTok = new StringTokenizer(text, delimit, true);
//This allocates space for all tokens and delimiters,
//but will make a second pass through the String unnecessary.
Expand All @@ -322,7 +339,27 @@ protected static String[] getTokenList(String text, String delimit,
//Finally, the storage is trimmed to the actual size.
return tokenList.toArray(new String[tokenList.size()]);
}


/**
 * Reads a block of text ended by a blank line or the end of the file.
 * The block contains lines of tokens with a label.
 *
 * NOTE: Legacy overload kept for compatibility. It forwards to the
 * variant taking a {@code lowerCase} flag and always passes {@code true},
 * so the input is lower-cased.
 *
 * @param numLabels The maximal number of labels expected
 * @param tin reader the lines are taken from
 * @param tagDelimit Separator between tokens and tag number
 * @param delimit Used to define token boundaries
 * @param impDelimit Delimiters to be retained for tagging
 * @param t Stores the labels
 * @param cArray Stores the tokens
 * @return number of lines read
 * @throws IOException if reading from {@code tin} fails
 */
public static int readRowVarCol(int numLabels, BufferedReader tin,
        String tagDelimit, String delimit, String impDelimit, int[] t,
        String[][] cArray) throws IOException {
    // The old interface always converts the input to lower case.
    final boolean alwaysLowerCase = true;
    return readRowVarCol(numLabels, tin, tagDelimit, delimit, impDelimit,
            t, cArray, alwaysLowerCase);
}
/**
* Reads a block of text ended by a blank line or the end of the file.
* The block contains lines of tokens with a label.
Expand All @@ -333,19 +370,21 @@ protected static String[] getTokenList(String text, String delimit,
* @param impDelimit Delimiters to be retained for tagging
* @param t Stores the labels
* @param cArray Stores the tokens
* @param lowerCase lowercase tokens before processing
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: These need to be aligned.

* @return number of lines read
* @throws IOException
*/
public static int readRowVarCol(int numLabels, BufferedReader tin,
String tagDelimit, String delimit, String impDelimit, int[] t,
String[][] cArray) throws IOException {
String[][] cArray,boolean lowerCase) throws IOException {
int ptr = 0;
String line;
while(true) {
line = tin.readLine();
StringTokenizer firstSplit=null;
if (line!=null) {
firstSplit=new StringTokenizer(line.toLowerCase(),tagDelimit);
final String modifiedLine = lowerCase ? line.toLowerCase() : line;
firstSplit=new StringTokenizer(modifiedLine,tagDelimit);
}
if ((line==null) || (firstSplit.countTokens()<2)) {
// Empty Line
Expand All @@ -354,17 +393,23 @@ public static int readRowVarCol(int numLabels, BufferedReader tin,
String w = firstSplit.nextToken();
int label=Integer.parseInt(firstSplit.nextToken());
t[ptr] = label;
cArray[ptr++] = getTokenList(w,delimit,impDelimit);
cArray[ptr++] = getTokenList(w,delimit,impDelimit,lowerCase);
}
}

/**
 * Legacy overload kept for compatibility. It forwards every argument
 * unchanged to the variant taking a {@code lowerCase} flag and always
 * passes {@code true}, so the input is lower-cased.
 *
 * @param numLabels The maximal number of labels expected
 * @param tin reader the row is taken from
 * @param tagDelimit separator handed to the delegate for the first split of the line
 * @param delimit Used to define token boundaries
 * @param impDelimit Delimiters to be retained for tagging
 * @param t Stores the labels
 * @param cArray Stores the tokens
 * @param labels label array handed through to the delegate
 * @return whatever the delegate returns
 * @throws IOException if reading from {@code tin} fails
 */
static int readRowFixedCol(int numLabels, BufferedReader tin, String tagDelimit,
        String delimit, String impDelimit, int[] t, String[][] cArray, int labels[])
        throws IOException {
    // The old interface always converts the input to lower case.
    final boolean alwaysLowerCase = true;
    return readRowFixedCol(numLabels, tin, tagDelimit, delimit, impDelimit,
            t, cArray, labels, alwaysLowerCase);
}
static int readRowFixedCol(int numLabels, BufferedReader tin, String tagDelimit,
String delimit, String impDelimit, int[] t, String[][] cArray, int labels[])
String delimit, String impDelimit, int[] t, String[][] cArray, int labels[],boolean lowerCase)
throws IOException {
String line=tin.readLine();
if (line == null)
return 0;
StringTokenizer firstSplit=new StringTokenizer(line.toLowerCase(),tagDelimit,true);
final String modifiedLine = lowerCase ? line.toLowerCase() : line;
StringTokenizer firstSplit=new StringTokenizer(modifiedLine,tagDelimit,true);
int ptr = 0;
for (int i = 0; (i < labels.length) && firstSplit.hasMoreTokens(); i++) {
int label = labels[i];
Expand All @@ -378,7 +423,7 @@ static int readRowFixedCol(int numLabels, BufferedReader tin, String tagDelimit,
}
if ((label > 0) && (label <= numLabels)) {
t[ptr] = label;
cArray[ptr++] = getTokenList(w,delimit,impDelimit);
cArray[ptr++] = getTokenList(w,delimit,impDelimit,lowerCase);
}
}
return ptr;
Expand Down Expand Up @@ -416,10 +461,14 @@ protected static int[] readHeaderInfo(int numLabels, BufferedReader tin,

return labels;
}

/**
 * Legacy overload kept for compatibility. It forwards every argument
 * unchanged to the variant taking a {@code lowerCase} flag and always
 * passes {@code true}, so tokens are lower-cased.
 *
 * @param numLabels The maximal number of labels expected
 * @param tfile passed through to the delegate
 * @param rfile passed through to the delegate
 * @param delimit Used to define token boundaries
 * @param tagDelimit Separator between tokens and tag number
 * @param impDelimit Delimiters to be retained for tagging
 * @param labelMap passed through to the delegate
 * @return whatever the delegate returns
 */
public static TrainData readTagged(int numLabels, String tfile,
        String rfile, String delimit, String tagDelimit, String impDelimit,
        LabelMap labelMap) {
    // The old interface always converts the input to lower case.
    final boolean alwaysLowerCase = true;
    return readTagged(numLabels, tfile, rfile, delimit, tagDelimit,
            impDelimit, labelMap, alwaysLowerCase);
}
public static TrainData readTagged(int numLabels, String tfile,
String rfile, String delimit, String tagDelimit, String impDelimit,
LabelMap labelMap) {
LabelMap labelMap,boolean lowerCase) {
try {
ArrayList<DCTrainRecord> td = new ArrayList<DCTrainRecord>();
BufferedReader tin = new BufferedReader(new FileReader(tfile
Expand Down Expand Up @@ -447,10 +496,10 @@ public static TrainData readTagged(int numLabels, String tfile,
int ptr = 0;
if (fixedColFormat) {
ptr = readRowFixedCol(numLabels, tin, tagDelimit, delimit,
impDelimit, t, cArray, labels);
impDelimit, t, cArray, labels,lowerCase);
} else {
ptr = readRowVarCol(numLabels, tin, tagDelimit, delimit,
impDelimit, t, cArray);
impDelimit, t, cArray,lowerCase);
}
if (ptr == 0) {
break;
Expand All @@ -471,12 +520,13 @@ public static TrainData readTagged(int numLabels, String tfile,
return null;
}

public static void readRaw(Vector<String[]> data,String file,String delimit,String impDelimit) {
public static void readRaw(Vector<String[]> data,String file,String delimit,String impDelimit,boolean lowerCase) {
try {
BufferedReader rin=new BufferedReader(new FileReader(file+".raw"));
String line;
while((line=rin.readLine())!=null) {
StringTokenizer tok=new StringTokenizer(line.toLowerCase(),delimit,true);
// Tokenize the (optionally lower-cased) copy of the line so that the
// lowerCase flag actually takes effect.
final String modifiedLine = lowerCase ? line.toLowerCase() : line;
StringTokenizer tok=new StringTokenizer(modifiedLine,delimit,true);
String seq[]=new String[tok.countTokens()];
int count=0;
for(int i=0 ; i<seq.length ; i++) {
Expand Down
17 changes: 12 additions & 5 deletions src/iitb/Segment/Segment.java
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,9 @@ public class Segment {

CRF crfModel;
FeatureGenImpl featureGen;

boolean lowerCase = true; //set if tokens are lowercased

public FeatureGenerator featureGenerator() {return featureGen;}

public static void main(String argv[]) throws Exception {
Expand Down Expand Up @@ -139,6 +142,9 @@ public void processArgs() throws Exception {
if ((value = options.getProperty("modelGraph")) != null) {
modelGraphType = value;
}
if ((value = options.getProperty("lowercase")) != null) {
lowerCase = !("false".equals(value.toLowerCase()));
}
}
void allocModel() throws Exception {
// add any code related to dependency/consistency amongst paramter
Expand Down Expand Up @@ -250,7 +256,7 @@ public void train() throws Exception {
DataCruncher.createRaw(baseDir+"/data/"+inName+"/"+inName+".train",tagDelimit);
File dir=new File(baseDir+"/learntModels/"+outDir);
dir.mkdirs();
TrainData trainData = DataCruncher.readTagged(nlabels,baseDir+"/data/"+inName+"/"+inName+".train",baseDir+"/data/"+inName+"/"+inName+".train",delimit,tagDelimit,impDelimit,labelMap);
TrainData trainData = DataCruncher.readTagged(nlabels,baseDir+"/data/"+inName+"/"+inName+".train",baseDir+"/data/"+inName+"/"+inName+".train",delimit,tagDelimit,impDelimit,labelMap,lowerCase);
AlphaNumericPreprocessor.preprocess(trainData,nlabels);

allocModel();
Expand Down Expand Up @@ -278,7 +284,8 @@ public void test() throws Exception {
public void doTest() throws Exception {
File dir=new File(baseDir+"/out/"+outDir);
dir.mkdirs();
TestData testData = new TestData(baseDir+"/data/"+inName+"/"+inName+".test",delimit,impDelimit,groupDelimit);
TestData testData = new TestData(baseDir+"/data/"+inName+"/"+inName+".test",delimit,impDelimit,groupDelimit,
lowerCase);
TestDataWrite tdw = new TestDataWrite(baseDir+"/out/"+outDir+"/"+inName+".test",baseDir+"/data/"+inName+"/"+inName+".test",delimit,tagDelimit,impDelimit,labelMap);

String collect[] = new String[nlabels];
Expand Down Expand Up @@ -309,9 +316,9 @@ String arrayToString(Object[] ar) {
}
public void calc() throws Exception {
Vector<String[]> s = new Vector<String[]>();
TrainData tdMan = DataCruncher.readTagged(nlabels,baseDir+"/data/"+inName+"/"+inName+".test",baseDir+"/data/"+inName+"/"+inName+".test",delimit,tagDelimit,impDelimit,labelMap);
TrainData tdAuto = DataCruncher.readTagged(nlabels,baseDir+"/out/"+outDir+"/"+inName+".test",baseDir+"/data/"+inName+"/"+inName+".test",delimit,tagDelimit,impDelimit,labelMap);
DataCruncher.readRaw(s,baseDir+"/data/"+inName+"/"+inName+".test","","");
TrainData tdMan = DataCruncher.readTagged(nlabels,baseDir+"/data/"+inName+"/"+inName+".test",baseDir+"/data/"+inName+"/"+inName+".test",delimit,tagDelimit,impDelimit,labelMap,lowerCase);
TrainData tdAuto = DataCruncher.readTagged(nlabels,baseDir+"/out/"+outDir+"/"+inName+".test",baseDir+"/data/"+inName+"/"+inName+".test",delimit,tagDelimit,impDelimit,labelMap,lowerCase);
DataCruncher.readRaw(s,baseDir+"/data/"+inName+"/"+inName+".test","","",lowerCase);
int len=tdAuto.size();
int truePos[]=new int[nlabels+1];
int totalMarkedPos[]=new int[nlabels+1];
Expand Down
11 changes: 11 additions & 0 deletions test/iitb/Segment/DataCruncherGetTokenListTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -39,4 +39,15 @@ public void testGetTokenList() {
assertEquals(tokens[1], "goldfield");
assertEquals(tokens[4], "|3");
}
/**
 * Verifies that {@code getTokenList} keeps the original casing of the
 * tokens when it is called with {@code lowerCase == false}.
 */
@Test
public void testGetTokenListWithoutLowerCasing() {
    String tokenString = "West Goldfield Avenue, |3";
    String delimit = ",\t/ -():.;'?#`&\"_";
    String impDelimit = ",";
    String[] tokens = DataCruncher.getTokenList(tokenString, delimit, impDelimit, false);

    // assertEquals takes (expected, actual); this order keeps failure
    // messages accurate.
    assertEquals(5, tokens.length);
    assertEquals("Goldfield", tokens[1]);
    assertEquals("|3", tokens[4]);
}
}
50 changes: 49 additions & 1 deletion test/iitb/Segment/DataCruncherReadRowFixedColTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,29 @@ public void testReadRowFixedCol() {
e.printStackTrace();
}
}



/**
 * Verifies that {@code readRowFixedCol} keeps the original casing of the
 * tokens in the first data row when called with {@code lowerCase == false}.
 *
 * @throws IOException if reading the fixture fails; propagated so that the
 *         test fails instead of passing silently
 */
@Test
public void testReadRowFixedColWithoutDowncasing() throws IOException {
    int numLabels = 7;
    BufferedReader reader = new BufferedReader(new StringReader(tagged));
    String tagDelimit = "|";
    String delimit = ",\t/ -():.;'?#`&\"_";
    String impDelimit = ",";
    int[] t = new int[numLabels];
    String[][] cArray = new String[numLabels][0];

    int[] labels = DataCruncher.readHeaderInfo(numLabels, reader, tagDelimit);
    int ptr = DataCruncher.readRowFixedCol(numLabels, reader,
            tagDelimit, delimit, impDelimit, t, cArray, labels, false);

    // assertEquals takes (expected, actual); this order keeps failure
    // messages accurate.
    assertEquals(4, ptr);
    assertEquals("Road", cArray[0][2]);
    assertEquals("99603", cArray[3][0]);
}

@Test
public void testReadRowFixedColSecond() {
int numLabels = 7;
Expand All @@ -73,4 +95,30 @@ public void testReadRowFixedColSecond() {
e.printStackTrace();
}
}

/**
 * Verifies that a second call to {@code readRowFixedCol} with
 * {@code lowerCase == false} returns the next row with its original casing.
 *
 * @throws IOException if reading the fixture fails; propagated so that the
 *         test fails instead of passing silently
 */
@Test
public void testReadRowFixedColSecondWithoutDowncasing() throws IOException {
    int numLabels = 7;
    BufferedReader reader = new BufferedReader(new StringReader(tagged));
    String tagDelimit = "|";
    String delimit = ",\t/ -():.;'?#`&\"_";
    String impDelimit = ",";
    int[] t = new int[numLabels];
    String[][] cArray = new String[numLabels][0];

    int[] labels = DataCruncher.readHeaderInfo(numLabels, reader, tagDelimit);
    // The first call consumes the first data row; its result is not checked here.
    DataCruncher.readRowFixedCol(numLabels, reader,
            tagDelimit, delimit, impDelimit, t, cArray, labels, false);

    // Fresh buffers so the assertions below only see the second row.
    t = new int[numLabels];
    cArray = new String[numLabels][0];
    int ptr = DataCruncher.readRowFixedCol(numLabels, reader,
            tagDelimit, delimit, impDelimit, t, cArray, labels, false);

    // assertEquals takes (expected, actual); this order keeps failure
    // messages accurate.
    assertEquals(4, ptr);
    assertEquals("Center", cArray[0][2]);
    assertEquals("36201", cArray[3][0]);
}

}
50 changes: 48 additions & 2 deletions test/iitb/Segment/DataCruncherReadRowVarColTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,32 @@ public void testReadRowVarCol() {
e.printStackTrace();
}
}

@Test

/**
 * Verifies that {@code readRowVarCol} keeps the original casing of the
 * tokens in the first block of the fixture file when called with
 * {@code lowerCase == false}.
 *
 * @throws IOException if the fixture file is missing or cannot be read;
 *         propagated so that the test fails instead of passing silently
 */
@Test
public void testReadRowVarColWithoutDowncasing() throws IOException {
    String file = "testdata" + File.separator + "us50-short.tagged";
    BufferedReader tin = new BufferedReader(new FileReader(file));
    try {
        int numLabels = 7;
        int[] t = new int[7];
        String[][] cArray = new String[7][0];
        String tagDelimit = "|";
        String delimit = ",\t/ -():.;'?#`&\"_";
        String impDelimit = ",";
        int ptr = DataCruncher.readRowVarCol(numLabels, tin, tagDelimit,
                delimit, impDelimit, t, cArray, false);

        // assertEquals takes (expected, actual); this order keeps failure
        // messages accurate.
        assertEquals(4, ptr);
        assertEquals(7, t[3]);
        assertEquals(",", cArray[1][1]);
        assertEquals("Road", cArray[0][2]);
    } finally {
        // Release the file handle whether or not the assertions passed.
        tin.close();
    }
}


@Test
public void testReadRowVarColEof() {
String file = "testdata" + File.separator + "us50-short.tagged";
try {
Expand All @@ -69,4 +93,26 @@ public void testReadRowVarColEof() {
e.printStackTrace();
}
}
/**
 * Calls {@code readRowVarCol} twice on the fixture file with
 * {@code lowerCase == false} and checks the count returned by the second
 * call, which is expected to hit the end of the file.
 */
@Test
public void testReadRowVarColEofWithoutDowncasing() {
String file = "testdata" + File.separator + "us50-short.tagged";
try {
BufferedReader tin = new BufferedReader(new FileReader(file));
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: Four spaces.

int numLabels = 7;
int[] t = new int[7];
String[][] cArray = new String[7][0];
String tagDelimit = "|";
String delimit = ",\t/ -():.;'?#`&\"_";
String impDelimit = ",";
// First call: its result is overwritten below and never asserted.
int ptr = DataCruncher.readRowVarCol(numLabels, tin, tagDelimit, delimit,impDelimit,t,cArray,false);
//Will run into end of file
ptr = DataCruncher.readRowVarCol(numLabels, tin, tagDelimit, delimit,impDelimit,t,cArray,false);
assertEquals(ptr, 4);
} catch (FileNotFoundException e) {
// NOTE(review): the exception is only printed, so a missing fixture file lets this test pass.
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
}

}
Loading