nlp - Weka - Naive Bayes always gives borderline results -
I am trying to write a text classifier in Weka using Naive Bayes. I have a collection of Foursquare tips as training data, with close to 500 of them marked as positive and approximately the same number marked as negative, in an Excel file. The input file has two columns, the first one being the tip text and the second one the marked polarity. I am using AFINN-111.txt to add an attribute to enhance the output. It calculates all the polar words in the tip and gives a final score of all the words. Here is my entire code:
public class datareader { static map<string, integer> affinmap = new hashmap<string, integer>(); public list<list<object>> createattributelist() { classloader classloader = getclass().getclassloader(); initializeaffinmap(classloader); file inputworkbook = new file(classloader .getresource("tip_dataset2.xls").getfile()); workbook w; sheet sheet = null; try { w = workbook.getworkbook(inputworkbook); // first sheet sheet = w.getsheet(0); } catch (exception e) { e.printstacktrace(); } list<list<object>> attributelist = new arraylist<list<object>>(); (int = 1; < sheet.getrows(); i++) { string tip = sheet.getcell(0, i).getcontents(); tip = tip.replaceall("'", ""); tip = tip.replaceall("\"", ""); tip = tip.replaceall("%", " percent"); tip = tip.replaceall("@", " atauthor"); string polarity = getpolarity(sheet.getcell(1, i).getcontents()); int affinscore = 0; string[] arr = tip.split(" "); (int j = 0; j < arr.length; j++) { if (affinmap.containskey(arr[j].tolowercase())) { affinscore = affinscore + affinmap.get(arr[j].tolowercase()); } } list<object> attrs = new arraylist<object>(); attrs.add(tip); attrs.add(affinscore); attrs.add(polarity); attributelist.add(attrs); } return attributelist; } private string getpolarity(string cell) { if (cell.equalsignorecase("positive")) { return "positive"; } else { return "negative"; } } private void initializeaffinmap(classloader classloader) { try { inputstream stream = classloader .getresourceasstream("afinn-111.txt"); datainputstream in = new datainputstream(stream); bufferedreader br = new bufferedreader(new inputstreamreader(in)); string str; while ((str = br.readline()) != null) { string[] array = str.split("\t"); affinmap.put(array[0], integer.parseint(array[1])); } in.close(); } catch (exception e) { e.printstacktrace(); } } public static void main(string[] args) throws exception { list<list<object>> attrlist=new datareader().createattributelist(); new createtrainedmodel().createtrainingdata(attrlist); } }
Here is the actual classifier class:
public class createtrainedmodel { public void createtrainingdata(list<list<object>> attrlist) throws exception { attribute tip = new attribute("tip", (fastvector) null); attribute affin = new attribute("affinscore"); fastvector pol = new fastvector(2); pol.addelement("positive"); pol.addelement("negative"); attribute polaritycl = new attribute("polarity", pol); fastvector inputdatadesc = new fastvector(3); inputdatadesc.addelement(tip); inputdatadesc.addelement(affin); inputdatadesc.addelement(polaritycl); instances dataset = new instances("dataset", inputdatadesc, attrlist.size()); // set class index dataset.setclassindex(2); (list<object> onlist : attrlist) { instance in = new instance(3); in.setvalue((attribute) inputdatadesc.elementat(0), onlist.get(0) .tostring()); in.setvalue((attribute) inputdatadesc.elementat(1), integer.parseint(onlist.get(1).tostring())); in.setvalue((attribute) inputdatadesc.elementat(2), onlist.get(2) .tostring()); dataset.add(in); } filter f = new stringtowordvector(); f.setinputformat(dataset); dataset = filter.usefilter(dataset, f); classifier model = (classifier) new naivebayes(); try { model.buildclassifier(dataset); } catch (exception e1) { // todo auto-generated catch block e1.printstacktrace(); } objectoutputstream oos = new objectoutputstream(new fileoutputstream( "fs-tipsnaivebayes.model")); oos.writeobject(model); oos.flush(); oos.close(); fastvector fvwekaattributes1 = new fastvector(3); fvwekaattributes1.addelement(tip); fvwekaattributes1.addelement(affin); instance in = new instance(3); in.setvalue((attribute) fvwekaattributes1.elementat(0), "burger here good"); in.setvalue((attribute) fvwekaattributes1.elementat(1), 0); instances testset = new instances("dataset", fvwekaattributes1, 1); in.setdataset(testset); double[] fdistribution = model.distributionforinstance(in); system.out.println(fdistribution); } }
The problem I am facing is that, for any input, the output distribution is always in the range of [0.52314376998377, 0.47685623001622995], and it is always more towards positive than negative. These figures do not change drastically. Any idea what I am doing wrong?
I didn't read your code, but one thing I can say is that the AFINN score should be normalized to a fixed range. If your output leans more towards the positive range, then you need to change your classification cost function, because it is overfitting your data.
Comments
Post a Comment