nlp - Weka - Naive Bayes always gives borderline results -
i trying write text classifier in weka naive bayes. have collection of foursquare tips training data close 500 of them marked positive , approximately same marked negative in excel file. input file has 2 columns first 1 being tip text , second 1 marked polarity. using afinn-111.txt add attribute enhance output. calculates polar words in tip , gives final score of words. here entire code:
    public class datareader {      static map<string, integer> affinmap = new hashmap<string, integer>();      public list<list<object>> createattributelist() {         classloader classloader = getclass().getclassloader();         initializeaffinmap(classloader);         file inputworkbook = new file(classloader                 .getresource("tip_dataset2.xls").getfile());         workbook w;         sheet sheet = null;         try {             w = workbook.getworkbook(inputworkbook);             // first sheet             sheet = w.getsheet(0);         } catch (exception e) {             e.printstacktrace();         }         list<list<object>> attributelist = new arraylist<list<object>>();         (int = 1; < sheet.getrows(); i++) {             string tip = sheet.getcell(0, i).getcontents();              tip = tip.replaceall("'", "");             tip = tip.replaceall("\"", "");             tip = tip.replaceall("%", " percent");             tip = tip.replaceall("@", " atauthor");             string polarity = getpolarity(sheet.getcell(1, i).getcontents());             int affinscore = 0;             string[] arr = tip.split(" ");             (int j = 0; j < arr.length; j++) {                 if (affinmap.containskey(arr[j].tolowercase())) {                     affinscore = affinscore                             + affinmap.get(arr[j].tolowercase());                 }             }             list<object> attrs = new arraylist<object>();             attrs.add(tip);             attrs.add(affinscore);             attrs.add(polarity);              attributelist.add(attrs);         }         return attributelist;     }      private string getpolarity(string cell) {         if (cell.equalsignorecase("positive")) {             return "positive";         } else {             return "negative";         }     }      private void initializeaffinmap(classloader classloader) {         try {             inputstream stream = classloader                     .getresourceasstream("afinn-111.txt");             datainputstream in = new datainputstream(stream);             bufferedreader br = new bufferedreader(new inputstreamreader(in));             string str;             while ((str = br.readline()) != null) {                 string[] array = str.split("\t");                 affinmap.put(array[0], integer.parseint(array[1]));             }             in.close();         } catch (exception e) {             e.printstacktrace();         }     }      public static void main(string[] args) throws exception {         list<list<object>> attrlist=new datareader().createattributelist();         new createtrainedmodel().createtrainingdata(attrlist);     }  } here actual classifier class:
public class createtrainedmodel {      public void createtrainingdata(list<list<object>> attrlist)             throws exception {          attribute tip = new attribute("tip", (fastvector) null);         attribute affin = new attribute("affinscore");          fastvector pol = new fastvector(2);         pol.addelement("positive");         pol.addelement("negative");         attribute polaritycl = new attribute("polarity", pol);          fastvector inputdatadesc = new fastvector(3);         inputdatadesc.addelement(tip);         inputdatadesc.addelement(affin);         inputdatadesc.addelement(polaritycl);          instances dataset = new instances("dataset", inputdatadesc,                 attrlist.size());         // set class index         dataset.setclassindex(2);          (list<object> onlist : attrlist) {             instance in = new instance(3);             in.setvalue((attribute) inputdatadesc.elementat(0), onlist.get(0)                     .tostring());             in.setvalue((attribute) inputdatadesc.elementat(1),                     integer.parseint(onlist.get(1).tostring()));             in.setvalue((attribute) inputdatadesc.elementat(2), onlist.get(2)                     .tostring());              dataset.add(in);         }          filter f = new stringtowordvector();         f.setinputformat(dataset);         dataset = filter.usefilter(dataset, f);          classifier model = (classifier) new naivebayes();         try {             model.buildclassifier(dataset);         } catch (exception e1) { // todo auto-generated catch block             e1.printstacktrace();         }          objectoutputstream oos = new objectoutputstream(new fileoutputstream(                 "fs-tipsnaivebayes.model"));         oos.writeobject(model);         oos.flush();         oos.close();          fastvector fvwekaattributes1 = new fastvector(3);         fvwekaattributes1.addelement(tip);         fvwekaattributes1.addelement(affin);          instance in = new instance(3);         in.setvalue((attribute) fvwekaattributes1.elementat(0),                 "burger here good");         in.setvalue((attribute) fvwekaattributes1.elementat(1), 0);          instances testset = new instances("dataset", fvwekaattributes1, 1);         in.setdataset(testset);          double[] fdistribution = model.distributionforinstance(in);         system.out.println(fdistribution);      }  } the problem facing input output distribution in range of [0.52314376998377, 0.47685623001622995]. , more towards positive negative. these figures not change drastically. idea wrong doing? 
i didn't read code, 1 thing can affin score normalized between range. if output more towards positive range need change classification cost function, because overfitting data.
Comments
Post a Comment