modify reddit classifier test

This commit is contained in:
DOHA 2015-04-21 19:33:16 +02:00
parent 91ea275b5b
commit dc1bcbd06a
7 changed files with 12419 additions and 8126 deletions

View File

@ -86,7 +86,7 @@ public class WebConfig extends WebMvcConfigurerAdapter {
@Bean
public RedditClassifier redditClassifier() throws IOException {
final Resource file = new ClassPathResource("train.csv");
final Resource file = new ClassPathResource("data.csv");
final RedditClassifier redditClassifier = new RedditClassifier();
redditClassifier.trainClassifier(file.getFile().getAbsolutePath());
return redditClassifier;

View File

@ -26,12 +26,12 @@ public class RedditClassifier {
public static int GOOD = 0;
public static int BAD = 1;
public static int MIN_SCORE = 10;
public static int NUM_OF_FEATURES = 1000;
private final AdaptiveLogisticRegression classifier;
private final FeatureVectorEncoder titleEncoder;
private final FeatureVectorEncoder domainEncoder;
private CrossFoldLearner learner;
private final int noOfFeatures;
private double accuracy;
private final int[] trainCount = { 0, 0 };
@ -41,7 +41,8 @@ public class RedditClassifier {
private final int[] correctCount = { 0, 0 };
public RedditClassifier() {
classifier = new AdaptiveLogisticRegression(2, NUM_OF_FEATURES, new L2());
noOfFeatures = 1000;
classifier = new AdaptiveLogisticRegression(2, 1000, new L2());
classifier.setPoolSize(150);
titleEncoder = new AdaptiveWordValueEncoder("title");
titleEncoder.setProbes(2);
@ -49,11 +50,22 @@ public class RedditClassifier {
domainEncoder.setProbes(1);
}
public RedditClassifier(int poolSize, int noOfFeatures) {
this.noOfFeatures = noOfFeatures;
classifier = new AdaptiveLogisticRegression(2, noOfFeatures, new L2());
classifier.setPoolSize(poolSize);
titleEncoder = new AdaptiveWordValueEncoder("title");
titleEncoder.setProbes(1);
domainEncoder = new StaticWordValueEncoder("domain");
domainEncoder.setProbes(1);
}
public void trainClassifier(String fileName) throws IOException {
final List<NamedVector> vectors = extractVectors(readDataFile(fileName));
final int noOfTraining = (int) (RedditDataCollector.DATA_SIZE * 0.8);
final int size = vectors.size();
final int noOfTraining = (int) (size * 0.8);
final List<NamedVector> trainingData = vectors.subList(0, noOfTraining);
final List<NamedVector> testData = vectors.subList(noOfTraining, RedditDataCollector.DATA_SIZE);
final List<NamedVector> testData = vectors.subList(noOfTraining, size);
int category;
for (final NamedVector vector : trainingData) {
category = (vector.getName() == "GOOD") ? GOOD : BAD;
@ -61,11 +73,12 @@ public class RedditClassifier {
trainCount[category]++;
}
System.out.println("Training count ========= Good = " + trainCount[0] + " ___ Bad = " + trainCount[1]);
System.out.println("----------------------------------------------------------------- \n");
evaluateClassifier(testData);
}
public Vector convertPost(String title, String domain, int hour) {
final Vector vector = new RandomAccessSparseVector(NUM_OF_FEATURES);
final Vector vector = new RandomAccessSparseVector(noOfFeatures);
final List<String> words = Splitter.onPattern("\\W").omitEmptyStrings().splitToList(title);
vector.set(0, hour);
vector.set(1, words.size());
@ -105,10 +118,10 @@ public class RedditClassifier {
wrong++;
}
}
System.out.println("Eval count ========= Good = " + evalCount[0] + " ___ Bad = " + evalCount[1]);
System.out.println("Test result ======== Correct prediction = " + correct + " ----- Wrong prediction = " + wrong);
System.out.println("Test result ======== Correct Good = " + correctCount[0] + " ----- Correct Bad = " + correctCount[1]);
System.out.println("Test result ======== Good accuracy = " + (correctCount[0] / (evalCount[0] + 0.0)) + " ----- Bad accuracy = " + (correctCount[1] / (evalCount[1] + 0.0)));
System.out.println("Eval count =================== Good = " + evalCount[0] + " ----- Bad = " + evalCount[1] + "\n");
System.out.println("Overall Evaluation ============= Correct prediction = " + correct + " ----- Wrong prediction = " + wrong);
System.out.println("Correctly Evaluated =========== Correct Good = " + correctCount[0] + " ----- Correct Bad = " + correctCount[1]);
System.out.println("Correctly Evaluated (%) ======== Good accuracy = " + (correctCount[0] / (evalCount[0] + 0.0)) + " ----- Bad accuracy = " + (correctCount[1] / (evalCount[1] + 0.0)));
this.accuracy = correct / (wrong + correct + 0.0);
}
@ -133,7 +146,7 @@ public class RedditClassifier {
private NamedVector extractVector(String line) {
final String[] items = line.split(",");
final String category = extractCategory(Integer.parseInt(items[0]));
final NamedVector vector = new NamedVector(new RandomAccessSparseVector(NUM_OF_FEATURES), category);
final NamedVector vector = new NamedVector(new RandomAccessSparseVector(noOfFeatures), category);
final Calendar cal = Calendar.getInstance(TimeZone.getTimeZone("GMT"));
cal.setTimeInMillis(Long.parseLong(items[1]) * 1000);

View File

@ -16,8 +16,8 @@ import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
public class RedditDataCollector {
public static final String TRAINING_FILE = "src/main/resources/train.csv";
public static final int DATA_SIZE = 8000;
public static final String DATA_FILE = "src/main/resources/data.csv";
public static final int DATA_SIZE = 20000;
public static final int LIMIT = 100;
public static final Long YEAR = 31536000L;
private final Logger logger = LoggerFactory.getLogger(getClass());
@ -45,10 +45,11 @@ public class RedditDataCollector {
/**
 * Writes a fresh CSV data set to {@link #DATA_FILE}.
 *
 * <p>The file is (re)created, a header row is written, and {@code getPosts} is then invoked
 * {@code DATA_SIZE / LIMIT} times with the open writer. The round index is printed to standard
 * output after each call as a progress indicator.
 *
 * <p>The {@code timestamp} field is reset to the current time in epoch seconds before the first
 * round. NOTE(review): presumably {@code getPosts} uses it as the paging cursor; confirm there.
 *
 * @throws IOException if the file cannot be opened or written, or if a round fails
 */
public void collectData() throws IOException {
    final int noOfRounds = DATA_SIZE / LIMIT;
    timestamp = System.currentTimeMillis() / 1000;
    // try-with-resources closes the writer even when a round throws part-way through;
    // previously the file handle leaked on any exception.
    try (FileWriter writer = new FileWriter(DATA_FILE)) {
        // Header text is kept verbatim (spelling included) so new files match existing ones.
        writer.write("Score, Timestamp in utc, Number of wrods in title, Title, Domain \n");
        for (int i = 0; i < noOfRounds; i++) {
            getPosts(writer);
            System.out.println(i);
        }
    }
}

File diff suppressed because it is too large Load Diff

View File

@ -1,101 +0,0 @@
Score, Timestamp in utc, Number of wrods in title, Title, Domain
3,1357021066,7,Good Examples of Component dragging and dropping,self.java
0,1357017936,10,Game only works on mac need help porting to windows,self.java
2,1357008210,4,eclipse or keyboard issues,self.java
37,1356977564,6,The Long Strange Trip to Java,blinkenlights.com
5,1356970069,9,How to Send Email with Embedded Images Using Java,blog.smartbear.com
0,1356956937,4,What makes you architect,programming.freeblog.hu
0,1356900338,4,Apache Maven I of,javaxperiments.blogspot.com
0,1356896219,5,Custom functions per class instance,self.java
0,1356891056,5,JMeter Performance and Tuning Tips,ubik-ingenierie.com
12,1356888358,19,First fully functional GUI program about making acronyms easier to remember Was wondering if you guys had any tips,github.com
2,1356881034,12,Social Tech 101 Why do I love Java Developer Edition Part 1,socialtech101.blogspot.com
5,1356826782,7,Configurable deployment descriptors proposal for Java EE,java.net
31,1356793800,16,Finished my very first game in java Snake clone It s not much but it works,self.java
18,1356766107,10,la4j Linear Alebra for Java 0 3 0 is out,la4j.org
1,1356747219,6,RubyFlux a Ruby to Java compiler,github.com
15,1356735585,10,Simple JMS 2 0 Sample JMSDestinationDefinition and Synchronous Message Receive,blogs.oracle.com
9,1356717174,3,Java Use WebCam,self.java
4,1356711735,5,Compiler Optimisation for saving memory,self.java
4,1356662279,22,I m interested in your opinion about Java for Python Programmers by Brad Miller or a better alternative for a Java newbie,self.java
0,1356633508,4,A good android game,self.java
4,1356631759,12,a java library i saw mentioned here can t find pls help,self.java
1,1356627923,5,About learning Java a question,self.java
0,1356623761,3,Objects and java2d,self.java
0,1356593886,2,AffineTransform halp,self.java
43,1356584047,7,Java Was Strongly Influenced by Objective C,cs.gmu.edu
1,1356580543,7,Having trouble Setting Up Android Development Environment,self.java
0,1356560732,13,How can I fetch the first X links of reddit into a list,self.java
0,1356551788,4,JDK Download page error,self.java
9,1356536557,12,looking for a good book website to learn intermediate core java spring,self.java
7,1356487079,11,A popup menu like Filemaker s Any library have an implementation,self.java
1,1356455255,6,Just a Few Helpful Solr Functions,ignatyev-dev.blogspot.ru
13,1356433373,7,Bart s Blog Xtend the better compromise,bartnaudts.blogspot.de
4,1356410180,3,Beginner Question Here,self.java
19,1356283667,5,Nashorn JavaScript for the JVM,blogs.oracle.com
0,1356234086,5,Problem with Java memory use,self.java
0,1356195953,5,Learning Java in two weeks,self.java
0,1356127053,10,Twitter4J Download a Twitter Users Tweets to a Text File,github.com
20,1356118151,15,Using Apache Commons Functor functional interfaces with Java 8 lambdas cross post from r functionalprogramming,kinoshita.eti.br
13,1356102153,7,Date and Time in Java 8 Timezones,insightfullogic.com
10,1356088959,8,Implementing a collapsible ui repeat rows in JSF,kahimyang.info
8,1356034544,5,OmniFaces 1 3 is released,balusc.blogspot.com
1,1356027563,11,How to Configure a JNDI DataSource in the OpenShift Tomcat Cartridge,openshift.redhat.com
82,1356020780,7,Doomsday Sale IntelliJ 75 off today only,jetbrains.com
3,1355976320,3,IntelliJ Working Directory,self.java
0,1355966433,5,Help with java problem please,self.java
17,1355928745,12,What s new in Servlet 3 1 Java EE 7 moving forward,blogs.oracle.com
11,1355864485,5,Quick poll for research project,self.java
0,1355851994,5,Eclipse Text Problem Need Help,self.java
29,1355823193,4,Java 8 vs Xtend,blog.efftinge.de
2,1355805047,4,Learning Java between semesters,self.java
6,1355798488,11,I m a beginner programmer any tips on where to start,self.java
7,1355784039,9,Java Advent Calendar far sight look at JDK 8,javaadvent.com
2,1355782111,9,Technical Interview coming up Suggestions Pointers Words of Wisdom,self.java
0,1355775350,6,someone may help me out here,stackoverflow.com
2,1355765235,14,THC and a bit of Thunking Creative ways to deal with multiple return types,kingsfleet.blogspot.it
0,1355749586,12,Newbie here can you explain to me what class private stack is,self.java
0,1355748318,4,When StackOverflow Goes Bad,blogs.windward.net
0,1355721981,4,Java Graphics Projectile HELP,self.java
0,1355719622,12,Which one of the following statements about object oriented programming is false,self.java
16,1355707814,8,What s the skinny on JavaFX these days,self.java
2,1355685929,20,Can someone explain exactly what Apache Ant is How does it differ from just creating a jar file in blueJ,self.java
4,1355621071,7,Looking to add test code in Github,self.java
7,1355613608,6,Java Version of Jarvis Must Haves,self.java
5,1355599765,6,Java Advent Calendar Functional Java Collections,javaadvent.com
7,1355597483,13,I m working on a text based RPG and I have some questions,self.java
2,1355574445,6,Java EE 7 Community Survey Results,blog.eisele.net
0,1355576629,4,Evolution of Java Technology,compilr.org
18,1355574828,10,Are your Garbage Collection Logs speaking to you Censum does,blog.eisele.net
10,1355559380,13,What is the best GUI tool for creating a 2d platformer in Java,self.java
0,1355555357,7,Hit me with your best arrays tutorial,self.java
10,1355542403,11,Does any one know of clean 2d graphics library for java,self.java
23,1355511507,9,Dark Juno A Dark UI Theme for Eclipse 4,rogerdudler.github.com
0,1355504132,10,Java devs that work remote I have a few questions,self.java
0,1355501999,9,How do you make use of your Java knowledge,self.java
1,1355492027,5,How ClassLoader works in Java,javarevisited.blogspot.com.au
0,1355489352,9,Main difference between Abstract Class and Interface Compilr org,compilr.org
48,1355487006,8,Date and Time in Java 8 Part 1,insightfullogic.com
0,1355485766,3,Java JSON problem,self.java
10,1355448875,16,Open source applications large small worth looking at in Java I want to understand application structure,self.java
1,1355444452,4,lo mexor pz xxx,heavy-r.com
0,1355402889,11,JRebel Remoting to Push Changes to Your Toaster in The Cloud,zeroturnaround.com
0,1355402734,6,Are bugs part of technical debt,swreflections.blogspot.ca
2,1355400483,9,Compile and Run Java programs with Sublime Text 2,compilr.org
0,1355391115,4,console input like craftbukkit,self.java
7,1355390023,8,Hosting suggestions needed for a java web app,self.java
6,1355359227,17,Java novice here Have noticed funny performance differences across laptop and desktop Nvidia optimus related Details inside,self.java
1,1355327090,18,Please advice which java server technology should I choose for this new web app in my new work,self.java
0,1355326137,6,code to convert digits into words,compilr.org
34,1355319442,7,I want to learn REAL WORLD Java,self.java
5,1355285442,3,Hiring Java Developers,self.java
0,1355282335,14,Help How can I count the amount of a specific integer in an ArrayList,self.java
1,1355272303,24,I m taking a Java 1 final tomorrow I m fairly confident but I would appreciate any tips on things to look out for,self.java
38,1355267143,6,Will Java become the next COBOL,self.java
0,1355263047,2,Understanding recursion,imgur.com
1,1355257558,15,How can I clear the command prompt terminal with java and make it cross platform,self.java
2,1355253849,18,Is there a strategy for reducing code clutter when you are printing to the terminal alot Beginner Programmer,self.java
1,1355253049,5,BlockingQueues and multiple producer threads,self.java
1,1355241441,6,Beginner Struggling with classes Need help,self.java
0,1355238089,8,Simple Steps to Merge PDF files using Java,compilr.org
23,1355236940,8,Java and vs Python within a business context,self.java
1 Score Timestamp in utc Number of wrods in title Title Domain
2 3 1357021066 7 Good Examples of Component dragging and dropping self.java
3 0 1357017936 10 Game only works on mac need help porting to windows self.java
4 2 1357008210 4 eclipse or keyboard issues self.java
5 37 1356977564 6 The Long Strange Trip to Java blinkenlights.com
6 5 1356970069 9 How to Send Email with Embedded Images Using Java blog.smartbear.com
7 0 1356956937 4 What makes you architect programming.freeblog.hu
8 0 1356900338 4 Apache Maven I of javaxperiments.blogspot.com
9 0 1356896219 5 Custom functions per class instance self.java
10 0 1356891056 5 JMeter Performance and Tuning Tips ubik-ingenierie.com
11 12 1356888358 19 First fully functional GUI program about making acronyms easier to remember Was wondering if you guys had any tips github.com
12 2 1356881034 12 Social Tech 101 Why do I love Java Developer Edition Part 1 socialtech101.blogspot.com
13 5 1356826782 7 Configurable deployment descriptors proposal for Java EE java.net
14 31 1356793800 16 Finished my very first game in java Snake clone It s not much but it works self.java
15 18 1356766107 10 la4j Linear Alebra for Java 0 3 0 is out la4j.org
16 1 1356747219 6 RubyFlux a Ruby to Java compiler github.com
17 15 1356735585 10 Simple JMS 2 0 Sample JMSDestinationDefinition and Synchronous Message Receive blogs.oracle.com
18 9 1356717174 3 Java Use WebCam self.java
19 4 1356711735 5 Compiler Optimisation for saving memory self.java
20 4 1356662279 22 I m interested in your opinion about Java for Python Programmers by Brad Miller or a better alternative for a Java newbie self.java
21 0 1356633508 4 A good android game self.java
22 4 1356631759 12 a java library i saw mentioned here can t find pls help self.java
23 1 1356627923 5 About learning Java a question self.java
24 0 1356623761 3 Objects and java2d self.java
25 0 1356593886 2 AffineTransform halp self.java
26 43 1356584047 7 Java Was Strongly Influenced by Objective C cs.gmu.edu
27 1 1356580543 7 Having trouble Setting Up Android Development Environment self.java
28 0 1356560732 13 How can I fetch the first X links of reddit into a list self.java
29 0 1356551788 4 JDK Download page error self.java
30 9 1356536557 12 looking for a good book website to learn intermediate core java spring self.java
31 7 1356487079 11 A popup menu like Filemaker s Any library have an implementation self.java
32 1 1356455255 6 Just a Few Helpful Solr Functions ignatyev-dev.blogspot.ru
33 13 1356433373 7 Bart s Blog Xtend the better compromise bartnaudts.blogspot.de
34 4 1356410180 3 Beginner Question Here self.java
35 19 1356283667 5 Nashorn JavaScript for the JVM blogs.oracle.com
36 0 1356234086 5 Problem with Java memory use self.java
37 0 1356195953 5 Learning Java in two weeks self.java
38 0 1356127053 10 Twitter4J Download a Twitter Users Tweets to a Text File github.com
39 20 1356118151 15 Using Apache Commons Functor functional interfaces with Java 8 lambdas cross post from r functionalprogramming kinoshita.eti.br
40 13 1356102153 7 Date and Time in Java 8 Timezones insightfullogic.com
41 10 1356088959 8 Implementing a collapsible ui repeat rows in JSF kahimyang.info
42 8 1356034544 5 OmniFaces 1 3 is released balusc.blogspot.com
43 1 1356027563 11 How to Configure a JNDI DataSource in the OpenShift Tomcat Cartridge openshift.redhat.com
44 82 1356020780 7 Doomsday Sale IntelliJ 75 off today only jetbrains.com
45 3 1355976320 3 IntelliJ Working Directory self.java
46 0 1355966433 5 Help with java problem please self.java
47 17 1355928745 12 What s new in Servlet 3 1 Java EE 7 moving forward blogs.oracle.com
48 11 1355864485 5 Quick poll for research project self.java
49 0 1355851994 5 Eclipse Text Problem Need Help self.java
50 29 1355823193 4 Java 8 vs Xtend blog.efftinge.de
51 2 1355805047 4 Learning Java between semesters self.java
52 6 1355798488 11 I m a beginner programmer any tips on where to start self.java
53 7 1355784039 9 Java Advent Calendar far sight look at JDK 8 javaadvent.com
54 2 1355782111 9 Technical Interview coming up Suggestions Pointers Words of Wisdom self.java
55 0 1355775350 6 someone may help me out here stackoverflow.com
56 2 1355765235 14 THC and a bit of Thunking Creative ways to deal with multiple return types kingsfleet.blogspot.it
57 0 1355749586 12 Newbie here can you explain to me what class private stack is self.java
58 0 1355748318 4 When StackOverflow Goes Bad blogs.windward.net
59 0 1355721981 4 Java Graphics Projectile HELP self.java
60 0 1355719622 12 Which one of the following statements about object oriented programming is false self.java
61 16 1355707814 8 What s the skinny on JavaFX these days self.java
62 2 1355685929 20 Can someone explain exactly what Apache Ant is How does it differ from just creating a jar file in blueJ self.java
63 4 1355621071 7 Looking to add test code in Github self.java
64 7 1355613608 6 Java Version of Jarvis Must Haves self.java
65 5 1355599765 6 Java Advent Calendar Functional Java Collections javaadvent.com
66 7 1355597483 13 I m working on a text based RPG and I have some questions self.java
67 2 1355574445 6 Java EE 7 Community Survey Results blog.eisele.net
68 0 1355576629 4 Evolution of Java Technology compilr.org
69 18 1355574828 10 Are your Garbage Collection Logs speaking to you Censum does blog.eisele.net
70 10 1355559380 13 What is the best GUI tool for creating a 2d platformer in Java self.java
71 0 1355555357 7 Hit me with your best arrays tutorial self.java
72 10 1355542403 11 Does any one know of clean 2d graphics library for java self.java
73 23 1355511507 9 Dark Juno A Dark UI Theme for Eclipse 4 rogerdudler.github.com
74 0 1355504132 10 Java devs that work remote I have a few questions self.java
75 0 1355501999 9 How do you make use of your Java knowledge self.java
76 1 1355492027 5 How ClassLoader works in Java javarevisited.blogspot.com.au
77 0 1355489352 9 Main difference between Abstract Class and Interface Compilr org compilr.org
78 48 1355487006 8 Date and Time in Java 8 Part 1 insightfullogic.com
79 0 1355485766 3 Java JSON problem self.java
80 10 1355448875 16 Open source applications large small worth looking at in Java I want to understand application structure self.java
81 1 1355444452 4 lo mexor pz xxx heavy-r.com
82 0 1355402889 11 JRebel Remoting to Push Changes to Your Toaster in The Cloud zeroturnaround.com
83 0 1355402734 6 Are bugs part of technical debt swreflections.blogspot.ca
84 2 1355400483 9 Compile and Run Java programs with Sublime Text 2 compilr.org
85 0 1355391115 4 console input like craftbukkit self.java
86 7 1355390023 8 Hosting suggestions needed for a java web app self.java
87 6 1355359227 17 Java novice here Have noticed funny performance differences across laptop and desktop Nvidia optimus related Details inside self.java
88 1 1355327090 18 Please advice which java server technology should I choose for this new web app in my new work self.java
89 0 1355326137 6 code to convert digits into words compilr.org
90 34 1355319442 7 I want to learn REAL WORLD Java self.java
91 5 1355285442 3 Hiring Java Developers self.java
92 0 1355282335 14 Help How can I count the amount of a specific integer in an ArrayList self.java
93 1 1355272303 24 I m taking a Java 1 final tomorrow I m fairly confident but I would appreciate any tips on things to look out for self.java
94 38 1355267143 6 Will Java become the next COBOL self.java
95 0 1355263047 2 Understanding recursion imgur.com
96 1 1355257558 15 How can I clear the command prompt terminal with java and make it cross platform self.java
97 2 1355253849 18 Is there a strategy for reducing code clutter when you are printing to the terminal alot Beginner Programmer self.java
98 1 1355253049 5 BlockingQueues and multiple producer threads self.java
99 1 1355241441 6 Beginner Struggling with classes Need help self.java
100 0 1355238089 8 Simple Steps to Merge PDF files using Java compilr.org
101 23 1355236940 8 Java and vs Python within a business context self.java

File diff suppressed because it is too large Load Diff

View File

@ -6,25 +6,38 @@ import java.io.IOException;
import org.baeldung.reddit.classifier.RedditClassifier;
import org.baeldung.reddit.classifier.RedditDataCollector;
import org.junit.Before;
import org.junit.Test;
//@Ignore
public class RedditClassifierTest {
private RedditClassifier classifier;
@Before
public void init() throws IOException {
classifier = new RedditClassifier();
classifier.trainClassifier(RedditDataCollector.TRAINING_FILE);
@Test
public void whenUsingDefaultClassifier_thenAccurate() throws IOException {
final RedditClassifier classifier = new RedditClassifier();
classifier.trainClassifier(RedditDataCollector.DATA_FILE);
final double result = classifier.getAccuracy();
System.out.println("==== Default Classifier Accuracy = " + result);
System.out.println("+++++++++++++++++++++++++++++++++++++++++++++++\n\n\n");
assertTrue(result > 0.7);
}
@Test
public void testClassifier() throws IOException {
public void givenSmallerPoolSizeAndFeatures_whenUsingCustomClassifier_thenAccurate() throws IOException {
final RedditClassifier classifier = new RedditClassifier(100, 500);
classifier.trainClassifier(RedditDataCollector.DATA_FILE);
final double result = classifier.getAccuracy();
System.out.println("Accuracy = " + result);
assertTrue(result > 0.8);
System.out.println("==== Custom Classifier (small) Accuracy = " + result);
System.out.println("+++++++++++++++++++++++++++++++++++++++++++++++\n\n\n");
assertTrue(result < 0.7);
}
@Test
public void givenLargerPoolSizeAndFeatures_whenUsingCustomClassifier_thenAccurate() throws IOException {
final RedditClassifier classifier = new RedditClassifier(200, 2000);
classifier.trainClassifier(RedditDataCollector.DATA_FILE);
final double result = classifier.getAccuracy();
System.out.println("==== Custom Classifier (large) Accuracy = " + result);
System.out.println("+++++++++++++++++++++++++++++++++++++++++++++++\n\n\n");
assertTrue(result > 0.7);
}
}