Merge pull request #199 from Doha2012/master
modify reddit classifier test
This commit is contained in:
commit
1175f22a99
|
@ -86,7 +86,7 @@ public class WebConfig extends WebMvcConfigurerAdapter {
|
|||
|
||||
@Bean
|
||||
public RedditClassifier redditClassifier() throws IOException {
|
||||
final Resource file = new ClassPathResource("train.csv");
|
||||
final Resource file = new ClassPathResource("data.csv");
|
||||
final RedditClassifier redditClassifier = new RedditClassifier();
|
||||
redditClassifier.trainClassifier(file.getFile().getAbsolutePath());
|
||||
return redditClassifier;
|
||||
|
|
|
@ -26,12 +26,12 @@ public class RedditClassifier {
|
|||
public static int GOOD = 0;
|
||||
public static int BAD = 1;
|
||||
public static int MIN_SCORE = 10;
|
||||
public static int NUM_OF_FEATURES = 1000;
|
||||
|
||||
private final AdaptiveLogisticRegression classifier;
|
||||
private final FeatureVectorEncoder titleEncoder;
|
||||
private final FeatureVectorEncoder domainEncoder;
|
||||
private CrossFoldLearner learner;
|
||||
private final int noOfFeatures;
|
||||
private double accuracy;
|
||||
|
||||
private final int[] trainCount = { 0, 0 };
|
||||
|
@ -41,7 +41,8 @@ public class RedditClassifier {
|
|||
private final int[] correctCount = { 0, 0 };
|
||||
|
||||
public RedditClassifier() {
|
||||
classifier = new AdaptiveLogisticRegression(2, NUM_OF_FEATURES, new L2());
|
||||
noOfFeatures = 1000;
|
||||
classifier = new AdaptiveLogisticRegression(2, 1000, new L2());
|
||||
classifier.setPoolSize(150);
|
||||
titleEncoder = new AdaptiveWordValueEncoder("title");
|
||||
titleEncoder.setProbes(2);
|
||||
|
@ -49,11 +50,22 @@ public class RedditClassifier {
|
|||
domainEncoder.setProbes(1);
|
||||
}
|
||||
|
||||
/**
 * Creates a classifier with a caller-supplied pool size and feature-vector size.
 *
 * Stores noOfFeatures on the instance, builds the AdaptiveLogisticRegression with
 * that feature count and an L2 prior, applies poolSize to it, and creates the
 * "title" (adaptive) and "domain" (static) word encoders with one probe each.
 *
 * NOTE(review): the literal 2 passed to AdaptiveLogisticRegression is presumably the
 * number of target categories (GOOD / BAD) - confirm against the Mahout API.
 * NOTE(review): the no-arg constructor calls titleEncoder.setProbes(2) while this one
 * uses 1; confirm the difference is intentional.
 *
 * @param poolSize     value forwarded to classifier.setPoolSize
 * @param noOfFeatures size used for the learner and kept in this.noOfFeatures
 */
public RedditClassifier(int poolSize, int noOfFeatures) {
|
||||
this.noOfFeatures = noOfFeatures;
|
||||
classifier = new AdaptiveLogisticRegression(2, noOfFeatures, new L2());
|
||||
classifier.setPoolSize(poolSize);
|
||||
titleEncoder = new AdaptiveWordValueEncoder("title");
|
||||
titleEncoder.setProbes(1);
|
||||
domainEncoder = new StaticWordValueEncoder("domain");
|
||||
domainEncoder.setProbes(1);
|
||||
}
|
||||
|
||||
public void trainClassifier(String fileName) throws IOException {
|
||||
final List<NamedVector> vectors = extractVectors(readDataFile(fileName));
|
||||
final int noOfTraining = (int) (RedditDataCollector.DATA_SIZE * 0.8);
|
||||
final int size = vectors.size();
|
||||
final int noOfTraining = (int) (size * 0.8);
|
||||
final List<NamedVector> trainingData = vectors.subList(0, noOfTraining);
|
||||
final List<NamedVector> testData = vectors.subList(noOfTraining, RedditDataCollector.DATA_SIZE);
|
||||
final List<NamedVector> testData = vectors.subList(noOfTraining, size);
|
||||
int category;
|
||||
for (final NamedVector vector : trainingData) {
|
||||
category = (vector.getName() == "GOOD") ? GOOD : BAD;
|
||||
|
@ -61,11 +73,12 @@ public class RedditClassifier {
|
|||
trainCount[category]++;
|
||||
}
|
||||
System.out.println("Training count ========= Good = " + trainCount[0] + " ___ Bad = " + trainCount[1]);
|
||||
System.out.println("----------------------------------------------------------------- \n");
|
||||
evaluateClassifier(testData);
|
||||
}
|
||||
|
||||
public Vector convertPost(String title, String domain, int hour) {
|
||||
final Vector vector = new RandomAccessSparseVector(NUM_OF_FEATURES);
|
||||
final Vector vector = new RandomAccessSparseVector(noOfFeatures);
|
||||
final List<String> words = Splitter.onPattern("\\W").omitEmptyStrings().splitToList(title);
|
||||
vector.set(0, hour);
|
||||
vector.set(1, words.size());
|
||||
|
@ -105,10 +118,10 @@ public class RedditClassifier {
|
|||
wrong++;
|
||||
}
|
||||
}
|
||||
System.out.println("Eval count ========= Good = " + evalCount[0] + " ___ Bad = " + evalCount[1]);
|
||||
System.out.println("Test result ======== Correct prediction = " + correct + " ----- Wrong prediction = " + wrong);
|
||||
System.out.println("Test result ======== Correct Good = " + correctCount[0] + " ----- Correct Bad = " + correctCount[1]);
|
||||
System.out.println("Test result ======== Good accuracy = " + (correctCount[0] / (evalCount[0] + 0.0)) + " ----- Bad accuracy = " + (correctCount[1] / (evalCount[1] + 0.0)));
|
||||
System.out.println("Eval count =================== Good = " + evalCount[0] + " ----- Bad = " + evalCount[1] + "\n");
|
||||
System.out.println("Overall Evaluation ============= Correct prediction = " + correct + " ----- Wrong prediction = " + wrong);
|
||||
System.out.println("Correctly Evaluated =========== Correct Good = " + correctCount[0] + " ----- Correct Bad = " + correctCount[1]);
|
||||
System.out.println("Correctly Evaluated (%) ======== Good accuracy = " + (correctCount[0] / (evalCount[0] + 0.0)) + " ----- Bad accuracy = " + (correctCount[1] / (evalCount[1] + 0.0)));
|
||||
this.accuracy = correct / (wrong + correct + 0.0);
|
||||
}
|
||||
|
||||
|
@ -133,7 +146,7 @@ public class RedditClassifier {
|
|||
private NamedVector extractVector(String line) {
|
||||
final String[] items = line.split(",");
|
||||
final String category = extractCategory(Integer.parseInt(items[0]));
|
||||
final NamedVector vector = new NamedVector(new RandomAccessSparseVector(NUM_OF_FEATURES), category);
|
||||
final NamedVector vector = new NamedVector(new RandomAccessSparseVector(noOfFeatures), category);
|
||||
final Calendar cal = Calendar.getInstance(TimeZone.getTimeZone("GMT"));
|
||||
cal.setTimeInMillis(Long.parseLong(items[1]) * 1000);
|
||||
|
||||
|
|
|
@ -16,8 +16,8 @@ import com.google.common.base.Joiner;
|
|||
import com.google.common.base.Splitter;
|
||||
|
||||
public class RedditDataCollector {
|
||||
public static final String TRAINING_FILE = "src/main/resources/train.csv";
|
||||
public static final int DATA_SIZE = 8000;
|
||||
public static final String DATA_FILE = "src/main/resources/data.csv";
|
||||
public static final int DATA_SIZE = 20000;
|
||||
public static final int LIMIT = 100;
|
||||
public static final Long YEAR = 31536000L;
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
@ -45,10 +45,11 @@ public class RedditDataCollector {
|
|||
public void collectData() throws IOException {
|
||||
final int noOfRounds = DATA_SIZE / LIMIT;
|
||||
timestamp = System.currentTimeMillis() / 1000;
|
||||
final FileWriter writer = new FileWriter(TRAINING_FILE);
|
||||
final FileWriter writer = new FileWriter(DATA_FILE);
|
||||
writer.write("Score, Timestamp in utc, Number of wrods in title, Title, Domain \n");
|
||||
for (int i = 0; i < noOfRounds; i++) {
|
||||
getPosts(writer);
|
||||
System.out.println(i);
|
||||
}
|
||||
writer.close();
|
||||
}
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -1,101 +0,0 @@
|
|||
Score, Timestamp in utc, Number of wrods in title, Title, Domain
|
||||
3,1357021066,7,Good Examples of Component dragging and dropping,self.java
|
||||
0,1357017936,10,Game only works on mac need help porting to windows,self.java
|
||||
2,1357008210,4,eclipse or keyboard issues,self.java
|
||||
37,1356977564,6,The Long Strange Trip to Java,blinkenlights.com
|
||||
5,1356970069,9,How to Send Email with Embedded Images Using Java,blog.smartbear.com
|
||||
0,1356956937,4,What makes you architect,programming.freeblog.hu
|
||||
0,1356900338,4,Apache Maven I of,javaxperiments.blogspot.com
|
||||
0,1356896219,5,Custom functions per class instance,self.java
|
||||
0,1356891056,5,JMeter Performance and Tuning Tips,ubik-ingenierie.com
|
||||
12,1356888358,19,First fully functional GUI program about making acronyms easier to remember Was wondering if you guys had any tips,github.com
|
||||
2,1356881034,12,Social Tech 101 Why do I love Java Developer Edition Part 1,socialtech101.blogspot.com
|
||||
5,1356826782,7,Configurable deployment descriptors proposal for Java EE,java.net
|
||||
31,1356793800,16,Finished my very first game in java Snake clone It s not much but it works,self.java
|
||||
18,1356766107,10,la4j Linear Alebra for Java 0 3 0 is out,la4j.org
|
||||
1,1356747219,6,RubyFlux a Ruby to Java compiler,github.com
|
||||
15,1356735585,10,Simple JMS 2 0 Sample JMSDestinationDefinition and Synchronous Message Receive,blogs.oracle.com
|
||||
9,1356717174,3,Java Use WebCam,self.java
|
||||
4,1356711735,5,Compiler Optimisation for saving memory,self.java
|
||||
4,1356662279,22,I m interested in your opinion about Java for Python Programmers by Brad Miller or a better alternative for a Java newbie,self.java
|
||||
0,1356633508,4,A good android game,self.java
|
||||
4,1356631759,12,a java library i saw mentioned here can t find pls help,self.java
|
||||
1,1356627923,5,About learning Java a question,self.java
|
||||
0,1356623761,3,Objects and java2d,self.java
|
||||
0,1356593886,2,AffineTransform halp,self.java
|
||||
43,1356584047,7,Java Was Strongly Influenced by Objective C,cs.gmu.edu
|
||||
1,1356580543,7,Having trouble Setting Up Android Development Environment,self.java
|
||||
0,1356560732,13,How can I fetch the first X links of reddit into a list,self.java
|
||||
0,1356551788,4,JDK Download page error,self.java
|
||||
9,1356536557,12,looking for a good book website to learn intermediate core java spring,self.java
|
||||
7,1356487079,11,A popup menu like Filemaker s Any library have an implementation,self.java
|
||||
1,1356455255,6,Just a Few Helpful Solr Functions,ignatyev-dev.blogspot.ru
|
||||
13,1356433373,7,Bart s Blog Xtend the better compromise,bartnaudts.blogspot.de
|
||||
4,1356410180,3,Beginner Question Here,self.java
|
||||
19,1356283667,5,Nashorn JavaScript for the JVM,blogs.oracle.com
|
||||
0,1356234086,5,Problem with Java memory use,self.java
|
||||
0,1356195953,5,Learning Java in two weeks,self.java
|
||||
0,1356127053,10,Twitter4J Download a Twitter Users Tweets to a Text File,github.com
|
||||
20,1356118151,15,Using Apache Commons Functor functional interfaces with Java 8 lambdas cross post from r functionalprogramming,kinoshita.eti.br
|
||||
13,1356102153,7,Date and Time in Java 8 Timezones,insightfullogic.com
|
||||
10,1356088959,8,Implementing a collapsible ui repeat rows in JSF,kahimyang.info
|
||||
8,1356034544,5,OmniFaces 1 3 is released,balusc.blogspot.com
|
||||
1,1356027563,11,How to Configure a JNDI DataSource in the OpenShift Tomcat Cartridge,openshift.redhat.com
|
||||
82,1356020780,7,Doomsday Sale IntelliJ 75 off today only,jetbrains.com
|
||||
3,1355976320,3,IntelliJ Working Directory,self.java
|
||||
0,1355966433,5,Help with java problem please,self.java
|
||||
17,1355928745,12,What s new in Servlet 3 1 Java EE 7 moving forward,blogs.oracle.com
|
||||
11,1355864485,5,Quick poll for research project,self.java
|
||||
0,1355851994,5,Eclipse Text Problem Need Help,self.java
|
||||
29,1355823193,4,Java 8 vs Xtend,blog.efftinge.de
|
||||
2,1355805047,4,Learning Java between semesters,self.java
|
||||
6,1355798488,11,I m a beginner programmer any tips on where to start,self.java
|
||||
7,1355784039,9,Java Advent Calendar far sight look at JDK 8,javaadvent.com
|
||||
2,1355782111,9,Technical Interview coming up Suggestions Pointers Words of Wisdom,self.java
|
||||
0,1355775350,6,someone may help me out here,stackoverflow.com
|
||||
2,1355765235,14,THC and a bit of Thunking Creative ways to deal with multiple return types,kingsfleet.blogspot.it
|
||||
0,1355749586,12,Newbie here can you explain to me what class private stack is,self.java
|
||||
0,1355748318,4,When StackOverflow Goes Bad,blogs.windward.net
|
||||
0,1355721981,4,Java Graphics Projectile HELP,self.java
|
||||
0,1355719622,12,Which one of the following statements about object oriented programming is false,self.java
|
||||
16,1355707814,8,What s the skinny on JavaFX these days,self.java
|
||||
2,1355685929,20,Can someone explain exactly what Apache Ant is How does it differ from just creating a jar file in blueJ,self.java
|
||||
4,1355621071,7,Looking to add test code in Github,self.java
|
||||
7,1355613608,6,Java Version of Jarvis Must Haves,self.java
|
||||
5,1355599765,6,Java Advent Calendar Functional Java Collections,javaadvent.com
|
||||
7,1355597483,13,I m working on a text based RPG and I have some questions,self.java
|
||||
2,1355574445,6,Java EE 7 Community Survey Results,blog.eisele.net
|
||||
0,1355576629,4,Evolution of Java Technology,compilr.org
|
||||
18,1355574828,10,Are your Garbage Collection Logs speaking to you Censum does,blog.eisele.net
|
||||
10,1355559380,13,What is the best GUI tool for creating a 2d platformer in Java,self.java
|
||||
0,1355555357,7,Hit me with your best arrays tutorial,self.java
|
||||
10,1355542403,11,Does any one know of clean 2d graphics library for java,self.java
|
||||
23,1355511507,9,Dark Juno A Dark UI Theme for Eclipse 4,rogerdudler.github.com
|
||||
0,1355504132,10,Java devs that work remote I have a few questions,self.java
|
||||
0,1355501999,9,How do you make use of your Java knowledge,self.java
|
||||
1,1355492027,5,How ClassLoader works in Java,javarevisited.blogspot.com.au
|
||||
0,1355489352,9,Main difference between Abstract Class and Interface Compilr org,compilr.org
|
||||
48,1355487006,8,Date and Time in Java 8 Part 1,insightfullogic.com
|
||||
0,1355485766,3,Java JSON problem,self.java
|
||||
10,1355448875,16,Open source applications large small worth looking at in Java I want to understand application structure,self.java
|
||||
1,1355444452,4,lo mexor pz xxx,heavy-r.com
|
||||
0,1355402889,11,JRebel Remoting to Push Changes to Your Toaster in The Cloud,zeroturnaround.com
|
||||
0,1355402734,6,Are bugs part of technical debt,swreflections.blogspot.ca
|
||||
2,1355400483,9,Compile and Run Java programs with Sublime Text 2,compilr.org
|
||||
0,1355391115,4,console input like craftbukkit,self.java
|
||||
7,1355390023,8,Hosting suggestions needed for a java web app,self.java
|
||||
6,1355359227,17,Java novice here Have noticed funny performance differences across laptop and desktop Nvidia optimus related Details inside,self.java
|
||||
1,1355327090,18,Please advice which java server technology should I choose for this new web app in my new work,self.java
|
||||
0,1355326137,6,code to convert digits into words,compilr.org
|
||||
34,1355319442,7,I want to learn REAL WORLD Java,self.java
|
||||
5,1355285442,3,Hiring Java Developers,self.java
|
||||
0,1355282335,14,Help How can I count the amount of a specific integer in an ArrayList,self.java
|
||||
1,1355272303,24,I m taking a Java 1 final tomorrow I m fairly confident but I would appreciate any tips on things to look out for,self.java
|
||||
38,1355267143,6,Will Java become the next COBOL,self.java
|
||||
0,1355263047,2,Understanding recursion,imgur.com
|
||||
1,1355257558,15,How can I clear the command prompt terminal with java and make it cross platform,self.java
|
||||
2,1355253849,18,Is there a strategy for reducing code clutter when you are printing to the terminal alot Beginner Programmer,self.java
|
||||
1,1355253049,5,BlockingQueues and multiple producer threads,self.java
|
||||
1,1355241441,6,Beginner Struggling with classes Need help,self.java
|
||||
0,1355238089,8,Simple Steps to Merge PDF files using Java,compilr.org
|
||||
23,1355236940,8,Java and vs Python within a business context,self.java
|
|
File diff suppressed because it is too large
Load Diff
|
@ -6,25 +6,38 @@ import java.io.IOException;
|
|||
|
||||
import org.baeldung.reddit.classifier.RedditClassifier;
|
||||
import org.baeldung.reddit.classifier.RedditDataCollector;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
|
||||
//@Ignore
|
||||
public class RedditClassifierTest {
|
||||
|
||||
private RedditClassifier classifier;
|
||||
|
||||
@Before
|
||||
public void init() throws IOException {
|
||||
classifier = new RedditClassifier();
|
||||
classifier.trainClassifier(RedditDataCollector.TRAINING_FILE);
|
||||
/*
 * Trains a classifier created with the no-arg constructor on
 * RedditDataCollector.DATA_FILE, prints the value returned by getAccuracy(),
 * and requires it to be greater than 0.7.
 *
 * NOTE(review): the test reads the data file from disk and the threshold is an
 * empirical cut-off; presumably the result can vary between runs - confirm it is stable.
 */
@Test
|
||||
public void whenUsingDefaultClassifier_thenAccurate() throws IOException {
|
||||
final RedditClassifier classifier = new RedditClassifier();
|
||||
classifier.trainClassifier(RedditDataCollector.DATA_FILE);
|
||||
final double result = classifier.getAccuracy();
|
||||
System.out.println("==== Default Classifier Accuracy = " + result);
|
||||
System.out.println("+++++++++++++++++++++++++++++++++++++++++++++++\n\n\n");
|
||||
assertTrue(result > 0.7);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testClassifier() throws IOException {
|
||||
public void givenSmallerPoolSizeAndFeatures_whenUsingCustomClassifier_thenAccurate() throws IOException {
|
||||
final RedditClassifier classifier = new RedditClassifier(100, 500);
|
||||
classifier.trainClassifier(RedditDataCollector.DATA_FILE);
|
||||
final double result = classifier.getAccuracy();
|
||||
System.out.println("Accuracy = " + result);
|
||||
assertTrue(result > 0.8);
|
||||
System.out.println("==== Custom Classifier (small) Accuracy = " + result);
|
||||
System.out.println("+++++++++++++++++++++++++++++++++++++++++++++++\n\n\n");
|
||||
assertTrue(result < 0.7);
|
||||
}
|
||||
|
||||
/*
 * Trains a classifier built with the two-argument constructor (pool size 200,
 * 2000 features) on RedditDataCollector.DATA_FILE, prints the value returned by
 * getAccuracy(), and requires it to be greater than 0.7.
 *
 * NOTE(review): the threshold is an empirical cut-off; presumably the result can
 * vary between runs - confirm it is stable before relying on this in CI.
 */
@Test
|
||||
public void givenLargerPoolSizeAndFeatures_whenUsingCustomClassifier_thenAccurate() throws IOException {
|
||||
final RedditClassifier classifier = new RedditClassifier(200, 2000);
|
||||
classifier.trainClassifier(RedditDataCollector.DATA_FILE);
|
||||
final double result = classifier.getAccuracy();
|
||||
System.out.println("==== Custom Classifier (large) Accuracy = " + result);
|
||||
System.out.println("+++++++++++++++++++++++++++++++++++++++++++++++\n\n\n");
|
||||
assertTrue(result > 0.7);
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue