package aima.core.learning.neural;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import aima.core.learning.data.DataResource;
import aima.core.learning.framework.DataSet;
import aima.core.learning.framework.Example;
import aima.core.util.Util;
import aima.core.util.datastructure.Pair;

/**
 * @author Ravi Mohan
 * 
 */
public abstract class NNDataSet {
	/*
	 * This class represents a source of examples to the rest of the nn
	 * framework. Assumes only one function approximator works on an instance
	 * at a given point in time.
	 */
	/*
	 * the parsed and preprocessed form of the dataset.
	 */
	private List<NNExample> dataset;
	/*
	 * a copy from which examples are drawn.
	 */
	private List<NNExample> presentlyProcessed = new ArrayList<NNExample>();

	/*
	 * list of mean values for all components of the raw data set
	 */
	private List<Double> means;
	/*
	 * list of standard-deviation values for all components of the raw data set
	 */
	private List<Double> stdevs;
	/*
	 * the normalized data set
	 */
	protected List<List<Double>> nds;

	/*
	 * the column numbers of the "target"
	 */
	protected List<Integer> targetColumnNumbers;

	/*
	 * population delegated to subclass because only the subclass knows which
	 * column(s) is the target
	 */
	public abstract void setTargetColumns();

	/*
	 * create a normalized data "table" from the data in the file. At this
	 * stage, the data is not split into input pattern and targets.
	 */
	public void createNormalizedDataFromFile(String filename) throws Exception {

		List<List<Double>> rds = new ArrayList<List<Double>>();

		// create raw data set
		BufferedReader reader = new BufferedReader(new InputStreamReader(
				DataResource.class.getResourceAsStream(filename + ".csv")));

		String line;
		while ((line = reader.readLine()) != null) {
			rds.add(exampleFromString(line, ","));
		}

		// normalize raw dataset
		nds = normalize(rds);
	}

	/*
	 * create a normalized data "table" from the DataSet using the numerizer.
	 * At this stage, the data is not split into input pattern and targets.
	 * TODO remove redundancy of recreating the target columns; the numerizer
	 * has already isolated the targets.
	 */
	public void createNormalizedDataFromDataSet(DataSet ds, Numerizer numerizer)
			throws Exception {

		List<List<Double>> rds = rawExamplesFromDataSet(ds, numerizer);
		// normalize raw dataset
		nds = normalize(rds);
	}

	/*
	 * Gets (and removes) a random example from 'presentlyProcessed'
	 */
	public NNExample getExampleAtRandom() {

		int i = Util.randomNumberBetween(0, (presentlyProcessed.size() - 1));
		return presentlyProcessed.remove(i);
	}

	/*
	 * Gets (and removes) the example at the specified index from
	 * 'presentlyProcessed'
	 */
	public NNExample getExample(int index) {

		return presentlyProcessed.remove(index);
	}

	/*
	 * check if any more examples remain to be processed
	 */
	public boolean hasMoreExamples() {
		return presentlyProcessed.size() > 0;
	}

	/*
	 * check how many examples remain to be processed
	 */
	public int howManyExamplesLeft() {
		return presentlyProcessed.size();
	}

	/*
	 * refreshes the presentlyProcessed dataset so it can be used for a new
	 * epoch of training.
	 */
	public void refreshDataset() {
		presentlyProcessed = new ArrayList<NNExample>();
		for (NNExample e : dataset) {
			presentlyProcessed.add(e.copyExample());
		}
	}

	/*
	 * method called by clients to set up the data set and make it ready for
	 * processing
	 */
	public void createExamplesFromFile(String filename) throws Exception {
		createNormalizedDataFromFile(filename);
		setTargetColumns();
		createExamples();
	}

	/*
	 * method called by clients to set up the data set and make it ready for
	 * processing
	 */
	public void createExamplesFromDataSet(DataSet ds, Numerizer numerizer)
			throws Exception {
		createNormalizedDataFromDataSet(ds, numerizer);
		setTargetColumns();
		createExamples();
	}

	public List<List<Double>> getNormalizedData() {
		return nds;
	}

	public List<Double> getMeans() {
		return means;
	}

	public List<Double> getStdevs() {
		return stdevs;
	}

	//
	// PRIVATE METHODS
	//

	/*
	 * create Example instances from a normalized data "table".
	 */
	private void createExamples() {
		dataset = new ArrayList<NNExample>();
		for (List<Double> dataLine : nds) {
			List<Double> input = new ArrayList<Double>();
			List<Double> target = new ArrayList<Double>();
			for (int i = 0; i < dataLine.size(); i++) {
				if (targetColumnNumbers.contains(i)) {
					target.add(dataLine.get(i));
				} else {
					input.add(dataLine.get(i));
				}
			}
			dataset.add(new NNExample(input, target));
		}
		refreshDataset(); // to populate the presentlyProcessed dataset
	}

	private List<List<Double>> normalize(List<List<Double>> rds) {
		int rawDataLength = rds.get(0).size();
		List<List<Double>> nds = new ArrayList<List<Double>>();

		means = new ArrayList<Double>();
		stdevs = new ArrayList<Double>();

		List<List<Double>> normalizedColumns = new ArrayList<List<Double>>();
		// calculate mean and stdev for each component of the example data
		for (int i = 0; i < rawDataLength; i++) {
			List<Double> columnValues = new ArrayList<Double>();
			for (List<Double> rawDatum : rds) {
				columnValues.add(rawDatum.get(i));
			}
			double mean = Util.calculateMean(columnValues);
			means.add(mean);

			double stdev = Util.calculateStDev(columnValues, mean);
			stdevs.add(stdev);

			normalizedColumns.add(Util.normalizeFromMeanAndStdev(columnValues,
					mean, stdev));
		}
		// rearrange data from columns back into rows
		// TODO Assert normalized columns have same size etc
		int columnLength = normalizedColumns.get(0).size();
		int numberOfColumns = normalizedColumns.size();
		for (int i = 0; i < columnLength; i++) {
			List<Double> lst = new ArrayList<Double>();
			for (int j = 0; j < numberOfColumns; j++) {
				lst.add(normalizedColumns.get(j).get(i));
			}
			nds.add(lst);
		}
		return nds;
	}

	private List<Double> exampleFromString(String line, String separator) {
		// assumes all values for input and target are doubles
		List<Double> rexample = new ArrayList<Double>();
		List<String> attributeValues = Arrays.asList(line.split(separator));
		for (String valString : attributeValues) {
			rexample.add(Double.parseDouble(valString));
		}
		return rexample;
	}

	private List<List<Double>> rawExamplesFromDataSet(DataSet ds,
			Numerizer numerizer) {
		// assumes all values for input and target are doubles
		List<List<Double>> rds = new ArrayList<List<Double>>();
		for (int i = 0; i < ds.size(); i++) {
			List<Double> rexample = new ArrayList<Double>();
			Example e = ds.getExample(i);
			Pair<List<Double>, List<Double>> p = numerizer.numerize(e);
			List<Double> attributes = p.getFirst();
			for (Double d : attributes) {
				rexample.add(d);
			}
			List<Double> targets = p.getSecond();
			for (Double d : targets) {
				rexample.add(d);
			}
			rds.add(rexample);
		}
		return rds;
	}
}
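
/*
 * A minimal usage sketch, not part of the original source. It shows how a
 * concrete subclass might designate its target column(s) and how a client
 * could drain one epoch of examples. The column index (4), the class name
 * ExampleNNDataSet and the resource name "somedata" are illustrative
 * assumptions, not the layout or naming used by any actual data file in
 * the library.
 */
class ExampleNNDataSet extends NNDataSet {

	@Override
	public void setTargetColumns() {
		// assume a five-column CSV whose last column is the target value
		targetColumnNumbers = new ArrayList<Integer>();
		targetColumnNumbers.add(4);
	}

	public static void main(String[] args) throws Exception {
		NNDataSet ds = new ExampleNNDataSet();
		// "somedata" is a hypothetical classpath resource; the framework
		// appends ".csv" before loading it
		ds.createExamplesFromFile("somedata");

		// consume the examples of one training epoch in random order
		while (ds.hasMoreExamples()) {
			NNExample e = ds.getExampleAtRandom();
			// feed 'e' to a function approximator here
		}

		// restore the working copy so a new epoch can begin
		ds.refreshDataset();
	}
}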