Cosan  1.0
Data Analytics Library
CosanData.h
Go to the documentation of this file.
1 #ifndef __COSANData_H_INCLUDED__
2 #define __COSANData_H_INCLUDED__
3 #include <string>
4 #include <Eigen/Dense>
5 #include <Eigen/Core>
6 #include <cosan/io/utils.h>
7 #include <cosan/base/CosanBO.h>
8 #include <tuple>
9 #include <unordered_map>
10 #include <vector>
11 //typedef Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> EigenMatrix;
12 namespace Cosan
13 {
14 // class CosanMatrix: public Eigen::MatrixXd{
15 // public:
16 // CosanMatrix(): Eigen::MatrixXd(){}
17 // CosanMatrix(int row, int col ): Eigen::MatrixXd(row,col){}
18 // template<typename OtherDerived>
19 // CosanMatrix(const Eigen::MatrixBase<OtherDerived> &other): Eigen::MatrixXd(other){}
20 //
21 // template<typename OtherDerived>
22 // CosanMatrix & operator=(const Eigen::MatrixBase <OtherDerived>& other){
23 // this->Eigen::MatrixXd::operator=(other);
24 // return *this;
25 // }
26 //
27 // template<typename NumericType,
28 // typename = typename std::enable_if<std::is_arithmetic<NumericType>::value,NumericType>::type>
29  /**
30  * @brief Raw Data container.
31  * @details Apart from the default constructor, every constructor takes at least one csv path. To obtain CosanRawData, the following two constructors can be used:
32  * @code CosanRawData(const std::string & srcX); @endcode
33  * @code CosanRawData(const std::string & srcX, const std::string & srcY); @endcode
34  **/
35  template<Numeric NumericType>
36  class CosanRawData: public CosanBO{
37  public:
38  CosanRawData()=default;
39  /**
40  * @brief Constructor: Read data X and Y from csv files and form raw data container.
41  * @param[in] srcX path to the csv file of data X;
42  * @param[in] srcY path to the csv file of data Y.
43  * @note X and Y are from two separate csv files. For the data format of csv files, see Tutorial.
44  **/
45  CosanRawData(const std::string & srcX, const std::string & srcY):CosanBO(){
46 // static_assert(std::is_arithmetic<NumericType>::value, "NumericType must be numeric");
47  if (std::is_same_v<NumericType, bool>){
48  throw std::invalid_argument(
49  "We do not accept bool at this moment. Try unsigned int, unsigned long, unsigned long long, int, "
50  "long, long, float, double ,long double.");
51  }
52  SetInput(srcX);
53  SetTarget(srcY);
54  }
55  /**
56  * @brief Constructor: Read data X from csv and form raw data container.
57  * @param[in] srcX path to the csv file of data X.
58  **/
59  CosanRawData(const std::string & srcX):CosanBO(){
60 // static_assert(std::is_arithmetic<NumericType>::value, "NumericType must be numeric");
61  if (std::is_same_v<NumericType, bool>){
62  throw std::invalid_argument(
63  "We do not accept bool at this moment. Try unsigned int, unsigned long, unsigned long long, int, "
64  "long, long, float, double ,long double.");
65  }
66  SetInput(srcX);}
67  // virtual ~CosanBO();
68 
69  /**
70  * @brief Update input X from csv file.
71  * @param[in] srcX path to the csv file of data X.
72  * @details
73  **/
74  void SetInput(const std::string & srcX){
76  }
77  /**
78  * @brief Update target Y from csv file.
79  * @param[in] srcY path to the csv file of data Y.
80  * @details
81  **/
82  void SetTarget(const std::string & srcY){
84  if (colCatY.size()!=0){
85  catY=true;
86  }
87  }
88  /**
89  * @brief Concatenate X using CosanMatrix<NumericType> input X. Add new columns
90  * @param[in] const CosanMatrix<NumericType>& inputX
91  *
92  * @details Add new columns.
93  **/
95  if (GetrowsX()!=inputX.rows()){
96  throw std::invalid_argument(fmt::format("To concatenate, the number of rows from inputX should match with original X. Current nrow of X is {:}", GetrowsX() ));
97  }
98  for (gsl::index i = 0 ;i<inputX.cols();i++){
99  X.conservativeResize(X.rows(), X.cols()+1);
100  X.col(X.cols() - 1) = inputX.col(i);
101  }
102  }
103  /**
104  * @brief Update X using CosanMatrix<NumericType> input X
105  * @param[in] const CosanMatrix<NumericType>& inputX
106  * @details
107  **/
109  X = inputX;
110  }
111  /**
112  * @brief Update X and Y using CosanMatrix<NumericType> inputX,inputY
113  * @param[in] const CosanMatrix<NumericType>& inputX,const CosanMatrix<NumericType>& inputY
114  * @details
115  **/
117  X = inputX;
118  Y = inputY;
119  }
120  /**
121  * @brief Update categorical vector svaluesX using std::vector<std::string> & inputX
122  * @param[in] const std::vector<std::string> & inputX
123  * @details
124  **/
125  void UpdateCat(const std::vector<std::string> & inputX){
126  svaluesX = inputX;
127  }
128  /**
129  * @brief Update categorical vector svaluesX,svaluesY using std::vector<std::string> & inputX,inputY
130  * @param[in] const std::vector<std::string> & inputX,inputY
131  * @details
132  **/
133  void UpdateCat(const std::vector<std::string> & inputX,const std::vector<std::string> & inputY){
134  svaluesX = inputX;
135  svaluesY = inputY;
136  }
137 
138  /**
139  * @brief Get a copy of CosanMatrix<NumericType> X
140  **/
142  return X;
143  }
144  /**
145  * @brief Get a copy of CosanMatrix<NumericType> Y
146  **/
148  return Y;
149  }
150  /**
151  * @brief Get a const reference to const CosanMatrix<NumericType> X
152  **/
154  return X;
155  }
156  /**
157  * @brief Get a const reference to const CosanMatrix<NumericType> Y
158  **/
160  return Y;
161  }
162  /**
163  * @brief Get the number of missing (NaN) entries in X and in Y
164  * @return std::tuple<# missing of X,#missing of Y>
165  **/
166  std::tuple<gsl::index,gsl::index> GetMissingNumber(){
167  return {X.array().isNaN().template cast<NumericType>().sum(),Y.array().isNaN().template cast<NumericType>().sum()};
168  }
169 
170  // virtual CosanBO *Shallow_copy() const;
171  // virtual CosanBO *Deep_copy() const;
172  /**
173  * @brief Get the name of the objects
174  * @return std::string
175  **/
176  virtual const std::string GetName() const {return "Raw Data Object.";}
177 
178  // virtual bool SaveFile(const std::string &path ,const std::string & prefix = "");
179  // virtual bool LoadFile(const std::string &path);
180  // void PrintModel();
181  // virtual bool Equals(CosanBO* other, float accuracy = 0.0);
182  // virtual CosanBO* Clone();
183  /**
184  * @brief Get the summary message on reading csv file on X
185  * @return std::string
186  **/
187  const std::string& GetSummaryMessageX() const {return SummaryMessageX;}
188  /**
189  * @brief Get the summary message on reading csv file on Y
190  * @return std::string
191  **/
192  const std::string& GetSummaryMessageY() const {return SummaryMessageY;}
193  /**
194  * @brief Raw data column index to numeric data matrix X column index
195  * @return std::unordered_map<gsl::index,gsl::index>
196  **/
197  std::unordered_map<gsl::index,gsl::index> & GetRawToNumIdx(){return _raw2numIdx;}
198  /**
199  * @brief Raw data column index to categorical data column index
200  * @return std::unordered_map<gsl::index,gsl::index>
201  **/
202  std::unordered_map<gsl::index,gsl::index> & GetRawToCatIdx(){return _raw2catIdx;}
203  /**
204  * @brief Get the position of positive infinity in the origin data X
205  * @return std::vector<std::vector<gsl::index>>
206  **/
207  std::vector<std::vector<gsl::index>> GetIdxpinfX() const {return IdxpinfX;}
208  /**
209  * @brief Get the position of negative infinity in the origin data X
210  * @return std::vector<std::vector<gsl::index>>
211  **/
212  std::vector<std::vector<gsl::index>> GetIdxminfX() const {return IdxminfX;}
213  /**
214  * @brief Get the position of missing in the origin data X
215  * @return std::vector<std::vector<gsl::index>>
216  **/
217  std::vector<std::vector<gsl::index>> GetIdxmissingX() const {return IdxmissingX;}
218  /**
219  * @brief Get the position of positive infinity in the origin data Y
220  * @return std::vector<std::vector<gsl::index>>
221  **/
222  std::vector<std::vector<gsl::index>> GetIdxpinfY() const {return IdxpinfY;}
223  /**
224  * @brief Get the position of negative infinity in the origin data Y
225  * @return std::vector<std::vector<gsl::index>>
226  **/
227  std::vector<std::vector<gsl::index>> GetIdxminfY() const {return IdxminfY;}
228  /**
229  * @brief Get the position of missing in the origin data Y
230  * @return std::vector<std::vector<gsl::index>>
231  **/
232  std::vector<std::vector<gsl::index>> GetIdxmissingY() const {return IdxmissingY;}
233  /**
234  * @brief Get the column index (in the origin X of csv file) where the column is of categorical type.
235  * @return std::set<gsl::index>
236  **/
237  std::set<gsl::index> GetcolCatX() const{return colCatX;}
238  /**
239  * @brief Get the column index (in the origin Y of csv file) where the column is of categorical type.
240  * @return std::set<gsl::index>
241  * @note If Y is one dimension, the size of return should be one. Otherwise it is empty.
242  **/
243  std::set<gsl::index> GetcolCatY() const {return colCatY;}
244  /**
245  * @brief True if Y is categorical data type. False otherwise.
246  * @return bool
247  **/
248  bool GetcatY() const {return catY;}
249 
250  /**
251  * @brief Get the number of rows for X.
252  * @return gsl::index
253  **/
254  gsl::index GetrowsX() {
255  rowsX=X.rows();
256  return rowsX;
257  }
258  /**
259  * @brief Get the number of rows for Y.
260  * @return gsl::index
261  **/
262  gsl::index GetrowsY() {
263  rowsY=Y.rows();
264  return rowsY;
265  }
266  /**
267  * @brief Get the number of columns for X.
268  * @return gsl::index
269  **/
270  gsl::index GetcolsX() {
271  colsX = X.cols();
272  return colsX;
273  }
274  /**
275  * @brief Get the number of columns for Y.
276  * @return gsl::index
277  **/
278  gsl::index GetcolsY() {
279  colsY = Y.cols();
280  return colsY;
281  }
282  /**
283  * @brief Get the vector of categorical data from X. order: row first.
284  * @return std::vector<std::string>
285  * @note it is a std::vector of std::string. Strings are stored row-first.
286  **/
287  std::vector<std::string> GetsvaluesX() const {return svaluesX;}
288  /**
289  * @brief Get the vector of categorical data from Y. order: row first.
290  * @return std::vector<std::string>
291  * @note it is a std::vector of std::string. Strings are stored row-first.
292  **/
293  std::vector<std::string> GetsvaluesY() const {return svaluesY;}
294  /**
295  * @return an empty CosanMatrix<NumericType> data structure
296  * @note it is used to determine the CosanMatrix<NumericType> data type.
297  **/
299  protected:
300  /**
301  * @brief Numeric data from origin CSV file for X.
302  **/
304  /**
305  * @brief Numeric data from origin CSV file for Y.
306  **/
309  /**
310  * @brief Loading message.
311  **/
313  /**
314  * @brief position for positive, negative infinity and missing values.
315  **/
316  std::vector<std::vector<gsl::index>> IdxpinfX,IdxminfX,IdxmissingX;
317  std::vector<std::vector<gsl::index>> IdxpinfY,IdxminfY,IdxmissingY;
318  /**
319  * @brief column idx in the origin data that is categorical data.
320  **/
321  std::set<gsl::index> colCatX,colCatY;
322  /**
323  * @brief True means the response variable Y is categorical data.
324  **/
325  bool catY = false;
326  /**
327  * @brief Number of rows and number of columns of X.
328  **/
329  gsl::index rowsX = 0,colsX = 0;
330  /**
331  * @brief Number of rows and number of columns of Y.
332  **/
333  gsl::index rowsY = 0,colsY = 0;
334  /**
335  * Get the vector of categorical data from original data. order: row first.
336  */
337  std::vector<std::string> svaluesX,svaluesY;
338  private:
339  std::unordered_map<gsl::index,gsl::index> _raw2numIdx,_raw2catIdx;
340  // template<typename Matrix>
341 
342  /**
343  * @brief load data from csv
344  * @details We accept data file in `csv` format where each data is of dimension n\times p where n (the number of rows)
345  * is number of samples and `p` (the number of columns) denotes number of features. Each data entry is separated by ","
346  * and allows for positive/negative infinity (user-specific `NumericType` is `float`,`double` or `long double`),
347  * missing values (either an empty entry between two contiguous commas "," or a NAN expression) and non-number strings.
348  * If user-specific `NumericType` is `float`,`double` or `long double`, acceptable numeric expressions also include
349  * hexadecimal and variants of decimal floating-point expressions (see [this](https://en.cppreference.com/w/cpp/string/basic_string/stof) for more details).
350  * It will throw ``std::invalid_argument`` if the entry read is a not-a-number expression, unless this entry is
351  * of categorical type.
352  *
353  * We determine each column's data type (either numeric or categorical) by the first row.
354  * We treat every entry as numeric if it is a number (whether it is ordinal or numerical) and treat every entry
355  * that does not start with a numeric as categorical (also called nominal data specifically). For those starting
356  * with a numeric but containing non-numeric character, ```std::invalid_argument``` will be thrown.
357  **/
358  std::tuple<gsl::index,gsl::index,std::string> _load_csv (const std::string & path, CosanMatrix<NumericType>& X,std::vector<std::vector<gsl::index>>& Idxpinf,
359  std::vector<std::vector<gsl::index>>& Idxminf,std::vector<std::vector<gsl::index>>& Idxmissing,
360  std::vector<std::string> &svalues, std::set<gsl::index> & colCat) {
361 
362  std::ifstream indata;
363  indata.open(path);
364  std::string line;
365  std::vector<NumericType> values;
366  // std::vector<std::string> svalues;
367  gsl::index rows = 0,cols = 0,col_idx=0;
368  // uint col_idx=0;
369  // std::vector<std::vector<uint>> Idxpinf,Idxminf,Idxmissing;
370  // std::set<uint> colCat;
371  NumericType result;
372  std::size_t pos;
373  std::string SummaryMessage;
374 
375  // stod -> "-23","-12E1","-+nan" (is a double type), "+\- inf","+\- infinity", "jklsgfd","1235lkjfg",
376 // first row: empty/nonempty: empty-> numerical nan,
377  // not empty-> can read numeric -> it is indeed a correct numeric format
378 // -> wrong format i.e. "1235lkjfg" throw error
379  // -> cannot read numeric-> then consider this as category
380 
381 // then set colCat, cols,
382  // numeric-format,
383  //
384  std::getline(indata, line);
385  std::stringstream lineStream(line);
386  std::string cell;
387  while(getline(lineStream, cell, ',')) {
388  if (cell.size()==0){
389  values.push_back(StringToNum<NumericType>(std::string("nan")));
390  Idxmissing.push_back(std::vector<gsl::index>({rows,col_idx}));
391  col_idx++;
392  cols=std::max(cols,col_idx);
393  continue;
394  }
395  try{
396  result = StringToNum<NumericType>(cell, &pos);
397  }catch(...){
398  svalues.push_back(cell);
399  colCat.insert(col_idx);
400  col_idx++;
401  cols=std::max(cols,col_idx);
402  continue;
403  }
404  if (pos!=cell.size()){
405  throw std::invalid_argument(
406  "Incorrect numeric format! Abort the program. The entry reads "+cell+
407  " and the position is ("+ std::to_string(rows)+","+ std::to_string(col_idx)+")");
408  }
409  values.push_back(result);
410  if (isinf(values.back())){
411  if (values.back()==std::numeric_limits<NumericType>::infinity()){
412  Idxpinf.push_back(std::vector<gsl::index>({rows,col_idx}));}
413  else {Idxminf.push_back(std::vector<gsl::index>({rows,col_idx}));}
414  }
415  else if (isnan(values.back())){
416  Idxmissing.push_back(std::vector<gsl::index>({rows,col_idx}));
417  }
418  col_idx++;
419  cols=std::max(cols,col_idx);
420  }
421  rows = 1;
422  col_idx = 0;
423 
424 
425  while (std::getline(indata, line)) {
426  // std::stringstream lineStream(line);
427  // std::string cell;
428  lineStream.str("");
429  lineStream.clear(); // Clear state flags.
430  lineStream<<line;
431  while(getline(lineStream, cell, ',')) {
432  if (cell.size()==0){
433  if (colCat.find(col_idx)==colCat.end()){
434  values.push_back(StringToNum<NumericType>(std::string("nan")));
435  }
436  else{
437  svalues.push_back("");
438  }
439  Idxmissing.push_back(std::vector<gsl::index>({rows,col_idx}));
440  col_idx++;
441  continue;
442  }
443  try{
444  result = StringToNum<NumericType>(cell, &pos);
445  }catch(...){
446  if (colCat.find(col_idx)!=colCat.end())
447  {
448  svalues.push_back(cell);
449  colCat.insert(col_idx);
450  col_idx++;
451  continue;}
452  else{
453  throw std::invalid_argument(
454  "Incorrect value type! Should be numeric but non-numeric input. The entry reads "+cell+
455  " and the position is ("+ std::to_string(rows)+","+ std::to_string(col_idx)+")");
456  }
457  }
458  if (pos!=cell.size()){
459  throw std::invalid_argument(
460  "Incorrect numeric format! Abort the program. The entry reads "+cell+
461  " and the position is ("+ std::to_string(rows)+","+ std::to_string(col_idx)+")");
462  }
463  values.push_back(result);
464  if (isinf(values.back())){
465  if (values.back()==std::numeric_limits<NumericType>::infinity()){
466  Idxpinf.push_back(std::vector<gsl::index>({rows,col_idx}));}
467  else {Idxminf.push_back(std::vector<gsl::index>({rows,col_idx}));}
468  }
469  else if (isnan(values.back())){
470  Idxmissing.push_back(std::vector<gsl::index>({rows,col_idx}));
471  }
472  col_idx++;
473  }
474  if (cols!=col_idx){
475  std::cout<<cols<<" "<<col_idx<<std::endl;
476  throw std::invalid_argument("Not all rows has same number of entry! First row has "+std::to_string(cols)+" columns but row "+std::to_string(rows)+" has "+std::to_string(col_idx)+" columns!" );
477  }
478  ++rows;
479  col_idx=0;
480  }
481  X.resize(rows,values.size()/rows);
482  gsl::index i =0,__cols = values.size()/rows;
483  for (auto &each :values ){
484  X(i/__cols,i%__cols) = each;
485  i++;
486  }
487 
488 // Eigen::Map<const CosanMatrix<NumericType>>(values.data(), rows, values.size()/rows);
489 
490  SummaryMessage+="Number of rows: "+std::to_string(rows)+"\n";
491  SummaryMessage+="Number of columns: "+std::to_string(cols)+"\n";
492  SummaryMessage+="Number of positive infinity values: "+std::to_string(Idxpinf.size())+". They are at " ;
493  for(auto each :Idxpinf){
494  SummaryMessage+="("+std::to_string(each[0])+","+std::to_string(each[1])+") ";
495  }
496  SummaryMessage+="\n";
497  SummaryMessage+="Number of negative infinity values: "+std::to_string(Idxminf.size())+". They are at ";
498  for(auto each :Idxminf){
499  SummaryMessage+="("+std::to_string(each[0])+","+std::to_string(each[1])+")"+" ";
500  }
501  SummaryMessage+="\n";
502  SummaryMessage+="Number of missing values: "+std::to_string(Idxmissing.size())+". They are at ";
503  for(auto each :Idxmissing){
504  SummaryMessage+="("+std::to_string(each[0])+","+std::to_string(each[1])+")"+" ";
505  }
506  SummaryMessage+="\n";
507  // for (auto fvalue:values) {std::cout<<fvalue<<std::endl;}
508  SummaryMessage+="Columns of categorical values: Column ";
509  for (auto idx:colCat) {
510  SummaryMessage+=std::to_string(idx)+" ";}
511  SummaryMessage+="\n";
512  gsl::index j = 0 ;
513  for (gsl::index i = 0;i<cols;i++){
514  if (colCat.find(i)==colCat.end()){
515  _raw2numIdx[i] = j;
516  j++;}
517  }
518  j = 0 ;
519  for (gsl::index i = 0;i<cols;i++){
520  if(colCat.find(i)!=colCat.end()){
521  _raw2catIdx[i]=j;
522  j++;
523  }
524  }
525 
526 
527  return {rows,cols,SummaryMessage};
528 
529 
530  }
531 
532  };
533 
534 // template<typename NumericType,
535 // typename = typename std::enable_if<std::is_arithmetic<NumericType>::value,NumericType>::type>
536  /**
537  * @brief Data container.
538  * @details Every constructor needs to have at least one input. To obtain CosanData, the following constructors can be used:
539  * @code CosanData(gsl::index nrows,gsl::index ncols,NumericType lb=0,NumericType ub = 1) @endcode
540  * @code CosanData(const CosanMatrix<NumericType> & inputX) @endcode
541  * @code CosanData(const CosanMatrix<NumericType>& inputX,const CosanMatrix<NumericType>& inputY) @endcode
542  * @code CosanData(const std::vector<NumericType>& inputX,gsl::index nrows,const std::string & order = "rowfirst") @endcode
543  * @code CosanData(const std::vector<NumericType>& inputX,const std::vector<NumericType>& inputY,gsl::index nrows,const std::string & order = "rowfirst") @endcode
544  **/
545  template<Numeric NumericType>
546  class CosanData: public CosanRawData<NumericType>{
547  public:
548  CosanData()=default;
549  /**
550  * @brief Generate random matrix with each entry uniformly sampled from lb to ub. Dimension is nrows by ncols.
551  **/
552  CosanData(gsl::index nrows,gsl::index ncols,NumericType lb=0,NumericType ub = 1):CosanRawData<NumericType>(){
553  this->X.resize(nrows,ncols);
554  std::default_random_engine generator;
555  std::uniform_real_distribution<double> distribution(lb,ub);
556  for (gsl::index i = 0;i<nrows*ncols;i++){
557  this->X(i/ncols,i%ncols) =distribution(generator);
558  }
559  }
560  /**
561  * @brief Get CosanData from CosanMatrix<NumericType> inputX
562  **/
564  static_assert(std::is_arithmetic<NumericType>::value, "NumericType must be numeric");
565  this->X = inputX;
566  }
567  /**
568  * @brief Get CosanData from CosanMatrix<NumericType> inputX, inputY
569  **/
571  static_assert(std::is_arithmetic<NumericType>::value, "NumericType must be numeric");
572  this->X = inputX;
573  this->Y = inputY;}
574  /**
575  * @brief Get CosanData from std::vector inputX, fill the data either by 'rowfirst' or 'columnfirst'.
576  **/
577  CosanData(const std::vector<NumericType>& inputX,gsl::index nrows,const std::string & order = "rowfirst"):CosanRawData<NumericType>(){
578  if (nrows>inputX.size() || inputX.size()%nrows!=0){
579  throw std::invalid_argument(
580  fmt::format("Incorrect nrows specification, should be less than or equal to input vector size and size is divisible by nrows. Input vector size is "
581  "{:} and nrows is {:}",inputX.size(),nrows));
582  }
583  this->X.resize(nrows,inputX.size()/nrows);
584  gsl::index i =0,__cols=inputX.size()/nrows;
585  if (order=="columnfirst"){
586  i =0;
587  for (auto &each :inputX ){
588  this->X(i%nrows,i/nrows) = each;
589  i++;
590  }
591  return;
592  }
593  i =0;
594  for (auto &each :inputX ){
595  this->X(i/__cols,i%__cols) = each;
596  i++;
597  }
598  return;
599  }
600 
601 
602  /**
603  * @brief Get CosanData from std::vector inputX and inputY, fill the data either by 'rowfirst' or 'columnfirst'.
604  **/
605  CosanData(const std::vector<NumericType>& inputX,const std::vector<NumericType>& inputY,gsl::index nrows,const std::string & order = "rowfirst"):CosanRawData<NumericType>(){
606  if (nrows>inputX.size() || inputX.size()%nrows!=0 || nrows!=inputY.size()){
607  throw std::invalid_argument(
608  fmt::format("Incorrect nrows specification, should be less than or equal to input vector size and size is divisible by nrows. inputY size should also be equal to nrows."
609  "inputX vector size is {:}, inputY vector size is {:} and nrows is {:}",inputX.size(),inputY.size(),nrows));
610  }
611  this->X.resize(nrows,inputX.size()/nrows);
612  this->Y.resize(nrows,1);
613  gsl::index i =0,__cols=inputX.size()/nrows;
614  for (auto &each:inputY){
615  this->Y(i,0) = each;
616  i++;
617  }
618  if (order=="columnfirst"){
619  i =0;
620  for (auto &each :inputX ){
621  this->X(i%nrows,i/nrows) = each;
622  i++;
623  }
624  return;
625  }
626  i =0;
627  for (auto &each :inputX ){
628  this->X(i/__cols,i%__cols) = each;
629  i++;
630  }
631  return;
632  }
633 
634  /**
635  * @brief Get the name of the object.
636  **/
637  virtual const std::string GetName() const {return "Processed Data Object.";}
638  private:
639  };
640 }
641 
642 #endif
Cosan::CosanRawData::GetrowsY
gsl::index GetrowsY()
Get the number of rows for Y.
Definition: CosanData.h:262
Cosan::CosanRawData::GetIdxmissingY
std::vector< std::vector< gsl::index > > GetIdxmissingY() const
Get the position of missing in the origin data Y.
Definition: CosanData.h:232
Cosan::CosanRawData::GetsvaluesY
std::vector< std::string > GetsvaluesY() const
Get the vector of categorical data from Y. order: row first.
Definition: CosanData.h:293
Cosan::CosanRawData::SummaryMessageY
std::string SummaryMessageY
Definition: CosanData.h:312
Cosan
Definition: CosanBO.h:29
Cosan::CosanRawData::GetInput
CosanMatrix< NumericType > GetInput()
Get a copy of CosanMatrix<NumericType> X.
Definition: CosanData.h:141
Cosan::CosanRawData::GetTarget
CosanMatrix< NumericType > GetTarget()
Get a copy of CosanMatrix<NumericType> Y.
Definition: CosanData.h:147
Cosan::CosanRawData::GetName
virtual const std::string GetName() const
Get the name of the objects.
Definition: CosanData.h:176
Cosan::CosanRawData::GetcolsY
gsl::index GetcolsY()
Get the number of columns for Y.
Definition: CosanData.h:278
Cosan::CosanData::CosanData
CosanData(const std::vector< NumericType > &inputX, gsl::index nrows, const std::string &order="rowfirst")
Get CosanData from std::vector inputX, fill the data either by 'rowfirst' or 'columnfirst'.
Definition: CosanData.h:577
Cosan::CosanRawData::colsY
gsl::index colsY
Definition: CosanData.h:333
Cosan::CosanRawData::GetInput
const CosanMatrix< NumericType > & GetInput() const
Get a const reference to const CosanMatrix<NumericType> X.
Definition: CosanData.h:153
NumericType
double NumericType
Definition: onehotencodingTest.cpp:20
Cosan::CosanBO
Cosan Base Object.
Definition: CosanBO.h:62
Cosan::CosanRawData::CosanRawData
CosanRawData(const std::string &srcX)
Constructor: Read data X from csv and form raw data container.
Definition: CosanData.h:59
Cosan::CosanRawData::IdxminfX
std::vector< std::vector< gsl::index > > IdxminfX
Definition: CosanData.h:316
Cosan::CosanRawData::__TYPE
CosanMatrix< NumericType > __TYPE
Definition: CosanData.h:308
Cosan::CosanData::GetName
virtual const std::string GetName() const
Get the name of the object.
Definition: CosanData.h:637
Cosan::CosanRawData::GetcatY
bool GetcatY() const
True if Y is categorical data type. False otherwise.
Definition: CosanData.h:248
Cosan::CosanRawData::UpdateData
void UpdateData(const CosanMatrix< NumericType > &inputX)
Update X using CosanMatrix<NumericType> input X.
Definition: CosanData.h:108
Cosan::CosanRawData::GetcolsX
gsl::index GetcolsX()
Get the number of columns for X.
Definition: CosanData.h:270
Cosan::CosanRawData::IdxpinfX
std::vector< std::vector< gsl::index > > IdxpinfX
position for positive, negative infinity and missing values.
Definition: CosanData.h:316
Cosan::CosanRawData::svaluesX
std::vector< std::string > svaluesX
Definition: CosanData.h:337
Cosan::CosanData::CosanData
CosanData(const CosanMatrix< NumericType > &inputX)
Get CosanData from CosanMatrix<NUmericType> inputX.
Definition: CosanData.h:563
Cosan::CosanRawData::GetsvaluesX
std::vector< std::string > GetsvaluesX() const
Get the vector of categorical data from X. order: row first.
Definition: CosanData.h:287
Cosan::CosanMatrix
Eigen::Matrix< NumericType, Eigen::Dynamic, Eigen::Dynamic > CosanMatrix
Definition: CosanBO.h:37
Cosan::CosanRawData::GetIdxmissingX
std::vector< std::vector< gsl::index > > GetIdxmissingX() const
Get the position of missing in the origin data X.
Definition: CosanData.h:217
Cosan::CosanData::CosanData
CosanData(const CosanMatrix< NumericType > &inputX, const CosanMatrix< NumericType > &inputY)
Get CosanData from CosanMatrix<NUmericType> inputX, inputY.
Definition: CosanData.h:570
Cosan::CosanRawData::GetcolCatX
std::set< gsl::index > GetcolCatX() const
Get the column index (in the origin X of csv file) where the column is of categorical type.
Definition: CosanData.h:237
Cosan::CosanRawData::UpdateCat
void UpdateCat(const std::vector< std::string > &inputX)
Update categorical vector svaluesX using std::vector<std::string> & inputX.
Definition: CosanData.h:125
Cosan::CosanRawData::IdxmissingX
std::vector< std::vector< gsl::index > > IdxmissingX
Definition: CosanData.h:316
Cosan::CosanData
Data container.
Definition: CosanData.h:546
utils.h
Cosan::CosanRawData::SetTarget
void SetTarget(const std::string &srcY)
Update target Y from csv file.
Definition: CosanData.h:82
Cosan::CosanRawData::catY
bool catY
true mean respone variable Y is categorical data.
Definition: CosanData.h:325
Cosan::CosanRawData::X
CosanMatrix< NumericType > X
Numeric data from origin CSV file for X.
Definition: CosanData.h:303
Cosan::CosanRawData::_load_csv
std::tuple< gsl::index, gsl::index, std::string > _load_csv(const std::string &path, CosanMatrix< NumericType > &X, std::vector< std::vector< gsl::index >> &Idxpinf, std::vector< std::vector< gsl::index >> &Idxminf, std::vector< std::vector< gsl::index >> &Idxmissing, std::vector< std::string > &svalues, std::set< gsl::index > &colCat)
load data from csv
Definition: CosanData.h:358
Cosan::CosanRawData::_raw2numIdx
std::unordered_map< gsl::index, gsl::index > _raw2numIdx
Definition: CosanData.h:339
Cosan::CosanRawData::GetIdxminfY
std::vector< std::vector< gsl::index > > GetIdxminfY() const
Get the position of negative infinity in the origin data Y.
Definition: CosanData.h:227
Cosan::CosanRawData::colCatY
std::set< gsl::index > colCatY
Definition: CosanData.h:321
Cosan::CosanRawData::rowsY
gsl::index rowsY
number of columns
Definition: CosanData.h:333
Cosan::CosanRawData::CosanRawData
CosanRawData()=default
Cosan::CosanRawData::svaluesY
std::vector< std::string > svaluesY
Definition: CosanData.h:337
Cosan::CosanRawData::IdxmissingY
std::vector< std::vector< gsl::index > > IdxmissingY
Definition: CosanData.h:317
Cosan::CosanRawData
Raw Data container.
Definition: CosanData.h:36
Cosan::CosanRawData::GetType
CosanMatrix< NumericType > GetType()
Definition: CosanData.h:298
CosanBO.h
CosanBO.
Cosan::CosanRawData::GetIdxminfX
std::vector< std::vector< gsl::index > > GetIdxminfX() const
Get the position of negative infinity in the origin data X.
Definition: CosanData.h:212
Cosan::CosanRawData::rowsX
gsl::index rowsX
number of rows.
Definition: CosanData.h:329
Cosan::CosanRawData::GetcolCatY
std::set< gsl::index > GetcolCatY() const
Get the column index (in the origin Y of csv file) where the column is of categorical type.
Definition: CosanData.h:243
Cosan::CosanRawData::GetSummaryMessageY
const std::string & GetSummaryMessageY() const
Get the summary message on reading csv file on Y.
Definition: CosanData.h:192
Cosan::CosanRawData::CosanRawData
CosanRawData(const std::string &srcX, const std::string &srcY)
Constructor: Read data X and Y from csv files and form raw data container.
Definition: CosanData.h:45
Cosan::CosanRawData::_raw2catIdx
std::unordered_map< gsl::index, gsl::index > _raw2catIdx
Definition: CosanData.h:339
Cosan::CosanRawData::GetTarget
const CosanMatrix< NumericType > & GetTarget() const
Get a const reference to const CosanMatrix<NumericType> Y.
Definition: CosanData.h:159
Cosan::CosanRawData::GetRawToCatIdx
std::unordered_map< gsl::index, gsl::index > & GetRawToCatIdx()
Raw data column index to categorical data column index.
Definition: CosanData.h:202
Cosan::CosanRawData::GetSummaryMessageX
const std::string & GetSummaryMessageX() const
Get the summary message on reading csv file on X.
Definition: CosanData.h:187
Cosan::CosanRawData::GetrowsX
gsl::index GetrowsX()
Get the number of rows for X.
Definition: CosanData.h:254
Cosan::CosanRawData::UpdateData
void UpdateData(const CosanMatrix< NumericType > &inputX, const CosanMatrix< NumericType > &inputY)
Update X and Y using CosanMatrix<NumericType> inputX,inputY.
Definition: CosanData.h:116
Cosan::CosanRawData::IdxminfY
std::vector< std::vector< gsl::index > > IdxminfY
Definition: CosanData.h:317
Cosan::CosanRawData::GetIdxpinfX
std::vector< std::vector< gsl::index > > GetIdxpinfX() const
Get the position of positive infinity in the origin data X.
Definition: CosanData.h:207
Cosan::CosanRawData::UpdateCat
void UpdateCat(const std::vector< std::string > &inputX, const std::vector< std::string > &inputY)
Update categorical vector svaluesX,svaluesY using std::vector<std::string> & inputX,...
Definition: CosanData.h:133
Cosan::CosanRawData::GetMissingNumber
std::tuple< gsl::index, gsl::index > GetMissingNumber()
Get the total number data information.
Definition: CosanData.h:166
Cosan::CosanRawData::IdxpinfY
std::vector< std::vector< gsl::index > > IdxpinfY
Definition: CosanData.h:317
Cosan::CosanRawData::Y
CosanMatrix< NumericType > Y
Numeric data from origin CSV file for Y.
Definition: CosanData.h:307
Cosan::CosanRawData::ConcatenateData
void ConcatenateData(const CosanMatrix< NumericType > &inputX)
Concatenate X using CosanMatrix<NumericType> input X. Add new columns.
Definition: CosanData.h:94
Cosan::CosanData::CosanData
CosanData(const std::vector< NumericType > &inputX, const std::vector< NumericType > &inputY, gsl::index nrows, const std::string &order="rowfirst")
Get CosanData from std::vector inputX and inputY, fill the data either by 'rowfirst' or 'columnfirst'...
Definition: CosanData.h:605
Cosan::CosanRawData::SetInput
void SetInput(const std::string &srcX)
Update input X from csv file.
Definition: CosanData.h:74
Cosan::CosanData::CosanData
CosanData(gsl::index nrows, gsl::index ncols, NumericType lb=0, NumericType ub=1)
Generate random matrix with each entry uniformly sampled from lb to lb. Dimension is nrows by ncols.
Definition: CosanData.h:552
Cosan::CosanRawData::SummaryMessageX
std::string SummaryMessageX
Loading message.
Definition: CosanData.h:312
Cosan::CosanRawData::colCatX
std::set< gsl::index > colCatX
column idx in the origin data that is categorical data.
Definition: CosanData.h:321
Cosan::CosanRawData::GetIdxpinfY
std::vector< std::vector< gsl::index > > GetIdxpinfY() const
Get the position of positive infinity in the origin data Y.
Definition: CosanData.h:222
Cosan::CosanData::CosanData
CosanData()=default
Cosan::CosanRawData::GetRawToNumIdx
std::unordered_map< gsl::index, gsl::index > & GetRawToNumIdx()
Raw data column index to numeric data matrix X column index.
Definition: CosanData.h:197
Cosan::CosanRawData::colsX
gsl::index colsX
Definition: CosanData.h:329