Cosan  1.0
Data Analytics Library
encoder.h
Go to the documentation of this file.
1 #ifndef COSAN_ENCODER_H
2 #define COSAN_ENCODER_H
3 
5 #include <string>
6 
7 namespace Cosan {
// One-hot encoder for the categorical columns held by a CosanRawData container.
// NOTE(review): this listing skips several source lines (25, 28, 32, 70 and 76 by its
// own numbering), so the constructor signature, the fit() signature, one constructor
// statement, and the CatMatrix member/accessor are not visible here. Confirm those
// details against the real encoder.h before relying on the comments below.
8  template<Numeric NumericType>
9  class Encoder: public Preprocessor<NumericType> {
10 // Encoder() = default;
11 //
12 // Encoder(std::vector<std::unordered_map<std::string, gsl::index>> categories) {
13 // this->categories = categories;
14 // }
15 //
16 // ~Encoder() = default;
17 //
18 // enum HANDLE_UNKNOWN {
19 // IGNORE,
20 // ERROR
21 // }
22 // ;
23  public:
    // Default construction is disabled.
24  Encoder() = delete;
    // Body of the constructor that uses RD and add_back (its signature line is missing
    // from this listing). It runs fit(RD) and, when add_back is true, prints the
    // dimensions of X together with the number of one-hot columns in CatMatrix.
    // NOTE(review): the statement between the `if` and the print is also missing from
    // the listing; presumably it appends CatMatrix to RD's X -- confirm.
26  fit(RD);
27  if (add_back==true){
29  fmt::print("Notice that CRD.X has been modified. The dimension of X is ({:},{:}). {:} columns of one-hot encodings have been added.\n",RD.GetrowsX(),RD.GetcolsX(),CatMatrix.cols());
30  }
31  }
    // Body of fit (its signature line is missing from this listing). It fills
    // this->categories with one category-to-ordinal map per categorical column and
    // builds CatMatrix by placing each column's one-hot block side by side.
33  fmt::print("*********************************\n");
34  fmt::print("Begin encoding categorical data !\n");
35 
    // Number of categorical columns reported by RD.
36  gsl::index colCat = RD.GetcolCatX().size();
    // Flattened categorical strings; read below as svaluesX[row * colCat + col].
37  std::vector <std::string> svaluesX = RD.GetsvaluesX();
    // NOTE(review): totalCol is never read or written after this line.
38  gsl::index totalCol = 0;
39  std::vector<std::vector<std::string>> svalues;
    // svalues[col][row] will hold the string for that column/row pair.
    // NOTE(review): svaluesX.size() / colCat divides by zero when colCat == 0,
    // i.e. when RD reports no categorical columns.
40  svalues = std::vector < std::vector <
41  std::string >> (colCat, std::vector<std::string>(svaluesX.size() / colCat, ""));
    // Scratch matrix holding the one-hot block of the column currently processed.
42  CosanMatrix <NumericType> OneHotMatrixi;
43  for (gsl::index col = 0; col < colCat; col++) {
    // Distinct strings of this column mapped to ordinals 0, 1, 2, ... in order of
    // first appearance (the ordinal is the map size at insertion time).
44  std::unordered_map <std::string, gsl::index> categoryToOrdinal{};
45  for (gsl::index row = 0; row < svaluesX.size() / colCat; row++) {
46  svalues[col][row] = svaluesX[row * colCat + col];
47  if (categoryToOrdinal.find(svalues[col][row]) == categoryToOrdinal.end()) {
48  categoryToOrdinal.insert({svalues[col][row], categoryToOrdinal.size()});
49  }
50  }
    // NOTE(review): categories is only appended to in this function, never cleared,
    // so running this body a second time adds further maps after the existing ones.
51  this->categories.push_back(categoryToOrdinal);
    // One row per data row, one column per distinct category; start from all zeros.
52  OneHotMatrixi = CosanMatrix<NumericType>::Zero(svaluesX.size() / colCat, categoryToOrdinal.size());
53  for (gsl::index row = 0; row < svaluesX.size() / colCat; row++) {
    // Mark the column that corresponds to this row's category.
54  OneHotMatrixi(row, categoryToOrdinal[svalues[col][row]]) = 1;
55  }
    // The first categorical column replaces CatMatrix outright; every later column's
    // block is appended on the right, growing CatMatrix by one column per iteration.
56  if (col == 0) {
57  CatMatrix = OneHotMatrixi;
58  } else {
59  for (gsl::index i = 0; i < OneHotMatrixi.cols(); i++) {
60  CatMatrix.conservativeResize(CatMatrix.rows(), CatMatrix.cols() + 1);
61  CatMatrix.col(CatMatrix.cols() - 1) = OneHotMatrixi.col(i);
62  }
63  }
64 
65  }
66  fmt::print("Finish encoding categorical data! Get access to the newly-generated additional matrix via .GetCatMatrix()\n");
67  fmt::print("*********************************\n");
68  }
69 // virtual std::vector<int> getEncoding(int colIdx, const std::string &category) = 0;
    // Returns a copy of the per-column category-to-ordinal maps stored in categories.
71  std::vector <std::unordered_map<std::string, gsl::index>> getCategories() const {
72  return this->categories;}
73  private:
74 
    // One map per categorical column, in the order the columns were processed.
75  std::vector<std::unordered_map<std::string, gsl::index>> categories;
77  };
78 }
79 
80 
81 #endif //COSAN_ENCODER_H
Cosan
Definition: CosanBO.h:29
Cosan::Encoder
Definition: encoder.h:9
Cosan::Encoder::getCategories
std::vector< std::unordered_map< std::string, gsl::index > > getCategories() const
Definition: encoder.h:71
NumericType
double NumericType
Definition: onehotencodingTest.cpp:20
Cosan::Encoder::Encoder
Encoder()=delete
Cosan::CosanRawData::GetcolsX
gsl::index GetcolsX()
Get the number of columns for X.
Definition: CosanData.h:270
Cosan::CosanRawData::GetsvaluesX
std::vector< std::string > GetsvaluesX() const
Get the vector of categorical data from X, stored in row-major order (row first).
Definition: CosanData.h:287
Cosan::Encoder::CatMatrix
CosanMatrix< NumericType > CatMatrix
Definition: encoder.h:76
Cosan::CosanMatrix
Eigen::Matrix< NumericType, Eigen::Dynamic, Eigen::Dynamic > CosanMatrix
Definition: CosanBO.h:37
Cosan::Encoder::GetCatMatrix
CosanMatrix< NumericType > & GetCatMatrix()
Definition: encoder.h:70
Cosan::CosanRawData::GetcolCatX
std::set< gsl::index > GetcolCatX() const
Get the column indices (in the original X of the csv file) of the columns that are of categorical type.
Definition: CosanData.h:237
Cosan::CosanRawData
Raw Data container.
Definition: CosanData.h:36
Cosan::Encoder::Encoder
Encoder(CosanRawData< NumericType > &RD, bool add_back=false)
Definition: encoder.h:25
Cosan::Encoder::fit
void fit(CosanRawData< NumericType > &RD)
Definition: encoder.h:32
Cosan::Preprocessor
Definition: preprocessor.h:14
Cosan::Encoder::categories
std::vector< std::unordered_map< std::string, gsl::index > > categories
Definition: encoder.h:75
Cosan::CosanRawData::GetrowsX
gsl::index GetrowsX()
Get the number of rows for X.
Definition: CosanData.h:254
preprocessor.h
Cosan::CosanRawData::ConcatenateData
void ConcatenateData(const CosanMatrix< NumericType > &inputX)
Concatenate the CosanMatrix<NumericType> inputX onto X, adding it as new columns.
Definition: CosanData.h:94