Cosan  1.0
Data Analytics Library
missingvalues.h
Go to the documentation of this file.
1 //
2 // Created by Xinyu Zhang on 4/4/21.
3 //
4 
5 #ifndef COSAN_MISSINGVALUES_H
6 #define COSAN_MISSINGVALUES_H
7 
8 
10 
11 namespace Cosan{
12  template<Numeric NumericType>
13  class MissingValues: public Preprocessor<NumericType> {
14  public:
15  MissingValues()=delete;
17  this->fit(RD);
18  }
19 
20  private:
22  fmt::print("*********************************\n");
23  fmt::print("Begin cleaning up missing data!\n");
24  std::set<gsl::index> MissingIdxX;
25  for (auto & each: RD.GetIdxmissingX()){
26  MissingIdxX.insert(RD.GetRawToNumIdx()[each[1]]);
27  }
28  bool HasMissingIdxY = RD.GetIdxmissingY().size()==0 ? false :true;
30 
31  for (auto &i : MissingIdxX ){
32  gsl::index _numofMissing = X.col(i).array().isNaN().template cast<NumericType>().sum();
33  fmt::print("For X Column {:d} has {:d} missing value(s).\n",i,_numofMissing);
34  NumericType mean = X.col(i).unaryExpr([](NumericType v) { return std::isnan(v)? 0 : v; }).sum()/(X.rows()-_numofMissing);
35  fmt::print("Fill the missing/nan entry with the column mean={:f}.\n",mean);
36  X.col(i) = X.col(i).array().isNaN().select(0,X.col(i)).array()+mean*(X.col(i).array().isNaN().template cast<NumericType>());
37 
38  }
39  if (HasMissingIdxY==true){
40  gsl::index i = 0;
42  gsl::index _numofMissing = Y.col(i).array().isNaN().template cast<NumericType>().sum();
43  fmt::print("For Y has {:d} missing value(s).\n",_numofMissing);
44 
45  NumericType mean = Y.col(i).unaryExpr([](NumericType v) { return std::isnan(v)? 0 : v; }).sum()/(Y.rows()-_numofMissing);
46  fmt::print("Fill the missing/nan entry with the column mean={:f}.\n",mean);
47  Y.col(i) = Y.col(i).array().isNaN().select(0,Y.col(i)).array()+mean*(Y.col(i).array().isNaN().template cast<NumericType>());
48  RD.UpdateData(X,Y);
49  }
50  else{
51  RD.UpdateData(X);
52  }
53  fmt::print("End of cleaning process\n");
54  fmt::print("*********************************\n");
55  }
56 
57  };
58 
59 
60 
61 
62 
63 
64 }
65 
66 
67 #endif //COSAN_MISSINGVALUES_H
Cosan::CosanRawData::GetIdxmissingY
std::vector< std::vector< gsl::index > > GetIdxmissingY() const
Get the positions of the missing values in the original data Y.
Definition: CosanData.h:232
Cosan
Definition: CosanBO.h:29
Cosan::CosanRawData::GetInput
CosanMatrix< NumericType > GetInput()
Get a copy of CosanMatrix<NumericType> X.
Definition: CosanData.h:141
Cosan::CosanRawData::GetTarget
CosanMatrix< NumericType > GetTarget()
Get a copy of CosanMatrix<NumericType> Y.
Definition: CosanData.h:147
NumericType
double NumericType
Definition: onehotencodingTest.cpp:20
Cosan::CosanRawData::UpdateData
void UpdateData(const CosanMatrix< NumericType > &inputX)
Update X using CosanMatrix<NumericType> input X.
Definition: CosanData.h:108
Cosan::CosanMatrix
Eigen::Matrix< NumericType, Eigen::Dynamic, Eigen::Dynamic > CosanMatrix
Definition: CosanBO.h:37
Cosan::CosanRawData::GetIdxmissingX
std::vector< std::vector< gsl::index > > GetIdxmissingX() const
Get the positions of the missing values in the original data X.
Definition: CosanData.h:217
Cosan::MissingValues::fit
void fit(CosanRawData< NumericType > &RD)
Definition: missingvalues.h:21
Cosan::CosanRawData
Raw Data container.
Definition: CosanData.h:36
Cosan::MissingValues::MissingValues
MissingValues()=delete
Cosan::MissingValues
Definition: missingvalues.h:13
Cosan::Preprocessor
Definition: preprocessor.h:14
Cosan::MissingValues::MissingValues
MissingValues(CosanRawData< NumericType > &RD)
Definition: missingvalues.h:16
preprocessor.h
Cosan::CosanRawData::GetRawToNumIdx
std::unordered_map< gsl::index, gsl::index > & GetRawToNumIdx()
Raw data column index to numeric data matrix X column index.
Definition: CosanData.h:197