1 #ifndef __COSANData_H_INCLUDED__
2 #define __COSANData_H_INCLUDED__
9 #include <unordered_map>
35 template<Numeric NumericType>
47 if (std::is_same_v<NumericType, bool>){
48 throw std::invalid_argument(
49 "We do not accept bool at this moment. Try unsigned int, unsigned long, unsigned long long, int, "
50 "long, long, float, double ,long double.");
61 if (std::is_same_v<NumericType, bool>){
62 throw std::invalid_argument(
63 "We do not accept bool at this moment. Try unsigned int, unsigned long, unsigned long long, int, "
64 "long, long, float, double ,long double.");
96 throw std::invalid_argument(fmt::format(
"To concatenate, the number of rows from inputX should match with original X. Current nrow of X is {:}",
GetrowsX() ));
98 for (gsl::index i = 0 ;i<inputX.cols();i++){
99 X.conservativeResize(
X.rows(),
X.cols()+1);
100 X.col(
X.cols() - 1) = inputX.col(i);
125 void UpdateCat(
const std::vector<std::string> & inputX){
133 void UpdateCat(
const std::vector<std::string> & inputX,
const std::vector<std::string> & inputY){
167 return {
X.array().isNaN().template cast<NumericType>().sum(),
Y.array().isNaN().template cast<NumericType>().sum()};
176 virtual const std::string
GetName()
const {
return "Raw Data Object.";}
359 std::vector<std::vector<gsl::index>>& Idxminf,std::vector<std::vector<gsl::index>>& Idxmissing,
360 std::vector<std::string> &svalues, std::set<gsl::index> & colCat) {
362 std::ifstream indata;
365 std::vector<NumericType> values;
367 gsl::index rows = 0,cols = 0,col_idx=0;
373 std::string SummaryMessage;
384 std::getline(indata, line);
385 std::stringstream lineStream(line);
387 while(getline(lineStream, cell,
',')) {
389 values.push_back(StringToNum<NumericType>(std::string(
"nan")));
390 Idxmissing.push_back(std::vector<gsl::index>({rows,col_idx}));
392 cols=std::max(cols,col_idx);
396 result = StringToNum<NumericType>(cell, &pos);
398 svalues.push_back(cell);
399 colCat.insert(col_idx);
401 cols=std::max(cols,col_idx);
404 if (pos!=cell.size()){
405 throw std::invalid_argument(
406 "Incorrect numeric format! Abort the program. The entry reads "+cell+
407 " and the position is ("+ std::to_string(rows)+
","+ std::to_string(col_idx)+
")");
409 values.push_back(result);
410 if (isinf(values.back())){
411 if (values.back()==std::numeric_limits<NumericType>::infinity()){
412 Idxpinf.push_back(std::vector<gsl::index>({rows,col_idx}));}
413 else {Idxminf.push_back(std::vector<gsl::index>({rows,col_idx}));}
415 else if (isnan(values.back())){
416 Idxmissing.push_back(std::vector<gsl::index>({rows,col_idx}));
419 cols=std::max(cols,col_idx);
425 while (std::getline(indata, line)) {
431 while(getline(lineStream, cell,
',')) {
433 if (colCat.find(col_idx)==colCat.end()){
434 values.push_back(StringToNum<NumericType>(std::string(
"nan")));
437 svalues.push_back(
"");
439 Idxmissing.push_back(std::vector<gsl::index>({rows,col_idx}));
444 result = StringToNum<NumericType>(cell, &pos);
446 if (colCat.find(col_idx)!=colCat.end())
448 svalues.push_back(cell);
449 colCat.insert(col_idx);
453 throw std::invalid_argument(
454 "Incorrect value type! Should be numeric but non-numeric input. The entry reads "+cell+
455 " and the position is ("+ std::to_string(rows)+
","+ std::to_string(col_idx)+
")");
458 if (pos!=cell.size()){
459 throw std::invalid_argument(
460 "Incorrect numeric format! Abort the program. The entry reads "+cell+
461 " and the position is ("+ std::to_string(rows)+
","+ std::to_string(col_idx)+
")");
463 values.push_back(result);
464 if (isinf(values.back())){
465 if (values.back()==std::numeric_limits<NumericType>::infinity()){
466 Idxpinf.push_back(std::vector<gsl::index>({rows,col_idx}));}
467 else {Idxminf.push_back(std::vector<gsl::index>({rows,col_idx}));}
469 else if (isnan(values.back())){
470 Idxmissing.push_back(std::vector<gsl::index>({rows,col_idx}));
475 std::cout<<cols<<
" "<<col_idx<<std::endl;
476 throw std::invalid_argument(
"Not all rows has same number of entry! First row has "+std::to_string(cols)+
" columns but row "+std::to_string(rows)+
" has "+std::to_string(col_idx)+
" columns!" );
481 X.resize(rows,values.size()/rows);
482 gsl::index i =0,__cols = values.size()/rows;
483 for (
auto &each :values ){
484 X(i/__cols,i%__cols) = each;
490 SummaryMessage+=
"Number of rows: "+std::to_string(rows)+
"\n";
491 SummaryMessage+=
"Number of columns: "+std::to_string(cols)+
"\n";
492 SummaryMessage+=
"Number of positive infinity values: "+std::to_string(Idxpinf.size())+
". They are at " ;
493 for(
auto each :Idxpinf){
494 SummaryMessage+=
"("+std::to_string(each[0])+
","+std::to_string(each[1])+
") ";
496 SummaryMessage+=
"\n";
497 SummaryMessage+=
"Number of negative infinity values: "+std::to_string(Idxminf.size())+
". They are at ";
498 for(
auto each :Idxminf){
499 SummaryMessage+=
"("+std::to_string(each[0])+
","+std::to_string(each[1])+
")"+
" ";
501 SummaryMessage+=
"\n";
502 SummaryMessage+=
"Number of missing values: "+std::to_string(Idxmissing.size())+
". They are at ";
503 for(
auto each :Idxmissing){
504 SummaryMessage+=
"("+std::to_string(each[0])+
","+std::to_string(each[1])+
")"+
" ";
506 SummaryMessage+=
"\n";
508 SummaryMessage+=
"Columns of categorical values: Column ";
509 for (
auto idx:colCat) {
510 SummaryMessage+=std::to_string(idx)+
" ";}
511 SummaryMessage+=
"\n";
513 for (gsl::index i = 0;i<cols;i++){
514 if (colCat.find(i)==colCat.end()){
519 for (gsl::index i = 0;i<cols;i++){
520 if(colCat.find(i)!=colCat.end()){
527 return {rows,cols,SummaryMessage};
545 template<Numeric NumericType>
553 this->
X.resize(nrows,ncols);
554 std::default_random_engine generator;
555 std::uniform_real_distribution<double> distribution(lb,ub);
556 for (gsl::index i = 0;i<nrows*ncols;i++){
557 this->
X(i/ncols,i%ncols) =distribution(generator);
564 static_assert(std::is_arithmetic<NumericType>::value,
"NumericType must be numeric");
571 static_assert(std::is_arithmetic<NumericType>::value,
"NumericType must be numeric");
578 if (nrows>inputX.size() || inputX.size()%nrows!=0){
579 throw std::invalid_argument(
580 fmt::format(
"Incorrect nrows specification, should be less than or equal to input vector size and size is divisible by nrows. Input vector size is "
581 "{:} and nrows is {:}",inputX.size(),nrows));
583 this->
X.resize(nrows,inputX.size()/nrows);
584 gsl::index i =0,__cols=inputX.size()/nrows;
585 if (order==
"columnfirst"){
587 for (
auto &each :inputX ){
588 this->
X(i%nrows,i/nrows) = each;
594 for (
auto &each :inputX ){
595 this->
X(i/__cols,i%__cols) = each;
605 CosanData(
const std::vector<NumericType>& inputX,
const std::vector<NumericType>& inputY,gsl::index nrows,
const std::string & order =
"rowfirst"):
CosanRawData<
NumericType>(){
606 if (nrows>inputX.size() || inputX.size()%nrows!=0 || nrows!=inputY.size()){
607 throw std::invalid_argument(
608 fmt::format(
"Incorrect nrows specification, should be less than or equal to input vector size and size is divisible by nrows. inputY size should also be equal to nrows."
609 "inputX vector size is {:}, inputY vector size is {:} and nrows is {:}",inputX.size(),inputY.size(),nrows));
611 this->
X.resize(nrows,inputX.size()/nrows);
612 this->
Y.resize(nrows,1);
613 gsl::index i =0,__cols=inputX.size()/nrows;
614 for (
auto &each:inputY){
618 if (order==
"columnfirst"){
620 for (
auto &each :inputX ){
621 this->
X(i%nrows,i/nrows) = each;
627 for (
auto &each :inputX ){
628 this->
X(i/__cols,i%__cols) = each;
637 virtual const std::string
GetName()
const {
return "Processed Data Object.";}