diff --git a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/csv_op.cc b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/csv_op.cc index 8fbcc7205e..171734aa3f 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/csv_op.cc +++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/csv_op.cc @@ -142,6 +142,11 @@ int CsvOp::CsvParser::put_row(char c) { return ret; } + if (cur_col_ != column_default_.size()) { + err_message_ = "The number of columns does not match the definition."; + return -1; + } + total_rows_++; cur_row_++; cur_col_ = 0; @@ -159,8 +164,12 @@ int CsvOp::CsvParser::put_row(char c) { int CsvOp::CsvParser::end_file(char c) { if (cur_col_ > 0) { - put_row(c); + int ret = put_row(c); + if (ret < 0) { + return ret; + } } + if (cur_row_ > 0) { cur_buffer_->set_tensor_table(std::move(tensor_table_)); buffer_connector_->Add(worker_id_, std::move(cur_buffer_)); @@ -190,16 +199,16 @@ Status CsvOp::CsvParser::initCsvParser() { // State diagram for counting rows sdl = {// START_OF_FILE - // ┌───────────┬───────────┬─────────────┐ - // │ abc │ " │ \n │ - // ├───────────┼───────────┼─────────────┤ - // │ UNQUOTE │ QUOTE │ END_OF_LINE │ - // ├───────────┼───────────┼─────────────┤ - // | null_func │ null_func │ null_func │ - // └───────────┴───────────┴─────────────┘ + // ┌───────────┬───────────┬───────────────┐ + // │ abc │ " │ \n │ + // ├───────────┼───────────┼───────────────┤ + // │ UNQUOTE │ QUOTE │ START_OF_FILE │ + // ├───────────┼───────────┼───────────────┤ + // | null_func │ null_func │ null_func │ + // └───────────┴───────────┴───────────────┘ {{State::START_OF_FILE, Message::MS_NORMAL}, {State::UNQUOTE, &CsvParser::null_func}}, {{State::START_OF_FILE, Message::MS_QUOTE}, {State::QUOTE, &CsvParser::null_func}}, - {{State::START_OF_FILE, Message::MS_END_OF_LINE}, {State::END_OF_LINE, &CsvParser::null_func}}, + {{State::START_OF_FILE, Message::MS_END_OF_LINE}, {State::START_OF_FILE, &CsvParser::null_func}}, // 
UNQUOTE // ┌───────────┬───────────┬─────────────┐ @@ -254,7 +263,7 @@ Status CsvOp::CsvParser::initCsvParser() { // ┌───────────┬──────────┬──────────┬────────────────┬────────────────┐ // │ abc │ , │ " │ \n │ EOF │ // ├───────────┼──────────┼──────────┼────────────────┼────────────────┤ - // │ UNQUOTE │ DELIM │ QUOTE │ END_OF_LINE │ END_OF_FILE │ + // │ UNQUOTE │ DELIM │ QUOTE │ START_OF_FILE │ END_OF_FILE │ // ├───────────┼──────────┼──────────┼────────────────┼────────────────┤ // | lambda │ lambda │ lambda │ null_func │ null_func │ // └───────────┴──────────┴──────────┴────────────────┴────────────────┘ @@ -282,7 +291,7 @@ Status CsvOp::CsvParser::initCsvParser() { this->pos_ = 0; return 0; }}}, - {{State::START_OF_FILE, Message::MS_END_OF_LINE}, {State::END_OF_LINE, &CsvParser::null_func}}, + {{State::START_OF_FILE, Message::MS_END_OF_LINE}, {State::START_OF_FILE, &CsvParser::null_func}}, {{State::START_OF_FILE, Message::MS_END_OF_FILE}, {State::END_OF_FILE, &CsvParser::null_func}}, // UNQUOTE @@ -683,7 +692,7 @@ Status CsvOp::CalculateNumRowsPerShard() { } if (all_num_rows_ == 0) { RETURN_STATUS_UNEXPECTED( - "There is no valid data matching the dataset API CsvDataset. Please check file path or dataset API " + "There is no valid data matching the dataset API CsvDataset. 
Please check file path or CSV format " "validation first."); } @@ -756,6 +765,8 @@ Status CsvOp::ComputeColMap() { getline(handle, line); std::vector col_names = split(line, field_delim_); for (int32_t i = 0; i < col_names.size(); i++) { + // consider the case of CRLF + col_names[i].erase(col_names[i].find_last_not_of('\r') + 1); column_name_id_map_[col_names[i]] = i; } } else { diff --git a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/csv_op.h b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/csv_op.h index 1921b61bdc..93ed2a754e 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/csv_op.h +++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/csv_op.h @@ -77,7 +77,7 @@ class CsvOp : public ParallelOp { total_rows_(0), start_offset_(0), end_offset_(std::numeric_limits::max()), - err_message_("unkonw") { + err_message_("unknown") { cur_buffer_ = std::make_unique(0, DataBuffer::BufferFlags::kDeBFlagNone); initCsvParser(); } @@ -101,8 +101,9 @@ class CsvOp : public ParallelOp { if (it == sd.end()) { return -1; } + int ret = it->second.second(*this, static_cast(c)); cur_state_ = it->second.first; - return it->second.second(*this, c); + return ret; } int countRows(int c); @@ -169,7 +170,13 @@ class CsvOp : public ParallelOp { } int catch_exception(char c) { - MS_LOG(ERROR) << "Invalid syntax!"; + if (getMessage(c) == Message::MS_QUOTE && cur_state_ == State::UNQUOTE) { + err_message_ = "Invalid quote in unquote field."; + } else if (getMessage(c) == Message::MS_END_OF_FILE && cur_state_ == State::QUOTE) { + err_message_ = "Reach the end of file in quote field."; + } else if (getMessage(c) == Message::MS_NORMAL && cur_state_ == State::SECOND_QUOTE) { + err_message_ = "Receive unquote char in quote field."; + } return -1; } @@ -425,6 +432,8 @@ class CsvOp : public ParallelOp { Status ComputeColMap() override; // Split string based on a character delimiter + // @param s - the input string + // @param delim - the delimiter
// @return - a string vector std::vector split(const std::string &s, char delim);