You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
graphengine/ge/omm/csa_interact.h

184 lines
5.1 KiB

/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef GE_OMM_CSA_INTERACT_H_
#define GE_OMM_CSA_INTERACT_H_
#include <cstdint>
#include <string>
#include "framework/common/ge_inner_error_codes.h"
namespace ge {
///
/// @brief Overall state of a job.
///
/// Enumerator values are consecutive integers starting at 1; they are spelled
/// out explicitly so the numeric value of each state is visible at a glance.
///
enum JobState {
  JOBSTATE_WAITING = 1,
  JOBSTATE_RUNNING = 2,
  JOBSTATE_KILLING = 3,
  JOBSTATE_SUCCEED = 4,
  JOBSTATE_FAILED = 5,
  JOBSTATE_KILLED = 6,
  // NOTE(review): spelled "UNKOWN" (presumably "unknown"); the identifier is
  // part of the public interface, so the spelling is left untouched.
  JOBSTATE_UNKOWN = 7
};
///
/// @brief Detailed (sub) state of a job.
///
/// Enumerator values are consecutive integers starting at 201; they are
/// spelled out explicitly so the numeric value of each sub-state is visible.
///
enum JobSubState {
  JOBSUBSTATE_ENV_INIT = 201,
  JOBSUBSTATE_ENV_FIN = 202,
  // NOTE(review): spelled "RESOUCE" (presumably "resource"); the identifier is
  // part of the public interface, so the spelling is left untouched.
  JOBSUBSTATE_RESOUCE_ALLOC = 203,
  JOBSUBSTATE_MODEL_COMPILE = 204,
  JOBSUBSTATE_GRAPH_PREPARE = 205,
  JOBSUBSTATE_GRAPH_SPLIT = 206,
  JOBSUBSTATE_GRAPH_OPTIMIZE = 207,
  JOBSUBSTATE_GRAPH_BUILD = 208,
  JOBSUBSTATE_GRAPH_LOAD = 209,
  JOBSUBSTATE_GRAPH_EXEC = 210,
  JOBSUBSTATE_GRAPH_UNLOAD = 211,
  JOBSUBSTATE_OTHER = 212
};
///
/// @brief Module an error code is attributed to.
///
/// The values are written in hexadecimal and are not contiguous
/// (note that 0x12 is 18 in decimal).
///
enum ErrorModule {
  ERROR_MODULE_DRIVER  = 0x01,
  ERROR_MODULE_RUNTIME = 0x04,
  ERROR_MODULE_CCE     = 0x06,
  ERROR_MODULE_FMK     = 0x08,
  ERROR_MODULE_HCCL    = 0x12
};
///
/// @brief Error record: a sub-module return code together with the module
///        and the job sub-state it is associated with.
///
/// A default-constructed record holds return code 0, ERROR_MODULE_FMK and
/// JOBSUBSTATE_OTHER.
///
/// No constructor or destructor is declared: the defaults come from the
/// in-class initializers, and leaving the special members implicit keeps the
/// type trivially copyable and trivially destructible.
///
struct CsaErrorCode {
  // return code reported by the sub module
  uint32_t module_ret_errcode = 0;
  // module the error is attributed to
  ErrorModule error_module = ERROR_MODULE_FMK;
  // job sub-state recorded together with the error
  JobSubState job_sub_state = JOBSUBSTATE_OTHER;
};
///
/// @brief Maintains the job state file and the network connectivity detect
///        file, and keeps track of error codes for the current job.
///
/// The constructor and destructor are private and copy/move operations are
/// deleted, so outside code reaches the object through GetInstance() only.
///
class CsaInteract {
 public:
  ///
  /// @brief Get the CsaInteract instance
  /// @return reference to the CsaInteract instance
  ///
  static CsaInteract& GetInstance();

  ///
  /// @brief Initialize the CsaInteract instance
  /// @param [in] dev_index device index
  /// @param [in] job_id job id
  /// @return void
  ///
  void Init(int32_t dev_index, int64_t job_id);

  ///
  /// @brief Update the job state file
  /// @param [in] job_state job state
  /// @param [in] job_sub_state detailed job state (default: JOBSUBSTATE_OTHER)
  /// @param [in] module_ret_errcode sub module training failure error code (default: SUCCESS)
  /// @param [in] error_module error module identified by FMK (default: ERROR_MODULE_FMK)
  /// @return Status
  ///
  Status WriteJobState(JobState job_state,
                       JobSubState job_sub_state = JOBSUBSTATE_OTHER,
                       uint32_t module_ret_errcode = SUCCESS,
                       ErrorModule error_module = ERROR_MODULE_FMK);

  ///
  /// @brief Update the error code in the job state file
  /// @param [in] module_ret_errcode sub module training failure error code
  /// @param [in] error_module error module identified by FMK
  /// @param [in] job_sub_state detailed job state
  /// @return void
  ///
  void WriteErrorCode(uint32_t module_ret_errcode, ErrorModule error_module,
                      JobSubState job_sub_state);

  ///
  /// @brief Record errors that occurred during the training
  /// @param [in] module_ret_errcode sub module training failure error code
  /// @param [in] error_module error module identified by FMK
  /// @param [in] job_sub_state detailed job state
  /// @return void
  ///
  void StoreInternalErrorCode(uint32_t module_ret_errcode,
                              ErrorModule error_module,
                              JobSubState job_sub_state);

  ///
  /// @brief Update the training error code in the job state file
  /// @return void
  ///
  void WriteInternalErrorCode();

  ///
  /// @brief Update the network connectivity detect file
  /// @param [in] content network connectivity content
  /// @return Status
  ///
  Status WriteHcomDetection(const std::string& content);

 private:
  // Initial member values are supplied by the in-class initializers below.
  CsaInteract() = default;
  ~CsaInteract() = default;

  // Neither copyable nor movable.
  CsaInteract(const CsaInteract&) = delete;
  CsaInteract(CsaInteract&&) = delete;
  CsaInteract& operator=(const CsaInteract&) = delete;
  CsaInteract& operator=(CsaInteract&&) = delete;

  ///
  /// @ingroup WriteFile
  /// @brief Write the content into the file; the file is created if it does not exist
  /// @param [in] file_name: name of the file to write
  /// @param [in] content: content to write
  /// @return Status
  ///
  Status WriteFile(const std::string& file_name, const std::string& content);

  ///
  /// @ingroup MakePath
  /// @brief Check whether the file path exists; if not, create the folder recursively
  /// @param [in] file_name: file name to be verified
  /// @return Status
  ///
  Status MakePath(const std::string& file_name);

  // device index
  int32_t dev_index_ = 0;
  // job id
  int64_t job_id_ = 0;
  // whether initialization is complete
  bool is_init_ = false;
  // current job state
  JobState curr_state_ = JOBSTATE_UNKOWN;
  // job state file
  std::string job_state_file_;
  // network connectivity detect file
  std::string hcom_detect_file_;
  // whether an internal error occurred during the training
  bool is_have_internal_error_ = false;
  // error code information
  CsaErrorCode csa_error_code_;
};
} // namespace ge
#endif // GE_OMM_CSA_INTERACT_H_