!6515 Resolve deadlock issue when terminating the debugger from the UI

Merge pull request !6515 from lichen_101010/terminate_deadlock_issue
pull/6515/MERGE
mindspore-ci-bot 4 years ago committed by Gitee
commit d30dece8a9

@ -102,7 +102,8 @@ void Executor::CheckException() {
}
void Executor::WorkerJoin() {
if (worker_->joinable()) {
// Avoid worker thread join itself which will cause deadlock
if (worker_->joinable() && worker_->get_id() != std::this_thread::get_id()) {
{
std::unique_lock<std::mutex> lock(task_mutex_);
auto task = std::make_shared<ExitTask>();

@ -444,6 +444,8 @@ void Debugger::CommandLoop() {
case DebuggerCommand::kExitCMD:
MS_LOG(INFO) << "ExitCMD";
Exit();
// Used for debugger termination
run = true;
break;
case DebuggerCommand::kRunCMD:
MS_LOG(INFO) << "RunCMD";
@ -594,8 +596,18 @@ std::list<TensorProto> Debugger::LoadTensors(const ProtoVector<TensorProto> &ten
// Terminates debugging according to run_level_: either ends the process here or asks the
// main thread to do so via pipeline::ExecutorPy::DebugTerminate(true).
// NOTE(review): this listing looks like a rendered diff with old and new lines interleaved; the
// two statements right after the first comment appear to be the pre-change body. If they are
// really present together with the code below, std::exit makes the run_level_ dispatch
// unreachable -- confirm against the actual file.
void Debugger::Exit() {
// clear resource before exit
pipeline::ClearResAtexit();
std::exit(EXIT_FAILURE);
// For node level, the debugger has to exit by itself because the main thread can only exit at a step boundary;
// for step level, the debugger notifies the main thread to exit.
if (run_level_ == "node") {
pipeline::ClearResAtexit();
exit(1);
} else if (run_level_ == "step") {
// Notify main thread to terminate (no exit is performed in this branch)
pipeline::ExecutorPy::DebugTerminate(true);
} else {
// Any other run level takes the same clear-and-exit path as "node"
pipeline::ClearResAtexit();
exit(1);
}
}
std::list<WatchpointHit> Debugger::CheckWatchpoints(const std::string &watchnode) {

@ -76,6 +76,7 @@ const char IR_TYPE_MINDIR[] = "mind_ir";
ExecutorPyPtr ExecutorPy::executor_ = nullptr;
std::mutex ExecutorPy::instance_lock_;
bool ExecutorPy::debugger_terminate_ = false;
std::unordered_map<abstract::AbstractBasePtrList, int, abstract::AbstractBasePtrListHasher,
abstract::AbstractBasePtrListEqual>
@ -748,7 +749,17 @@ void ExecutorPy::ProcessVmArg(const py::tuple &args, const std::string &phase, V
ProcessVmArgInner(args, GetResource(phase), arg_list);
}
// Ends the process once termination has been requested through debugger_terminate_.
// While the flag is false this is a no-op. Otherwise it logs the event, releases resources
// with ClearResAtexit() and calls exit(1), so it does not return in that case.
void ExecutorPy::TerminateDebugger() {
  if (!debugger_terminate_) {
    return;
  }
  MS_LOG(INFO) << "Terminate debugger and clear resources!";
  ClearResAtexit();
  exit(1);
}
py::object ExecutorPy::Run(const py::tuple &args, const py::object &phase) {
// The MindSpore debugger notifies the main thread to exit after one step, so the next step is not run
TerminateDebugger();
std::size_t size = args.size();
if (!py::isinstance<py::str>(phase)) {
MS_LOG(EXCEPTION) << "Run failed, phase input is not a str";

@ -97,6 +97,9 @@ class ExecutorPy : public std::enable_shared_from_this<ExecutorPy> {
void DelNetRes(const std::string &id);
void ReleaseResource(const py::object &phase);
static void ClearRes();
// Returns the current value of the static debugger_terminate_ flag.
static bool GetDebugTerminate() { return debugger_terminate_; }
// Sets the static debugger_terminate_ flag to val.
// NOTE(review): the flag is a plain bool and no synchronization is visible here; if it is
// written and read from different threads, std::atomic<bool> may be needed -- confirm.
static void DebugTerminate(bool val) { debugger_terminate_ = val; }
void TerminateDebugger();
std::map<std::string, std::pair<PrimitivePyPtr, std::string>> FetchInfoForQuantExport(const std::string &phase_s);
@ -111,6 +114,7 @@ class ExecutorPy : public std::enable_shared_from_this<ExecutorPy> {
std::map<std::string, ExecutorInfoPtr> info_;
static std::shared_ptr<ExecutorPy> executor_;
static std::mutex instance_lock_;
static bool debugger_terminate_;
};
using ExecutorPyPtr = std::shared_ptr<ExecutorPy>;
@ -125,7 +129,6 @@ void InitHccl();
void FinalizeHccl();
void InitBackend();
void FinalizeBackend();
void ClearResAtexit();
void ReleaseGeTsd();

Loading…
Cancel
Save