Long-running training tests: fail when the loss is NaN (#8169)

emailweixu-patch-1
helinwang 7 years ago committed by Abhinav Arora
parent 8e5bc804bb
commit be7fcc0bfc

@ -16,6 +16,8 @@ import paddle.v2 as paddle
import paddle.v2.fluid as fluid import paddle.v2.fluid as fluid
import contextlib import contextlib
import unittest import unittest
import math
import sys
def main(use_cuda): def main(use_cuda):
@ -58,6 +60,8 @@ def main(use_cuda):
print(avg_loss_value) print(avg_loss_value)
if avg_loss_value[0] < 10.0: if avg_loss_value[0] < 10.0:
return return
if math.isnan(float(avg_loss_value)):
sys.exit("got NaN loss, training failed.")
raise AssertionError("Fit a line cost is too large, {0:2.2}".format( raise AssertionError("Fit a line cost is too large, {0:2.2}".format(
avg_loss_value[0])) avg_loss_value[0]))

@ -17,6 +17,8 @@ from __future__ import print_function
import paddle.v2 as paddle import paddle.v2 as paddle
import paddle.v2.fluid as fluid import paddle.v2.fluid as fluid
import contextlib import contextlib
import math
import sys
import numpy import numpy
import unittest import unittest
@ -145,6 +147,8 @@ def train(net_type, use_cuda, save_dirname):
loss_t, acc_t = exe.run(program=test_program, loss_t, acc_t = exe.run(program=test_program,
feed=feeder.feed(test_data), feed=feeder.feed(test_data),
fetch_list=[avg_cost, acc]) fetch_list=[avg_cost, acc])
if math.isnan(float(loss_t)):
sys.exit("got NaN loss, training failed.")
acc_list.append(float(acc_t)) acc_list.append(float(acc_t))
avg_loss_list.append(float(loss_t)) avg_loss_list.append(float(loss_t))
break # Use 1 segment for speeding up CI break # Use 1 segment for speeding up CI

@ -18,6 +18,8 @@ import paddle.v2 as paddle
import sys import sys
import numpy import numpy
import unittest import unittest
import math
import sys
def parse_arg(): def parse_arg():
@ -148,6 +150,8 @@ def train(nn_type, use_cuda, parallel, save_dirname):
'PassID {0:1}, BatchID {1:04}, Test Loss {2:2.2}, Acc {3:2.2}'. 'PassID {0:1}, BatchID {1:04}, Test Loss {2:2.2}, Acc {3:2.2}'.
format(pass_id, batch_id + 1, format(pass_id, batch_id + 1,
float(avg_loss_val), float(acc_val))) float(avg_loss_val), float(acc_val)))
if math.isnan(float(avg_loss_val)):
sys.exit("got NaN loss, training failed.")
raise AssertionError("Loss of recognize digits is too large") raise AssertionError("Loss of recognize digits is too large")

@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import math
import sys
import numpy as np import numpy as np
import paddle.v2 as paddle import paddle.v2 as paddle
import paddle.v2.fluid.core as core import paddle.v2.fluid.core as core
@ -217,6 +219,8 @@ def main():
if out[0] < 6.0: if out[0] < 6.0:
# if avg cost less than 6.0, we think our code is good. # if avg cost less than 6.0, we think our code is good.
exit(0) exit(0)
if math.isnan(float(out[0])):
sys.exit("got NaN loss, training failed.")
main() main()

@ -16,6 +16,8 @@ import unittest
import paddle.v2.fluid as fluid import paddle.v2.fluid as fluid
import paddle.v2 as paddle import paddle.v2 as paddle
import contextlib import contextlib
import math
import sys
def convolution_net(data, label, input_dim, class_dim=2, emb_dim=32, def convolution_net(data, label, input_dim, class_dim=2, emb_dim=32,
@ -115,6 +117,8 @@ def main(word_dict, net_method, use_cuda):
print("cost=" + str(cost_val) + " acc=" + str(acc_val)) print("cost=" + str(cost_val) + " acc=" + str(acc_val))
if cost_val < 0.4 and acc_val > 0.8: if cost_val < 0.4 and acc_val > 0.8:
return return
if math.isnan(float(cost_val)):
sys.exit("got NaN loss, training failed.")
raise AssertionError("Cost is too large for {0}".format( raise AssertionError("Cost is too large for {0}".format(
net_method.__name__)) net_method.__name__))

@ -16,6 +16,8 @@ import paddle.v2 as paddle
import paddle.v2.fluid as fluid import paddle.v2.fluid as fluid
import unittest import unittest
import os import os
import math
import sys
def main(use_cuda, is_sparse, parallel): def main(use_cuda, is_sparse, parallel):
@ -112,6 +114,9 @@ def main(use_cuda, is_sparse, parallel):
fetch_list=[avg_cost]) fetch_list=[avg_cost])
if avg_cost_np[0] < 5.0: if avg_cost_np[0] < 5.0:
return return
if math.isnan(float(avg_cost_np[0])):
sys.exit("got NaN loss, training failed.")
raise AssertionError("Cost is too large {0:2.2}".format(avg_cost_np[0])) raise AssertionError("Cost is too large {0:2.2}".format(avg_cost_np[0]))

Loading…
Cancel
Save