You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
Paddle/python/paddle/fluid/tests/unittests/gradient_checker.py

394 lines
13 KiB

# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""This is the lib for gradient checker unittest."""
from __future__ import print_function
import unittest
import six
import collections
import numpy as np
from itertools import product
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid.executor import Executor
from paddle.fluid.backward import _append_grad_suffix_, _as_list
def _product(t):
if isinstance(t, int):
return t
else:
return np.product(t)
def dtype_to_np_dtype(dtype):
if dtype == core.VarDesc.VarType.FP32:
return np.float32
elif dtype == core.VarDesc.VarType.FP64:
return np.float64
elif dtype == core.VarDesc.VarType.FP16:
return np.float16
else:
raise ValueError("Not supported data type " + str(dtype))
def _get_item(t, i, np_dtype):
if np_dtype == np.float16:
np_t = np.array(t).astype(np.float16)
np_t = np_t.flatten()
return np_t[i]
elif np_dtype == np.float32:
return t._get_float_element(i)
elif np_dtype == np.float64:
return t._get_double_element(i)
else:
raise ValueError("Not supported data type " + str(np_dtype))
def _set_item(t, i, e, np_dtype):
if np_dtype == np.float16:
np_t = np.array(t).astype(np.float16)
shape = np_t.shape
np_t = np_t.flatten()
np_t[i] = e
np_t = np_t.reshape(shape).view(np.uint16)
t.set(np_t, place)
elif np_dtype == np.float32:
t._set_float_element(i, e)
elif np_dtype == np.float64:
t._set_double_element(i, e)
else:
raise ValueError("Not supported data type " + str(np_dtype))
def set_var_in_scope(scope, place, name, value, recursive_seq_len=None):
t = scope.var(name).get_tensor()
t.set(value, place)
if recursive_seq_len:
t.set_recursive_sequence_lengths(recursive_seq_len)
return t
def var_to_np_array_in_scope(scope, place, name):
return np.array(scope.var(name).get_tensor())
def make_jacobian(x, y_size, np_dtype):
if isinstance(x, fluid.framework.Variable):
return np.zeros((_product(x.shape), y_size), dtype=np_dtype)
elif isinstance(x, collections.Sequence):
jacobians = list(
filter(lambda t: t is not None, (make_jacobian(
item, y_size, np_dtype) for item in x)))
return jacobians
else:
None
def _compute_numerical_jacobian(program, x, y, place, scope, delta):
"""Computes the numeric Jacobian for dy/dx.
Computes the numeric Jacobian by slightly perturbing the inputs and
measuring the differences on the output.
Args:
program (Program): the network program.
x (Variable): the input variables.
y (list[Variable]): the output variables.
place (fluid.CPUPlace or fluid.CUDAPlace): the device.
scope (Scope): the scope used to run program.
delta: the amount of perturbation we give to the input
Returns:
A list of 2-D numpy array, the list length is len(y).
Each 2-D numpy array represents the Jacobian for dy_i/dx.
It has "x_size" rows and "y_size" columns
where "x_size" is the number of elements in x and
"y_size" is the number of elements in each y_i.
"""
if not isinstance(x, fluid.framework.Variable):
raise TypeError('x is not Variable')
# To compute the jacobian, treat x and y as one-dimensional vectors.
y = _as_list(y)
exe = fluid.Executor(place)
def run():
y_res = exe.run(program, scope=scope, fetch_list=y)
return [yi.flatten() for yi in y_res]
x_name = x.name
x_shape = x.shape
x_size = _product(x_shape)
x_t = scope.find_var(x_name).get_tensor()
np_type = dtype_to_np_dtype(x.dtype)
jacobian = [make_jacobian(x, _product(yi.shape), np_type) for yi in y]
for i in six.moves.xrange(x_size):
orig = _get_item(x_t, i, np_type)
x_pos = orig + delta
_set_item(x_t, i, x_pos, np_type)
y_pos = run()
x_neg = orig - delta
_set_item(x_t, i, x_neg, np_type)
y_neg = run()
_set_item(x_t, i, orig, np_type)
for j in six.moves.xrange(len(y)):
jacobian[j][i, :] = (y_pos[j] - y_neg[j]) / delta / 2.
return jacobian
def _compute_analytical_jacobian(program, x, y, place, scope):
"""Computes the analytical Jacobian for dy/dx.
Args:
program (Program): a Program with forward pass.
x (Variable|list[Variable]): a variable or list of variable
y (Variable): the target variable.
place (fluid.CPUPlace or fluid.CUDAPlace): the device.
scope (Scope): the scope used to run program.
Returns:
A list of 2-D numpy array. The list length is len(x).
Each 2-D numpy array represents the Jacobian for dy/dx_i.
It has "xi_size" rows and "dy_size" columns
where "x_size" is the number of elements in x_i and
"dy_size" is the number of elements in y.
"""
if not isinstance(y, fluid.framework.Variable):
raise TypeError('y is not Variable')
dy_name = _append_grad_suffix_(y.name)
np_type = dtype_to_np_dtype(y.dtype)
# create dy Variable in Program
dy = program.global_block().create_var(
name=dy_name, shape=y.shape, dtype=np_type, persistable=True)
# append backward
dx = fluid.gradients(y, x, dy)
# init dy tensor in scope
value = np.zeros(y.shape, dtype=np_type)
dy_t = set_var_in_scope(scope, place, dy_name, value)
exe = fluid.Executor(place)
y_size = _product(y.shape)
x = _as_list(x)
jacobian = make_jacobian(x, y_size, np_type)
# filter None in dx for DX/DY may be None in kernel
# only fetch not None dx in exe.run
filted = [(i, dxi) for i, dxi in enumerate(dx) if dxi is not None]
filted_idx, filted_dx = zip(*filted)
for i in six.moves.xrange(y_size):
_set_item(dy_t, i, 1, np_type)
dx_res = exe.run(program, scope=scope, fetch_list=filted_dx)
for j in six.moves.xrange(len(filted_dx)):
dx_idx = filted_idx[j]
if dx_res[j] is not None:
jacobian[dx_idx][:, i] = dx_res[j].flatten()
else:
jacobian[dx_idx][:, i] = np.zeros(
dx[dx_idx].shape, dtype=np_type).flatten()
_set_item(dy_t, i, 0, np_type)
return jacobian
def grad_check(x,
y,
x_init=None,
place=None,
program=None,
eps=1e-6,
atol=1e-5,
rtol=1e-3,
raise_exception=True):
"""
Check numerical and analytical gradients for dy/dx.
Each Jacobian gradients is a 2-D array with shape [xi_size, yi_size].
Args:
x (Variable|list[Variable]): input variables to the program.
y (Variable|list[Variable]): output variables to the program.
x_init (numpy.array|list[numpy.array]|None): the init value for input x.
place (fluid.CPUPlace or fluid.CUDAPlace): the device.
program (Program|None): a Program with forward pass.
If None, use fluid.default_main_program().
eps (float): perturbation for finite differences.
atol (float): absolute tolerance.
rtol (float): relative tolerance.
raise_exception (bool): whether to raise an exception if
the check fails. Default is True.
Returns:
True if all differences satisfy numpy.allclose condition.
"""
def fail_test(msg):
if raise_exception:
raise RuntimeError(msg)
return False
# check input arguments
x = _as_list(x)
y = _as_list(y)
for v in x:
v.stop_gradient = False
v.persistable = True
if place is None:
place = fluid.CPUPlace()
if program is None:
program = fluid.default_main_program()
# init variable in strtup program
scope = fluid.executor.global_scope()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
x_init = _as_list(x_init)
# init inputs if x_init is not None
if x_init:
if len(x_init) != len(x):
raise ValueError('len(x_init) (=%d) is not the same'
' as len(x) (= %d)' % (len(x_init), len(x)))
# init variable in main program
for var, arr in zip(x, x_init):
assert var.shape == arr.shape
feeds = {k.name: v for k, v in zip(x, x_init)}
exe.run(program, feed=feeds, scope=scope)
# [x_idx, y_idx]
numerical = [
_compute_numerical_jacobian(program, xi, y, place, scope, eps)
for xi in x
]
# [y_idx, x_idx]
analytical = []
for yi in y:
prog = program.clone()
clone_x = []
clone_y = None
for b in prog.blocks:
if b.has_var(yi.name):
clone_y = b.var(yi.name)
break
for xi in x:
for b in prog.blocks:
if b.has_var(xi.name):
clone_x.append(b.var(xi.name))
break
analytical.append(
_compute_analytical_jacobian(prog, clone_x, clone_y, place, scope))
for i, (x_idx,
y_idx) in enumerate(product(*[range(len(x)), range(len(y))])):
a = analytical[y_idx][x_idx]
n = numerical[x_idx][y_idx]
if not np.allclose(a, n, rtol, atol):
msg = 'Jacobian mismatch for output %s ' \
'with respect to input %s on %s,\n' \
'numerical:%s\nanalytical:%s\n' \
% (y[y_idx].name, x[x_idx].name, str(place), n, a)
return fail_test(msg)
return True
def double_grad_check(x,
y,
x_init=None,
y_grads=None,
place=None,
program=None,
eps=1e-6,
atol=1e-5,
rtol=1e-3,
raise_exception=True):
"""
Check gradients of gradients. This function will append backward to the
program before second order gradient check.
Args:
x (Variable|list[Variable]): input variables to the program.
y (Variable|list[Variable]): output variables to the program.
x_init (numpy.array|list[numpy.array]|None): the init value for input x.
y_grads (numpy.array|list[numpy.array]|None): the gradients with respect to y.
place (fluid.CPUPlace or fluid.CUDAPlace): the device.
program (Program|None): a Program with forward pass.
If None, use fluid.default_main_program().
eps (float): perturbation for finite differences.
atol (float): absolute tolerance.
rtol (float): relative tolerance.
raise_exception (bool): whether to raise an exception if
the check fails. Default is True.
Returns:
True if all differences satisfy numpy.allclose condition.
"""
# check input arguments
x = _as_list(x)
for v in x:
v.stop_gradient = False
v.persistable = True
y = _as_list(y)
if program is None:
program = fluid.default_main_program()
if y_grads is None:
scope = fluid.executor.global_scope()
y_grads = []
y_grads_init = []
for yi in y:
dyi_name = _append_grad_suffix_(yi.name)
np_type = dtype_to_np_dtype(yi.dtype)
dy = program.global_block().create_var(
name=dyi_name, shape=yi.shape, dtype=np_type, persistable=True)
dy.stop_gradient = False
v = np.random.random(size=yi.shape).astype(np_type)
set_var_in_scope(scope, place, dyi_name, v)
y_grads.append(dy)
y_grads_init.append(v)
else:
y_grads = _as_list(y_grads)
y_grads_init = [
var_to_np_array_in_scope(scope, place, v.name) for v in y_grads
]
# append first order grads
target_grads = fluid.gradients(y, x, y_grads)
# y_grads are the input of first-order backward,
# so, they are also the input of second-order backward.
x += y_grads
x_init = _as_list(x_init)
x_init += y_grads_init
grad_check(x, target_grads, x_init, place, program, eps, atol, rtol)