# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import six
import collections
import numpy as np
from itertools import product
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid.executor import Executor
from paddle.fluid.backward import calc_gradient
from paddle.fluid.backward import _append_grad_suffix_, _as_list


def _product(t):
    if isinstance(t, int):
        return t
    else:
        return np.prod(t)


def dtype_to_np_dtype(dtype):
    if dtype == core.VarDesc.VarType.FP32:
        return np.float32
    elif dtype == core.VarDesc.VarType.FP64:
        return np.float64
    elif dtype == core.VarDesc.VarType.FP16:
        return np.float16
    else:
        raise ValueError("Not supported data type " + str(dtype))


def _get_item(t, i, np_dtype):
    if np_dtype == np.float16:
        np_t = np.array(t).astype(np.float16)
        np_t = np_t.flatten()
        return np_t[i]
    elif np_dtype == np.float32:
        return t._get_float_element(i)
    elif np_dtype == np.float64:
        return t._get_double_element(i)
    else:
        raise ValueError("Not supported data type " + str(np_dtype))


def _set_item(t, i, e, np_dtype, place):
    if np_dtype == np.float16:
        # float16 has no direct element setter on the tensor, so round-trip
        # through numpy and store the raw uint16 bit pattern back on `place`.
        np_t = np.array(t).astype(np.float16)
        shape = np_t.shape
        np_t = np_t.flatten()
        np_t[i] = e
        np_t = np_t.reshape(shape).view(np.uint16)
        t.set(np_t, place)
    elif np_dtype == np.float32:
        t._set_float_element(i, e)
    elif np_dtype == np.float64:
        t._set_double_element(i, e)
    else:
        raise ValueError("Not supported data type " + str(np_dtype))


def set_var_in_scope(scope, place, name, value, recursive_seq_len=None):
    t = scope.var(name).get_tensor()
    t.set(value, place)
    if recursive_seq_len:
        t.set_recursive_sequence_lengths(recursive_seq_len)
    return t


def var_to_np_array_in_scope(scope, place, name):
    return np.array(scope.var(name).get_tensor())


def make_jacobian(x, y_size, np_dtype):
    if isinstance(x, fluid.framework.Variable):
        return np.zeros((_product(x.shape), y_size), dtype=np_dtype)
    elif isinstance(x, collections.Sequence):
        jacobians = list(
            filter(lambda t: t is not None, (make_jacobian(
                item, y_size, np_dtype) for item in x)))
        return jacobians
    else:
        return None


def _compute_numerical_jacobian(program, x, y, place, scope, delta):
    """Computes the numeric Jacobian for dy/dx.

    Computes the numeric Jacobian by slightly perturbing the input and
    measuring the differences on the output.

    Args:
        program (Program): the network program.
        x (Variable): the input variable.
        y (list[Variable]): the output variables.
        place (fluid.CPUPlace or fluid.CUDAPlace): the device.
        scope (Scope): the scope used to run program.
        delta: the amount of perturbation we give to the input.

    Returns:
        A list of 2-D numpy arrays; the list length is len(y).
        Each 2-D numpy array represents the Jacobian for dy_i/dx.
        It has "x_size" rows and "y_size" columns
        where "x_size" is the number of elements in x and
        "y_size" is the number of elements in each y_i.
"""
if not isinstance(x, fluid.framework.Variable):
raise TypeError('x is not Variable')
# To compute the jacobian, treat x and y as one-dimensional vectors.
y = _as_list(y)
exe = fluid.Executor(place)
def run():
y_res = exe.run(program, scope=scope, fetch_list=y)
return [yi.flatten() for yi in y_res]
x_name = x.name
x_shape = x.shape
x_size = _product(x_shape)
x_t = scope.find_var(x_name).get_tensor()
np_type = dtype_to_np_dtype(x.dtype)
jacobian = [make_jacobian(x, _product(yi.shape), np_type) for yi in y]
for i in six.moves.xrange(x_size):
orig = _get_item(x_t, i, np_type)
x_pos = orig + delta
_set_item(x_t, i, x_pos, np_type)
y_pos = run()
x_neg = orig - delta
_set_item(x_t, i, x_neg, np_type)
y_neg = run()
_set_item(x_t, i, orig, np_type)
for j in six.moves.xrange(len(y)):
jacobian[j][i, :] = (y_pos[j] - y_neg[j]) / delta / 2.
return jacobian


def _compute_analytical_jacobian(program, x, y, place, scope):
    """Computes the analytical Jacobian for dy/dx.

    Args:
        program (Program): a Program with forward pass.
        x (Variable|list[Variable]): a variable or list of variables.
        y (Variable): the target variable.
        place (fluid.CPUPlace or fluid.CUDAPlace): the device.
        scope (Scope): the scope used to run program.

    Returns:
        A list of 2-D numpy arrays. The list length is len(x).
        Each 2-D numpy array represents the Jacobian for dy/dx_i.
        It has "xi_size" rows and "dy_size" columns
        where "xi_size" is the number of elements in x_i and
        "dy_size" is the number of elements in y.
"""
if not isinstance(y, fluid.framework.Variable):
raise TypeError('y is not Variable')
dy_name = _append_grad_suffix_(y.name)
np_type = dtype_to_np_dtype(y.dtype)
# create dy Variable in Program
dy = program.global_block().create_var(
name=dy_name, shape=y.shape, dtype=np_type, persistable=True)
# append backward
dx = calc_gradient(y, x, dy)
# init dy tensor in scope
value = np.zeros(y.shape, dtype=np_type)
dy_t = set_var_in_scope(scope, place, dy_name, value)
exe = fluid.Executor(place)
y_size = _product(y.shape)
x = _as_list(x)
jacobian = make_jacobian(x, y_size, np_type)
for i in six.moves.xrange(y_size):
_set_item(dy_t, i, 1, np_type)
dx_res = exe.run(program, scope=scope, fetch_list=dx)
for j in six.moves.xrange(len(x)):
if dx_res[j] is not None:
jacobian[j][:, i] = dx_res[j].flatten()
else:
jacobian[j][:, i] = np.zeros(
dx[j].shape, dtype=np_type).flatten()
_set_item(dy_t, i, 0, np_type)
return jacobian


def grad_check(x,
               y,
               x_init=None,
               place=None,
               program=None,
               eps=1e-6,
               atol=1e-5,
               rtol=1e-3,
               raise_exception=True):
    """
    Check numerical and analytical gradients for dy/dx.
    Each Jacobian is a 2-D array with shape [xi_size, yj_size].

    Args:
        x (Variable|list[Variable]): input variables to the program.
        y (Variable|list[Variable]): output variables to the program.
        x_init (numpy.array|list[numpy.array]|None): the init value for input x.
        place (fluid.CPUPlace or fluid.CUDAPlace): the device.
        program (Program|None): a Program with forward pass.
            If None, use fluid.default_main_program().
        eps (float): perturbation for finite differences.
        atol (float): absolute tolerance.
        rtol (float): relative tolerance.
        raise_exception (bool): whether to raise an exception if
            the check fails. Default is True.

    Returns:
        True if all differences satisfy numpy.allclose condition.
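
    Examples:
        A minimal sketch of intended usage (illustrative only; the layer
        and variable names below are hypothetical, not part of this module):

        .. code-block:: python

            import numpy as np
            import paddle.fluid as fluid
            import paddle.fluid.layers as layers
            from paddle.fluid.tests.unittests.gradient_checker import grad_check

            prog = fluid.Program()
            with fluid.program_guard(prog):
                # a single float64 input and a scalar output
                x = layers.data('x', [2, 3], append_batch_size=False, dtype='float64')
                x.persistable = True
                y = layers.reduce_sum(x)
            x_arr = np.random.random([2, 3]).astype('float64')
            grad_check([x], y, x_init=[x_arr], place=fluid.CPUPlace(), program=prog)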
"""
def fail_test(msg):
if raise_exception:
raise RuntimeError(msg)
return False
# check input arguments
x = _as_list(x)
y = _as_list(y)
for v in x:
v.stop_gradient = False
v.persistable = True
if place is None:
place = fluid.CPUPlace()
if program is None:
program = fluid.default_main_program()
# init variable in strtup program
scope = fluid.executor.global_scope()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
x_init = _as_list(x_init)
# init inputs if x_init is not None
if x_init:
if len(x_init) != len(x):
raise ValueError('len(x_init) (=%d) is not the same'
' as len(x) (= %d)' % (len(x_init), len(x)))
# init variable in main program
for var, arr in zip(x, x_init):
assert var.shape == arr.shape
feeds = {k.name: v for k, v in zip(x, x_init)}
exe.run(program, feed=feeds, scope=scope)
# [x_idx, y_idx]
numerical = [
_compute_numerical_jacobian(program, xi, y, place, scope, eps)
for xi in x
]
# [y_idx, x_idx]
analytical = []
for yi in y:
prog = program.clone()
clone_x = []
clone_y = None
for b in prog.blocks:
if b.has_var(yi.name):
clone_y = b.var(yi.name)
break
for xi in x:
for b in prog.blocks:
if b.has_var(xi.name):
clone_x.append(b.var(xi.name))
break
analytical.append(
_compute_analytical_jacobian(prog, clone_x, clone_y, place, scope))
for i, (x_idx,
y_idx) in enumerate(product(*[range(len(x)), range(len(y))])):
a = analytical[y_idx][x_idx]
n = numerical[x_idx][y_idx]
if not np.allclose(a, n, rtol, atol):
msg = 'Jacobian mismatch for output %s ' \
'with respect to input %s on %s,\n' \
'numerical:%s\nanalytical:%s\n' \
% (y[y_idx].name, x[x_idx].name, str(place), n, a)
return fail_test(msg)
return True


def double_grad_check(x,
                      y,
                      x_init=None,
                      y_grads=None,
                      place=None,
                      program=None,
                      eps=1e-6,
                      atol=1e-5,
                      rtol=1e-3,
                      raise_exception=True):
    """
    Check gradients of gradients. This function appends backward ops to the
    program before running the second order gradient check.

    Args:
        x (Variable|list[Variable]): input variables to the program.
        y (Variable|list[Variable]): output variables to the program.
        x_init (numpy.array|list[numpy.array]|None): the init value for input x.
        y_grads (Variable|list[Variable]|None): the gradient variables
            with respect to y. If None, random gradients are created and
            initialized automatically.
        place (fluid.CPUPlace or fluid.CUDAPlace): the device.
        program (Program|None): a Program with forward pass.
            If None, use fluid.default_main_program().
        eps (float): perturbation for finite differences.
        atol (float): absolute tolerance.
        rtol (float): relative tolerance.
        raise_exception (bool): whether to raise an exception if
            the check fails. Default is True.

    Returns:
        True if all differences satisfy numpy.allclose condition.
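
    Examples:
        A minimal sketch of intended usage (illustrative only; the layer
        and variable names below are hypothetical, not part of this module):

        .. code-block:: python

            import numpy as np
            import paddle.fluid as fluid
            import paddle.fluid.layers as layers
            from paddle.fluid.tests.unittests.gradient_checker import double_grad_check

            prog = fluid.Program()
            with fluid.program_guard(prog):
                x = layers.data('x', [2, 3], append_batch_size=False, dtype='float64')
                x.persistable = True
                # any op with a registered second-order gradient works here
                y = layers.sigmoid(x)
            x_arr = np.random.uniform(-1, 1, [2, 3]).astype('float64')
            double_grad_check([x], y, x_init=[x_arr], place=fluid.CPUPlace(), program=prog)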
"""
# check input arguments
x = _as_list(x)
for v in x:
v.stop_gradient = False
v.persistable = True
y = _as_list(y)
if program is None:
program = fluid.default_main_program()
if y_grads is None:
scope = fluid.executor.global_scope()
y_grads = []
y_grads_init = []
for yi in y:
dyi_name = _append_grad_suffix_(yi.name)
np_type = dtype_to_np_dtype(yi.dtype)
dy = program.global_block().create_var(
name=dyi_name, shape=yi.shape, dtype=np_type, persistable=True)
dy.stop_gradient = False
v = np.random.random(size=yi.shape).astype(np_type)
set_var_in_scope(scope, place, dyi_name, v)
y_grads.append(dy)
y_grads_init.append(v)
else:
y_grads = _as_list(y_grads)
y_grads_init = [
var_to_np_array_in_scope(scope, place, v.name) for v in y_grads
]
# append first order grads
target_grads = calc_gradient(y, x, y_grads)
# y_grads are the input of first-order backward,
# so, they are also the input of second-order backward.
x += y_grads
x_init = _as_list(x_init)
x_init += y_grads_init
grad_check(x, target_grads, x_init, place, program, eps, atol, rtol)