PaddlePaddle · abhinavarora · Oct 9, 2017 · Sep 30, 2017 · Oct 2, 2017 · Oct 3, 2017
diff --git a/paddle/operators/adamax_op.cc b/paddle/operators/adamax_op.cc
@@ -0,0 +1,107 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/adamax_op.h"
+
+namespace paddle {
+namespace operators {
+
+// Operator class for "adamax". It only provides shape inference; the
+// numerical update itself lives in the kernel class registered for this op.
+class AdamaxOp : public framework::OperatorWithKernel {
+ public:
+  // Inherit the constructors of the base operator class.
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  // Checks that all four inputs and all three outputs are present, that
+  // grad, moment and inf_norm have the same dimensions as param, and then
+  // assigns the dimensions of param to every output.
+  void InferShape(framework::InferShapeContextBase *ctx) const override {
+    // Presence checks for the input slots.
+    PADDLE_ENFORCE(ctx->HasInput("param"),
+                   "Input(param) of AdamaxOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("grad"),
+                   "Input(grad) of AdamaxOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("moment"),
+                   "Input(moment) of AdamaxOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("inf_norm"),
+                   "Input(inf_norm) of AdamaxOp should not be null.");
+
+    // Presence checks for the output slots.
+    PADDLE_ENFORCE(ctx->HasOutput("param_out"),
+                   "Output(param_out) of AdamaxOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("moment_out"),
+                   "Output(moment_out) of AdamaxOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("inf_norm_out"),
+                   "Output(inf_norm_out) of AdamaxOp should not be null.");
+
+    // The dimensions of param are the reference every other input must match.
+    auto param_dim = ctx->GetInputDim("param");
+    PADDLE_ENFORCE_EQ(
+        param_dim, ctx->GetInputDim("grad"),
+        "param and grad input of AdamaxOp should have same dimension");
+    PADDLE_ENFORCE_EQ(
+        param_dim, ctx->GetInputDim("moment"),
+        "param and moment input of AdamaxOp should have same dimension");
+    PADDLE_ENFORCE_EQ(
+        param_dim, ctx->GetInputDim("inf_norm"),
+        "param and inf_norm input of AdamaxOp should have same dimension");
+
+    // Every output takes the dimensions of param.
+    ctx->SetOutputDim("param_out", param_dim);
+    ctx->SetOutputDim("moment_out", param_dim);
+    ctx->SetOutputDim("inf_norm_out", param_dim);
+  }
+};
+
+// Declares the interface of the "adamax" op: four inputs, three outputs,
+// five attributes, and the user-facing documentation string.
+class AdamaxOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  // Forwards proto and op_checker to the base class, then registers the
+  // op's inputs, outputs, attributes and comment.
+  AdamaxOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    // Inputs: current parameter, its gradient, and the two running states.
+    AddInput("param", "Input parameter");
+    AddInput("grad", "Input gradient");
+    AddInput("moment", "First moment");
+    AddInput("inf_norm", "Input exponentially weighted infinity norm");
+
+    // Outputs: updated parameter and updated running states.
+    AddOutput("param_out", "Output parameter");
+    AddOutput("moment_out", "Output first moment");
+    AddOutput("inf_norm_out", "Output exponentially weighted infinity norm");
+
+    // Hyperparameters. None of them is given a default value here.
+    AddAttr<int>("time_step", "Time step");
+    AddAttr<float>("learning_rate", "Learning rate");
+    AddAttr<float>("beta_1",
+                   "exponential decay rate for the 1st moment estimates.");
+    AddAttr<float>(
+        "beta_2",
+        "exponential decay rate for the weighted infinity norm estimates.");
+    AddAttr<float>("epsilon", "Constant for numerical stability");
+    // Operator documentation, including the update equations.
+    AddComment(R"DOC(
+Adamax Updates Operator.
+
+This implements the Adamax optimizer from Section 7 of the Adam
+paper(https://arxiv.org/abs/1412.6980). Adamax is a variant of the
+Adam algorithm based on the infinity norm.
+
+Adamax updates:
+
+moment_out = beta_1 * moment + (1 - beta_1) * grad
+inf_norm_out = max(beta_2 * inf_norm + epsilon, abs(grad))
+param_out = param - (learning_rate/(1 - beta_1^t)) * moment_out/inf_norm_out
+
+The original paper(https://arxiv.org/abs/1412.6980) does not have an
+epsilon attribute. However, it is added here for numerical stability
+by preventing divide by 0.
+
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+// Short alias so the registration macros below stay readable.
+namespace ops = paddle::operators;
+// Register the "adamax" op with its operator class and maker. As the macro
+// name indicates, no gradient operator is registered for it.
+REGISTER_OP_WITHOUT_GRADIENT(adamax, ops::AdamaxOp, ops::AdamaxOpMaker);
+// Register the CPU kernel for "adamax", instantiated for float.
+REGISTER_OP_CPU_KERNEL(adamax,
+                       ops::AdamaxOpKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/adamax_op.cu b/paddle/operators/adamax_op.cu
@@ -0,0 +1,20 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/adamax_op.h"
+
+// Short alias so the registration macro below stays readable.
+namespace ops = paddle::operators;
+// Register the GPU kernel for "adamax", instantiated for float. It reuses the
+// same AdamaxOpKernel template as the CPU registration, with GPUPlace.
+REGISTER_OP_GPU_KERNEL(adamax,
+                       ops::AdamaxOpKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/adamax_op.h b/paddle/operators/adamax_op.h
@@ -0,0 +1,64 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <cmath>
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+// Local shorthand for the framework tensor type.
+using Tensor = framework::Tensor;
+
+// Local shorthand for framework::EigenVector with the same template
+// parameters; it defaults to row-major layout and Eigen::DenseIndex indices.
+// The kernel below uses its Flatten() helper to view tensors as vectors.
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+
+// Kernel that performs one Adamax update step.
+//
+// Inputs read from the context:  param, grad, moment, inf_norm.
+// Attributes read:               learning_rate, beta_1, beta_2, epsilon,
+//                                time_step.
+// Outputs written:               param_out, moment_out, inf_norm_out.
+//
+// Update rule, applied element-wise on the flattened tensors:
+//   moment_out   = beta_1 * moment + (1 - beta_1) * grad
+//   inf_norm_out = max(|grad|, beta_2 * inf_norm + epsilon)
+//   param_out    = param
+//                  - learning_rate / (1 - beta_1^time_step)
+//                    * moment_out / inf_norm_out
+//
+// Template parameters:
+//   Place - place type used to look up the Eigen device of the context.
+//   T     - element type of the tensors.
+template <typename Place, typename T>
+class AdamaxOpKernel : public framework::OpKernel<T> {
+ public:
+  // Runs the update described above. Fails the enforce check when time_step
+  // is not positive, because the bias-correction denominator
+  // 1 - beta_1^time_step is zero for time_step == 0 (yielding inf/NaN
+  // parameters) and has the wrong sign for negative values.
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    const float lr = ctx.Attr<float>("learning_rate");
+    const float beta_1 = ctx.Attr<float>("beta_1");
+    const float beta_2 = ctx.Attr<float>("beta_2");
+    const float epsilon = ctx.Attr<float>("epsilon");
+    const int t = ctx.Attr<int>("time_step");
+
+    // Validate before allocating any output memory.
+    PADDLE_ENFORCE(t > 0, "time_step of AdamaxOp should be greater than 0.");
+
+    auto param_out = ctx.Output<Tensor>("param_out");
+    auto moment_out = ctx.Output<Tensor>("moment_out");
+    auto norm_out = ctx.Output<Tensor>("inf_norm_out");
+
+    param_out->mutable_data<T>(ctx.GetPlace());
+    moment_out->mutable_data<T>(ctx.GetPlace());
+    norm_out->mutable_data<T>(ctx.GetPlace());
+
+    // Convert the hyperparameters to the tensor element type once, so the
+    // Eigen expressions below never mix scalar types.
+    const T moment_decay = static_cast<T>(beta_1);
+    const T grad_weight = static_cast<T>(1 - beta_1);
+    const T norm_decay = static_cast<T>(beta_2);
+    const T eps = static_cast<T>(epsilon);
+    // Learning rate with the first-moment bias correction folded in.
+    const T lr_t = static_cast<T>(lr / (1 - std::pow(beta_1, t)));
+
+    auto p = EigenVector<T>::Flatten(*ctx.Input<Tensor>("param"));
+    auto g = EigenVector<T>::Flatten(*ctx.Input<Tensor>("grad"));
+    auto m = EigenVector<T>::Flatten(*ctx.Input<Tensor>("moment"));
+    auto u = EigenVector<T>::Flatten(*ctx.Input<Tensor>("inf_norm"));
+    auto p_out = EigenVector<T>::Flatten(*param_out);
+    auto m_out = EigenVector<T>::Flatten(*moment_out);
+    auto u_out = EigenVector<T>::Flatten(*norm_out);
+    auto place = ctx.GetEigenDevice<Place>();
+
+    // The two state updates come first: the parameter update reads their
+    // freshly written results.
+    m_out.device(place) = moment_decay * m + grad_weight * g;
+    u_out.device(place) = g.abs().cwiseMax((norm_decay * u) + eps);
+
+    p_out.device(place) = p - lr_t * (m_out / u_out);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/python/paddle/v2/framework/tests/test_adamax_op.py b/python/paddle/v2/framework/tests/test_adamax_op.py
@@ -0,0 +1,52 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestAdamaxOp(OpTest):
+    def setUp(self):
+        self.op_type = "adamax"
+        param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
+        grad = np.random.uniform(-1, 1, (102, 105)).astype("float32")
+        moment = np.random.uniform(-1, 1, (102, 105)).astype("float32")
+        # The infinity norm is positive
+        inf_norm = np.random.random((102, 105)).astype("float32")
+
+        time_step = 9
+        learning_rate = 0.002
+        beta_1 = 0.9
+        beta_2 = 0.999
+        epsilon = 1e-8
+
+        self.inputs = {
+            'param': param,
+            'grad': grad,
+            'moment': moment,
+            'inf_norm': inf_norm
+        }
+
+        self.attrs = {
+            'time_step': time_step,
+            'learning_rate': learning_rate,
+            'beta_1': beta_1,
+            'beta_2': beta_2,
+            'epsilon': epsilon
+        }
+
+        moment_out = beta_1 * moment + (1 - beta_1) * grad
+        inf_norm_out = np.maximum(beta_2 * inf_norm + epsilon, np.abs(grad))
+        lr_t = (learning_rate / (1 - beta_1**time_step))
+        param_out = param - lr_t * np.divide(moment_out, inf_norm_out)
+
+        self.outputs = {
+            'param_out': param_out,
+            'moment_out': moment_out,
+            'inf_norm_out': inf_norm_out
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+# Run the tests in this module when the file is executed as a script.
+if __name__ == "__main__":
+    unittest.main()