From 0d61bf91038f3d8dcfec5917d4c425364844cb7d Mon Sep 17 00:00:00 2001 From: Penghao Cen Date: Mon, 12 Mar 2018 21:06:47 +0800 Subject: [PATCH 01/24] Update API to v1alpha2 (#457) per this design proposal kubeflow/community#30. Update API to v1alpha2 --- hack/update-codegen.sh | 2 +- pkg/apis/tensorflow/v1alpha2/doc.go | 20 ++ pkg/apis/tensorflow/v1alpha2/register.go | 63 ++++ pkg/apis/tensorflow/v1alpha2/types.go | 210 +++++++++++++ .../v1alpha2/zz_generated.deepcopy.go | 282 ++++++++++++++++++ pkg/client/clientset/versioned/clientset.go | 24 +- .../versioned/fake/clientset_generated.go | 16 +- .../clientset/versioned/fake/register.go | 4 +- .../clientset/versioned/scheme/register.go | 4 +- .../versioned/typed/kubeflow/v1alpha2/doc.go | 20 ++ .../typed/kubeflow/v1alpha2/fake/doc.go | 20 ++ .../v1alpha2/fake/fake_kubeflow_client.go | 38 +++ .../kubeflow/v1alpha2/fake/fake_tfjob.go | 126 ++++++++ .../kubeflow/v1alpha2/generated_expansion.go | 19 ++ .../kubeflow/v1alpha2/kubeflow_client.go | 88 ++++++ .../typed/kubeflow/v1alpha2/tfjob.go | 155 ++++++++++ .../typed/tensorflow/v1alpha2/doc.go | 20 ++ .../typed/tensorflow/v1alpha2/fake/doc.go | 20 ++ .../v1alpha2/fake/fake_tensorflow_client.go | 38 +++ .../tensorflow/v1alpha2/fake/fake_tfjob.go | 126 ++++++++ .../v1alpha2/generated_expansion.go | 19 ++ .../tensorflow/v1alpha2/tensorflow_client.go | 88 ++++++ .../typed/tensorflow/v1alpha2/tfjob.go | 155 ++++++++++ .../informers/externalversions/generic.go | 8 +- .../externalversions/kubeflow/interface.go | 12 +- .../kubeflow/v1alpha2/interface.go | 43 +++ .../kubeflow/v1alpha2/tfjob.go | 73 +++++ .../externalversions/tensorflow/interface.go | 44 +++ .../tensorflow/v1alpha2/interface.go | 43 +++ .../tensorflow/v1alpha2/tfjob.go | 73 +++++ .../kubeflow/v1alpha2/expansion_generated.go | 27 ++ pkg/client/listers/kubeflow/v1alpha2/tfjob.go | 94 ++++++ .../v1alpha2/expansion_generated.go | 27 ++ .../listers/tensorflow/v1alpha2/tfjob.go | 94 ++++++ 34 files changed, 2060 
insertions(+), 35 deletions(-) create mode 100644 pkg/apis/tensorflow/v1alpha2/doc.go create mode 100644 pkg/apis/tensorflow/v1alpha2/register.go create mode 100644 pkg/apis/tensorflow/v1alpha2/types.go create mode 100644 pkg/apis/tensorflow/v1alpha2/zz_generated.deepcopy.go create mode 100644 pkg/client/clientset/versioned/typed/kubeflow/v1alpha2/doc.go create mode 100644 pkg/client/clientset/versioned/typed/kubeflow/v1alpha2/fake/doc.go create mode 100644 pkg/client/clientset/versioned/typed/kubeflow/v1alpha2/fake/fake_kubeflow_client.go create mode 100644 pkg/client/clientset/versioned/typed/kubeflow/v1alpha2/fake/fake_tfjob.go create mode 100644 pkg/client/clientset/versioned/typed/kubeflow/v1alpha2/generated_expansion.go create mode 100644 pkg/client/clientset/versioned/typed/kubeflow/v1alpha2/kubeflow_client.go create mode 100644 pkg/client/clientset/versioned/typed/kubeflow/v1alpha2/tfjob.go create mode 100644 pkg/client/clientset/versioned/typed/tensorflow/v1alpha2/doc.go create mode 100644 pkg/client/clientset/versioned/typed/tensorflow/v1alpha2/fake/doc.go create mode 100644 pkg/client/clientset/versioned/typed/tensorflow/v1alpha2/fake/fake_tensorflow_client.go create mode 100644 pkg/client/clientset/versioned/typed/tensorflow/v1alpha2/fake/fake_tfjob.go create mode 100644 pkg/client/clientset/versioned/typed/tensorflow/v1alpha2/generated_expansion.go create mode 100644 pkg/client/clientset/versioned/typed/tensorflow/v1alpha2/tensorflow_client.go create mode 100644 pkg/client/clientset/versioned/typed/tensorflow/v1alpha2/tfjob.go create mode 100644 pkg/client/informers/externalversions/kubeflow/v1alpha2/interface.go create mode 100644 pkg/client/informers/externalversions/kubeflow/v1alpha2/tfjob.go create mode 100644 pkg/client/informers/externalversions/tensorflow/interface.go create mode 100644 pkg/client/informers/externalversions/tensorflow/v1alpha2/interface.go create mode 100644 pkg/client/informers/externalversions/tensorflow/v1alpha2/tfjob.go 
create mode 100644 pkg/client/listers/kubeflow/v1alpha2/expansion_generated.go create mode 100644 pkg/client/listers/kubeflow/v1alpha2/tfjob.go create mode 100644 pkg/client/listers/tensorflow/v1alpha2/expansion_generated.go create mode 100644 pkg/client/listers/tensorflow/v1alpha2/tfjob.go diff --git a/hack/update-codegen.sh b/hack/update-codegen.sh index fd357ca6e3..d73dd7e5fd 100755 --- a/hack/update-codegen.sh +++ b/hack/update-codegen.sh @@ -30,5 +30,5 @@ CODEGEN_PKG=${CODEGEN_PKG:-$(cd ${SCRIPT_ROOT}; ls -d -1 ./vendor/k8s.io/code-ge # instead of the $GOPATH directly. For normal projects this can be dropped. ${CODEGEN_PKG}/generate-groups.sh "defaulter,deepcopy,client,informer,lister" \ github.com/kubeflow/tf-operator/pkg/client github.com/kubeflow/tf-operator/pkg/apis \ - tensorflow:v1alpha1 \ + tensorflow:v1alpha2 \ --go-header-file ${SCRIPT_ROOT}/hack/boilerplate/boilerplate.go.txt diff --git a/pkg/apis/tensorflow/v1alpha2/doc.go b/pkg/apis/tensorflow/v1alpha2/doc.go new file mode 100644 index 0000000000..b778f31168 --- /dev/null +++ b/pkg/apis/tensorflow/v1alpha2/doc.go @@ -0,0 +1,20 @@ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +k8s:deepcopy-gen=package,register +// +k8s:defaulter-gen=TypeMeta + +// Package v1alpha2 is the v1alpha2 version of the API. 
+// +groupName=kubeflow.org +package v1alpha2 diff --git a/pkg/apis/tensorflow/v1alpha2/register.go b/pkg/apis/tensorflow/v1alpha2/register.go new file mode 100644 index 0000000000..694b971720 --- /dev/null +++ b/pkg/apis/tensorflow/v1alpha2/register.go @@ -0,0 +1,63 @@ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package v1alpha2 + +import ( + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" +) + +var ( + // TODO: move SchemeBuilder with zz_generated.deepcopy.go to k8s.io/api. + // localSchemeBuilder and AddToScheme will stay in k8s.io/kubernetes. + SchemeBuilder runtime.SchemeBuilder + localSchemeBuilder = &SchemeBuilder + AddToScheme = localSchemeBuilder.AddToScheme +) + +const ( + // GroupName is the group name used in this package. + GroupName = "kubeflow.org" + // TFJobResourceKind is the kind name. + TFJobResourceKind = "TFJob" + // GroupVersion is the version. + GroupVersion = "v1alpha2" +) + +// SchemeGroupVersion is the group version used to register these objects. +var SchemeGroupVersion = schema.GroupVersion{Group: GroupName, Version: GroupVersion} + +func init() { + // We only register manually written functions here. The registration of the + // generated functions takes place in the generated files. The separation + // makes the code compile even when the generated files are missing.
+ localSchemeBuilder.Register(addKnownTypes) +} + +// Resource takes an unqualified resource and returns a Group-qualified GroupResource. +func Resource(resource string) schema.GroupResource { + return SchemeGroupVersion.WithResource(resource).GroupResource() +} + +// addKnownTypes adds the set of types defined in this package to the supplied scheme. +func addKnownTypes(scheme *runtime.Scheme) error { + scheme.AddKnownTypes(SchemeGroupVersion, + &TFJob{}, + &TFJobList{}, + ) + metav1.AddToGroupVersion(scheme, SchemeGroupVersion) + return nil +} diff --git a/pkg/apis/tensorflow/v1alpha2/types.go b/pkg/apis/tensorflow/v1alpha2/types.go new file mode 100644 index 0000000000..b45267c8b8 --- /dev/null +++ b/pkg/apis/tensorflow/v1alpha2/types.go @@ -0,0 +1,210 @@ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package v1alpha2 + +import ( + "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// +genclient +// +genclient:noStatus +// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object +// +resource:path=tfjob + +// TFJob represents the configuration of a single TFJob +type TFJob struct { + metav1.TypeMeta `json:",inline"` + + // Standard object's metadata. + metav1.ObjectMeta `json:"metadata,omitempty"` + + // Specification of the desired behavior of the TFJob. + Spec TFJobSpec `json:"spec,omitempty"` + + // Most recently observed status of the TFJob.
+ // This data may not be up to date. + // Populated by the system. + // Read-only. + Status TFJobStatus `json:"status,omitempty"` +} + +// TFJobSpec is a desired state description of the TFJob. +type TFJobSpec struct { + // TFReplicaSpecs is map of TFReplicaType and TFReplicaSpec + // specifies the TF replicas to run. + // For example, + // { + // "PS": TFReplicaSpec, + // "Worker": TFReplicaSpec, + // } + TFReplicaSpecs map[TFReplicaType]*TFReplicaSpec `json:"tfReplicaSpecs"` + + // Restart policy for all TFReplicas within the TFJob. + // One of Always, OnFailure, Never and ExitCode. + // Default to Always. + RestartPolicy RestartPolicy `json:"restartPolicy,omitempty"` +} + +// RestartPolicy describes how the TFReplicas should be restarted. +// Only one of the following restart policies may be specified. +// If none of the following policies is specified, the default one +// is RestartPolicyAlways. +type RestartPolicy string + +const ( + RestartPolicyAlways RestartPolicy = "Always" + RestartPolicyOnFailure RestartPolicy = "OnFailure" + RestartPolicyNever RestartPolicy = "Never" + + // `ExitCode` policy means that user should add exit code by themselves, + // `tf-operator` will check these exit codes to + // determine the behavior when an error occurs: + // - 1-127: permanent error, do not restart. + // - 128-255: retryable error, will restart the pod. + RestartPolicyExitCode RestartPolicy = "ExitCode" +) + +// TFReplicaSpec is a description of the TFReplica +type TFReplicaSpec struct { + // Replicas is the desired number of replicas of the given template. + // If unspecified, defaults to 1. + Replicas *int32 `json:"replicas,omitempty"` + + // Template is the object that describes the pod that + // will be created for this TFReplica. + // We use RestartPolicy in PodTemplateSpec + // to describe how the containers within the pod should be restarted. + // Please set this restart policy carefully according to your code. 
+ Template v1.PodTemplateSpec `json:"template,omitempty"` +} + +// TFReplicaType is the type for TFReplica. +type TFReplicaType string + +const ( + // TFReplicaTypePS is the type for parameter servers of distributed TensorFlow. + TFReplicaTypePS TFReplicaType = "PS" + + // TFReplicaTypeWorker is the type for workers of distributed TensorFlow. + // This is also used for non-distributed TensorFlow. + TFReplicaTypeWorker TFReplicaType = "Worker" + + // TFReplicaTypeChief is the type for chief worker of distributed TensorFlow. + // If there is "chief" replica type, it's the "chief worker". + // Else, worker:0 is the chief worker. + TFReplicaTypeChief TFReplicaType = "Chief" + + // TFReplicaTypeEval is the type for evaluation replica in TensorFlow. + TFReplicaTypeEval TFReplicaType = "Eval" +) + +// TFJobStatus represents the current observed state of the TFJob. +type TFJobStatus struct { + // Conditions is an array of currently observed TFJob conditions. + Conditions []TFJobCondition `json:"conditions"` + + // TFReplicaStatuses is map of TFReplicaType and TFReplicaStatus, + // specifies the status of each TFReplica. + TFReplicaStatuses map[TFReplicaType]*TFReplicaStatus `json:"tfReplicaStatuses"` + + // Represents time when the TFJob was acknowledged by the TFJob controller. + // It is not guaranteed to be set in happens-before order across separate operations. + // It is represented in RFC3339 form and is in UTC. + StartTime *metav1.Time `json:"startTime,omitempty"` + + // Represents time when the TFJob was completed. It is not guaranteed to + // be set in happens-before order across separate operations. + // It is represented in RFC3339 form and is in UTC. + CompletionTime *metav1.Time `json:"completionTime,omitempty"` + + // Represents last time when the TFJob was reconciled. It is not guaranteed to + // be set in happens-before order across separate operations. + // It is represented in RFC3339 form and is in UTC.
+ LastReconcileTime *metav1.Time `json:"lastReconcileTime,omitempty"` +} + +// TFReplicaStatus represents the current observed state of the TFReplica. +type TFReplicaStatus struct { + // The number of actively running pods. + Active int32 `json:"active,omitempty"` + + // The number of pods which reached phase Succeeded. + Succeeded int32 `json:"succeeded,omitempty"` + + // The number of pods which reached phase Failed. + Failed int32 `json:"failed,omitempty"` +} + +// TFJobCondition describes the state of the TFJob at a certain point. +type TFJobCondition struct { + // Type of TFJob condition. + Type TFJobConditionType `json:"type"` + // Status of the condition, one of True, False, Unknown. + Status v1.ConditionStatus `json:"status"` + // The reason for the condition's last transition. + Reason string `json:"reason,omitempty"` + // A human readable message indicating details about the transition. + Message string `json:"message,omitempty"` + // The last time this condition was updated. + LastUpdateTime metav1.Time `json:"lastUpdateTime,omitempty"` + // Last time the condition transitioned from one status to another. + LastTransitionTime metav1.Time `json:"lastTransitionTime,omitempty"` +} + +// TFJobConditionType defines all kinds of types of TFJobStatus. +type TFJobConditionType string + +const ( + // TFJobCreated means all sub-resources (e.g. services/pods) of this TFJob + // have been successfully created. + // But they are waiting to be scheduled and launched. + TFJobCreated TFJobConditionType = "Created" + + // TFJobRunning means all sub-resources (e.g. services/pods) of this TFJob + // have been successfully scheduled and launched. + // The training is running without error. + TFJobRunning TFJobConditionType = "Running" + + // TFJobRestarting means one or more sub-resources (e.g. services/pods) of this TFJob + // reached phase failed but may be restarted according to its restart policy + // which is specified by the user in v1.PodTemplateSpec.
+ // The training is freezing/pending. + TFJobRestarting TFJobConditionType = "Restarting" + + // TFJobSucceeded means all sub-resources (e.g. services/pods) of this TFJob + // have terminated successfully. + // The training is complete without error. + TFJobSucceeded TFJobConditionType = "Succeeded" + + // TFJobFailed means one or more sub-resources (e.g. services/pods) of this TFJob + // reached phase failed with no restarting. + // The training has failed its execution. + TFJobFailed TFJobConditionType = "Failed" +) + +// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object +// +resource:path=tfjobs + +// TFJobList is a list of TFJobs. +type TFJobList struct { + metav1.TypeMeta `json:",inline"` + + // Standard list metadata. + metav1.ListMeta `json:"metadata,omitempty"` + + // List of TFJobs. + Items []TFJob `json:"items"` +} diff --git a/pkg/apis/tensorflow/v1alpha2/zz_generated.deepcopy.go b/pkg/apis/tensorflow/v1alpha2/zz_generated.deepcopy.go new file mode 100644 index 0000000000..cae276f299 --- /dev/null +++ b/pkg/apis/tensorflow/v1alpha2/zz_generated.deepcopy.go @@ -0,0 +1,282 @@ +// +build !ignore_autogenerated + +/* +Copyright 2018 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// This file was autogenerated by deepcopy-gen. Do not edit it manually!
+ +package v1alpha2 + +import ( + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + conversion "k8s.io/apimachinery/pkg/conversion" + runtime "k8s.io/apimachinery/pkg/runtime" + reflect "reflect" +) + +func init() { + SchemeBuilder.Register(RegisterDeepCopies) +} + +// RegisterDeepCopies adds deep-copy functions to the given scheme. Public +// to allow building arbitrary schemes. +// +// Deprecated: deepcopy registration will go away when static deepcopy is fully implemented. +func RegisterDeepCopies(scheme *runtime.Scheme) error { + return scheme.AddGeneratedDeepCopyFuncs( + conversion.GeneratedDeepCopyFunc{Fn: func(in interface{}, out interface{}, c *conversion.Cloner) error { + in.(*TFJob).DeepCopyInto(out.(*TFJob)) + return nil + }, InType: reflect.TypeOf(&TFJob{})}, + conversion.GeneratedDeepCopyFunc{Fn: func(in interface{}, out interface{}, c *conversion.Cloner) error { + in.(*TFJobCondition).DeepCopyInto(out.(*TFJobCondition)) + return nil + }, InType: reflect.TypeOf(&TFJobCondition{})}, + conversion.GeneratedDeepCopyFunc{Fn: func(in interface{}, out interface{}, c *conversion.Cloner) error { + in.(*TFJobList).DeepCopyInto(out.(*TFJobList)) + return nil + }, InType: reflect.TypeOf(&TFJobList{})}, + conversion.GeneratedDeepCopyFunc{Fn: func(in interface{}, out interface{}, c *conversion.Cloner) error { + in.(*TFJobSpec).DeepCopyInto(out.(*TFJobSpec)) + return nil + }, InType: reflect.TypeOf(&TFJobSpec{})}, + conversion.GeneratedDeepCopyFunc{Fn: func(in interface{}, out interface{}, c *conversion.Cloner) error { + in.(*TFJobStatus).DeepCopyInto(out.(*TFJobStatus)) + return nil + }, InType: reflect.TypeOf(&TFJobStatus{})}, + conversion.GeneratedDeepCopyFunc{Fn: func(in interface{}, out interface{}, c *conversion.Cloner) error { + in.(*TFReplicaSpec).DeepCopyInto(out.(*TFReplicaSpec)) + return nil + }, InType: reflect.TypeOf(&TFReplicaSpec{})}, + conversion.GeneratedDeepCopyFunc{Fn: func(in interface{}, out interface{}, c *conversion.Cloner) error { + 
in.(*TFReplicaStatus).DeepCopyInto(out.(*TFReplicaStatus)) + return nil + }, InType: reflect.TypeOf(&TFReplicaStatus{})}, + ) +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *TFJob) DeepCopyInto(out *TFJob) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + in.Spec.DeepCopyInto(&out.Spec) + in.Status.DeepCopyInto(&out.Status) + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TFJob. +func (in *TFJob) DeepCopy() *TFJob { + if in == nil { + return nil + } + out := new(TFJob) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *TFJob) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } else { + return nil + } +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *TFJobCondition) DeepCopyInto(out *TFJobCondition) { + *out = *in + in.LastUpdateTime.DeepCopyInto(&out.LastUpdateTime) + in.LastTransitionTime.DeepCopyInto(&out.LastTransitionTime) + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TFJobCondition. +func (in *TFJobCondition) DeepCopy() *TFJobCondition { + if in == nil { + return nil + } + out := new(TFJobCondition) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *TFJobList) DeepCopyInto(out *TFJobList) { + *out = *in + out.TypeMeta = in.TypeMeta + out.ListMeta = in.ListMeta + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]TFJob, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TFJobList. +func (in *TFJobList) DeepCopy() *TFJobList { + if in == nil { + return nil + } + out := new(TFJobList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *TFJobList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } else { + return nil + } +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *TFJobSpec) DeepCopyInto(out *TFJobSpec) { + *out = *in + if in.TFReplicaSpecs != nil { + in, out := &in.TFReplicaSpecs, &out.TFReplicaSpecs + *out = make(map[TFReplicaType]*TFReplicaSpec, len(*in)) + for key, val := range *in { + if val == nil { + (*out)[key] = nil + } else { + (*out)[key] = new(TFReplicaSpec) + val.DeepCopyInto((*out)[key]) + } + } + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TFJobSpec. +func (in *TFJobSpec) DeepCopy() *TFJobSpec { + if in == nil { + return nil + } + out := new(TFJobSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *TFJobStatus) DeepCopyInto(out *TFJobStatus) { + *out = *in + if in.Conditions != nil { + in, out := &in.Conditions, &out.Conditions + *out = make([]TFJobCondition, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + if in.TFReplicaStatuses != nil { + in, out := &in.TFReplicaStatuses, &out.TFReplicaStatuses + *out = make(map[TFReplicaType]*TFReplicaStatus, len(*in)) + for key, val := range *in { + if val == nil { + (*out)[key] = nil + } else { + (*out)[key] = new(TFReplicaStatus) + val.DeepCopyInto((*out)[key]) + } + } + } + if in.StartTime != nil { + in, out := &in.StartTime, &out.StartTime + if *in == nil { + *out = nil + } else { + *out = new(v1.Time) + (*in).DeepCopyInto(*out) + } + } + if in.CompletionTime != nil { + in, out := &in.CompletionTime, &out.CompletionTime + if *in == nil { + *out = nil + } else { + *out = new(v1.Time) + (*in).DeepCopyInto(*out) + } + } + if in.LastReconcileTime != nil { + in, out := &in.LastReconcileTime, &out.LastReconcileTime + if *in == nil { + *out = nil + } else { + *out = new(v1.Time) + (*in).DeepCopyInto(*out) + } + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TFJobStatus. +func (in *TFJobStatus) DeepCopy() *TFJobStatus { + if in == nil { + return nil + } + out := new(TFJobStatus) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *TFReplicaSpec) DeepCopyInto(out *TFReplicaSpec) { + *out = *in + if in.Replicas != nil { + in, out := &in.Replicas, &out.Replicas + if *in == nil { + *out = nil + } else { + *out = new(int32) + **out = **in + } + } + in.Template.DeepCopyInto(&out.Template) + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TFReplicaSpec. 
+func (in *TFReplicaSpec) DeepCopy() *TFReplicaSpec { + if in == nil { + return nil + } + out := new(TFReplicaSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *TFReplicaStatus) DeepCopyInto(out *TFReplicaStatus) { + *out = *in + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TFReplicaStatus. +func (in *TFReplicaStatus) DeepCopy() *TFReplicaStatus { + if in == nil { + return nil + } + out := new(TFReplicaStatus) + in.DeepCopyInto(out) + return out +} diff --git a/pkg/client/clientset/versioned/clientset.go b/pkg/client/clientset/versioned/clientset.go index 6cf131f520..897759d9c3 100644 --- a/pkg/client/clientset/versioned/clientset.go +++ b/pkg/client/clientset/versioned/clientset.go @@ -15,7 +15,7 @@ package versioned import ( glog "github.com/golang/glog" - kubeflowv1alpha1 "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1" + kubeflowv1alpha2 "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned/typed/kubeflow/v1alpha2" discovery "k8s.io/client-go/discovery" rest "k8s.io/client-go/rest" flowcontrol "k8s.io/client-go/util/flowcontrol" @@ -23,27 +23,27 @@ import ( type Interface interface { Discovery() discovery.DiscoveryInterface - KubeflowV1alpha1() kubeflowv1alpha1.KubeflowV1alpha1Interface + KubeflowV1alpha2() kubeflowv1alpha2.KubeflowV1alpha2Interface // Deprecated: please explicitly pick a version if possible. - Kubeflow() kubeflowv1alpha1.KubeflowV1alpha1Interface + Kubeflow() kubeflowv1alpha2.KubeflowV1alpha2Interface } // Clientset contains the clients for groups. Each group has exactly one // version included in a Clientset. 
type Clientset struct { *discovery.DiscoveryClient - kubeflowV1alpha1 *kubeflowv1alpha1.KubeflowV1alpha1Client + kubeflowV1alpha2 *kubeflowv1alpha2.KubeflowV1alpha2Client } -// KubeflowV1alpha1 retrieves the KubeflowV1alpha1Client -func (c *Clientset) KubeflowV1alpha1() kubeflowv1alpha1.KubeflowV1alpha1Interface { - return c.kubeflowV1alpha1 +// KubeflowV1alpha2 retrieves the KubeflowV1alpha2Client +func (c *Clientset) KubeflowV1alpha2() kubeflowv1alpha2.KubeflowV1alpha2Interface { + return c.kubeflowV1alpha2 } // Deprecated: Kubeflow retrieves the default version of KubeflowClient. // Please explicitly pick a version. -func (c *Clientset) Kubeflow() kubeflowv1alpha1.KubeflowV1alpha1Interface { - return c.kubeflowV1alpha1 +func (c *Clientset) Kubeflow() kubeflowv1alpha2.KubeflowV1alpha2Interface { + return c.kubeflowV1alpha2 } // Discovery retrieves the DiscoveryClient @@ -62,7 +62,7 @@ func NewForConfig(c *rest.Config) (*Clientset, error) { } var cs Clientset var err error - cs.kubeflowV1alpha1, err = kubeflowv1alpha1.NewForConfig(&configShallowCopy) + cs.kubeflowV1alpha2, err = kubeflowv1alpha2.NewForConfig(&configShallowCopy) if err != nil { return nil, err } @@ -79,7 +79,7 @@ func NewForConfig(c *rest.Config) (*Clientset, error) { // panics if there is an error in the config. func NewForConfigOrDie(c *rest.Config) *Clientset { var cs Clientset - cs.kubeflowV1alpha1 = kubeflowv1alpha1.NewForConfigOrDie(c) + cs.kubeflowV1alpha2 = kubeflowv1alpha2.NewForConfigOrDie(c) cs.DiscoveryClient = discovery.NewDiscoveryClientForConfigOrDie(c) return &cs @@ -88,7 +88,7 @@ func NewForConfigOrDie(c *rest.Config) *Clientset { // New creates a new Clientset for the given RESTClient. 
func New(c rest.Interface) *Clientset { var cs Clientset - cs.kubeflowV1alpha1 = kubeflowv1alpha1.New(c) + cs.kubeflowV1alpha2 = kubeflowv1alpha2.New(c) cs.DiscoveryClient = discovery.NewDiscoveryClient(c) return &cs diff --git a/pkg/client/clientset/versioned/fake/clientset_generated.go b/pkg/client/clientset/versioned/fake/clientset_generated.go index 3b227cbc69..ba187d2627 100644 --- a/pkg/client/clientset/versioned/fake/clientset_generated.go +++ b/pkg/client/clientset/versioned/fake/clientset_generated.go @@ -15,8 +15,8 @@ package fake import ( clientset "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned" - kubeflowv1alpha1 "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1" - fakekubeflowv1alpha1 "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/fake" + kubeflowv1alpha2 "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned/typed/kubeflow/v1alpha2" + fakekubeflowv1alpha2 "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned/typed/kubeflow/v1alpha2/fake" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/watch" "k8s.io/client-go/discovery" @@ -57,12 +57,12 @@ func (c *Clientset) Discovery() discovery.DiscoveryInterface { var _ clientset.Interface = &Clientset{} -// KubeflowV1alpha1 retrieves the KubeflowV1alpha1Client -func (c *Clientset) KubeflowV1alpha1() kubeflowv1alpha1.KubeflowV1alpha1Interface { - return &fakekubeflowv1alpha1.FakeKubeflowV1alpha1{Fake: &c.Fake} +// KubeflowV1alpha2 retrieves the KubeflowV1alpha2Client +func (c *Clientset) KubeflowV1alpha2() kubeflowv1alpha2.KubeflowV1alpha2Interface { + return &fakekubeflowv1alpha2.FakeKubeflowV1alpha2{Fake: &c.Fake} } -// Kubeflow retrieves the KubeflowV1alpha1Client -func (c *Clientset) Kubeflow() kubeflowv1alpha1.KubeflowV1alpha1Interface { - return &fakekubeflowv1alpha1.FakeKubeflowV1alpha1{Fake: &c.Fake} +// Kubeflow retrieves the KubeflowV1alpha2Client +func (c *Clientset) Kubeflow() 
kubeflowv1alpha2.KubeflowV1alpha2Interface { + return &fakekubeflowv1alpha2.FakeKubeflowV1alpha2{Fake: &c.Fake} } diff --git a/pkg/client/clientset/versioned/fake/register.go b/pkg/client/clientset/versioned/fake/register.go index 2a8f7aa4c6..cb575beb0d 100644 --- a/pkg/client/clientset/versioned/fake/register.go +++ b/pkg/client/clientset/versioned/fake/register.go @@ -14,7 +14,7 @@ package fake import ( - kubeflowv1alpha1 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha1" + kubeflowv1alpha2 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha2" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" runtime "k8s.io/apimachinery/pkg/runtime" schema "k8s.io/apimachinery/pkg/runtime/schema" @@ -45,6 +45,6 @@ func init() { // After this, RawExtensions in Kubernetes types will serialize kube-aggregator types // correctly. func AddToScheme(scheme *runtime.Scheme) { - kubeflowv1alpha1.AddToScheme(scheme) + kubeflowv1alpha2.AddToScheme(scheme) } diff --git a/pkg/client/clientset/versioned/scheme/register.go b/pkg/client/clientset/versioned/scheme/register.go index a3c04345b0..f72adde908 100644 --- a/pkg/client/clientset/versioned/scheme/register.go +++ b/pkg/client/clientset/versioned/scheme/register.go @@ -14,7 +14,7 @@ package scheme import ( - kubeflowv1alpha1 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha1" + kubeflowv1alpha2 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha2" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" runtime "k8s.io/apimachinery/pkg/runtime" schema "k8s.io/apimachinery/pkg/runtime/schema" @@ -45,6 +45,6 @@ func init() { // After this, RawExtensions in Kubernetes types will serialize kube-aggregator types // correctly. 
func AddToScheme(scheme *runtime.Scheme) { - kubeflowv1alpha1.AddToScheme(scheme) + kubeflowv1alpha2.AddToScheme(scheme) } diff --git a/pkg/client/clientset/versioned/typed/kubeflow/v1alpha2/doc.go b/pkg/client/clientset/versioned/typed/kubeflow/v1alpha2/doc.go new file mode 100644 index 0000000000..ef161aeae6 --- /dev/null +++ b/pkg/client/clientset/versioned/typed/kubeflow/v1alpha2/doc.go @@ -0,0 +1,20 @@ +/* +Copyright 2018 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// This package is generated by client-gen with custom arguments. + +// This package has the automatically generated typed clients. +package v1alpha2 diff --git a/pkg/client/clientset/versioned/typed/kubeflow/v1alpha2/fake/doc.go b/pkg/client/clientset/versioned/typed/kubeflow/v1alpha2/fake/doc.go new file mode 100644 index 0000000000..d4003d501b --- /dev/null +++ b/pkg/client/clientset/versioned/typed/kubeflow/v1alpha2/fake/doc.go @@ -0,0 +1,20 @@ +/* +Copyright 2018 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ + +// This package is generated by client-gen with custom arguments. + +// Package fake has the automatically generated clients. +package fake diff --git a/pkg/client/clientset/versioned/typed/kubeflow/v1alpha2/fake/fake_kubeflow_client.go b/pkg/client/clientset/versioned/typed/kubeflow/v1alpha2/fake/fake_kubeflow_client.go new file mode 100644 index 0000000000..7338a5d7da --- /dev/null +++ b/pkg/client/clientset/versioned/typed/kubeflow/v1alpha2/fake/fake_kubeflow_client.go @@ -0,0 +1,38 @@ +/* +Copyright 2018 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package fake + +import ( + v1alpha2 "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned/typed/kubeflow/v1alpha2" + rest "k8s.io/client-go/rest" + testing "k8s.io/client-go/testing" +) + +type FakeKubeflowV1alpha2 struct { + *testing.Fake +} + +func (c *FakeKubeflowV1alpha2) TFJobs(namespace string) v1alpha2.TFJobInterface { + return &FakeTFJobs{c, namespace} +} + +// RESTClient returns a RESTClient that is used to communicate +// with API server by this client implementation. 
+func (c *FakeKubeflowV1alpha2) RESTClient() rest.Interface { + var ret *rest.RESTClient + return ret +} diff --git a/pkg/client/clientset/versioned/typed/kubeflow/v1alpha2/fake/fake_tfjob.go b/pkg/client/clientset/versioned/typed/kubeflow/v1alpha2/fake/fake_tfjob.go new file mode 100644 index 0000000000..796e1dfff0 --- /dev/null +++ b/pkg/client/clientset/versioned/typed/kubeflow/v1alpha2/fake/fake_tfjob.go @@ -0,0 +1,126 @@ +/* +Copyright 2018 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package fake + +import ( + v1alpha2 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha2" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + labels "k8s.io/apimachinery/pkg/labels" + schema "k8s.io/apimachinery/pkg/runtime/schema" + types "k8s.io/apimachinery/pkg/types" + watch "k8s.io/apimachinery/pkg/watch" + testing "k8s.io/client-go/testing" +) + +// FakeTFJobs implements TFJobInterface +type FakeTFJobs struct { + Fake *FakeKubeflowV1alpha2 + ns string +} + +var tfjobsResource = schema.GroupVersionResource{Group: "kubeflow.org", Version: "v1alpha2", Resource: "tfjobs"} + +var tfjobsKind = schema.GroupVersionKind{Group: "kubeflow.org", Version: "v1alpha2", Kind: "TFJob"} + +// Get takes name of the tFJob, and returns the corresponding tFJob object, and an error if there is any. +func (c *FakeTFJobs) Get(name string, options v1.GetOptions) (result *v1alpha2.TFJob, err error) { + obj, err := c.Fake. 
+ Invokes(testing.NewGetAction(tfjobsResource, c.ns, name), &v1alpha2.TFJob{}) + + if obj == nil { + return nil, err + } + return obj.(*v1alpha2.TFJob), err +} + +// List takes label and field selectors, and returns the list of TFJobs that match those selectors. +func (c *FakeTFJobs) List(opts v1.ListOptions) (result *v1alpha2.TFJobList, err error) { + obj, err := c.Fake. + Invokes(testing.NewListAction(tfjobsResource, tfjobsKind, c.ns, opts), &v1alpha2.TFJobList{}) + + if obj == nil { + return nil, err + } + + label, _, _ := testing.ExtractFromListOptions(opts) + if label == nil { + label = labels.Everything() + } + list := &v1alpha2.TFJobList{} + for _, item := range obj.(*v1alpha2.TFJobList).Items { + if label.Matches(labels.Set(item.Labels)) { + list.Items = append(list.Items, item) + } + } + return list, err +} + +// Watch returns a watch.Interface that watches the requested tFJobs. +func (c *FakeTFJobs) Watch(opts v1.ListOptions) (watch.Interface, error) { + return c.Fake. + InvokesWatch(testing.NewWatchAction(tfjobsResource, c.ns, opts)) + +} + +// Create takes the representation of a tFJob and creates it. Returns the server's representation of the tFJob, and an error, if there is any. +func (c *FakeTFJobs) Create(tFJob *v1alpha2.TFJob) (result *v1alpha2.TFJob, err error) { + obj, err := c.Fake. + Invokes(testing.NewCreateAction(tfjobsResource, c.ns, tFJob), &v1alpha2.TFJob{}) + + if obj == nil { + return nil, err + } + return obj.(*v1alpha2.TFJob), err +} + +// Update takes the representation of a tFJob and updates it. Returns the server's representation of the tFJob, and an error, if there is any. +func (c *FakeTFJobs) Update(tFJob *v1alpha2.TFJob) (result *v1alpha2.TFJob, err error) { + obj, err := c.Fake. + Invokes(testing.NewUpdateAction(tfjobsResource, c.ns, tFJob), &v1alpha2.TFJob{}) + + if obj == nil { + return nil, err + } + return obj.(*v1alpha2.TFJob), err +} + +// Delete takes name of the tFJob and deletes it. Returns an error if one occurs. 
+func (c *FakeTFJobs) Delete(name string, options *v1.DeleteOptions) error { + _, err := c.Fake. + Invokes(testing.NewDeleteAction(tfjobsResource, c.ns, name), &v1alpha2.TFJob{}) + + return err +} + +// DeleteCollection deletes a collection of objects. +func (c *FakeTFJobs) DeleteCollection(options *v1.DeleteOptions, listOptions v1.ListOptions) error { + action := testing.NewDeleteCollectionAction(tfjobsResource, c.ns, listOptions) + + _, err := c.Fake.Invokes(action, &v1alpha2.TFJobList{}) + return err +} + +// Patch applies the patch and returns the patched tFJob. +func (c *FakeTFJobs) Patch(name string, pt types.PatchType, data []byte, subresources ...string) (result *v1alpha2.TFJob, err error) { + obj, err := c.Fake. + Invokes(testing.NewPatchSubresourceAction(tfjobsResource, c.ns, name, data, subresources...), &v1alpha2.TFJob{}) + + if obj == nil { + return nil, err + } + return obj.(*v1alpha2.TFJob), err +} diff --git a/pkg/client/clientset/versioned/typed/kubeflow/v1alpha2/generated_expansion.go b/pkg/client/clientset/versioned/typed/kubeflow/v1alpha2/generated_expansion.go new file mode 100644 index 0000000000..7e99eae6c8 --- /dev/null +++ b/pkg/client/clientset/versioned/typed/kubeflow/v1alpha2/generated_expansion.go @@ -0,0 +1,19 @@ +/* +Copyright 2018 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package v1alpha2 + +type TFJobExpansion interface{} diff --git a/pkg/client/clientset/versioned/typed/kubeflow/v1alpha2/kubeflow_client.go b/pkg/client/clientset/versioned/typed/kubeflow/v1alpha2/kubeflow_client.go new file mode 100644 index 0000000000..5d9041f2b3 --- /dev/null +++ b/pkg/client/clientset/versioned/typed/kubeflow/v1alpha2/kubeflow_client.go @@ -0,0 +1,88 @@ +/* +Copyright 2018 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package v1alpha2 + +import ( + v1alpha2 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha2" + "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned/scheme" + serializer "k8s.io/apimachinery/pkg/runtime/serializer" + rest "k8s.io/client-go/rest" +) + +type KubeflowV1alpha2Interface interface { + RESTClient() rest.Interface + TFJobsGetter +} + +// KubeflowV1alpha2Client is used to interact with features provided by the kubeflow.org group. +type KubeflowV1alpha2Client struct { + restClient rest.Interface +} + +func (c *KubeflowV1alpha2Client) TFJobs(namespace string) TFJobInterface { + return newTFJobs(c, namespace) +} + +// NewForConfig creates a new KubeflowV1alpha2Client for the given config. 
+func NewForConfig(c *rest.Config) (*KubeflowV1alpha2Client, error) { + config := *c + if err := setConfigDefaults(&config); err != nil { + return nil, err + } + client, err := rest.RESTClientFor(&config) + if err != nil { + return nil, err + } + return &KubeflowV1alpha2Client{client}, nil +} + +// NewForConfigOrDie creates a new KubeflowV1alpha2Client for the given config and +// panics if there is an error in the config. +func NewForConfigOrDie(c *rest.Config) *KubeflowV1alpha2Client { + client, err := NewForConfig(c) + if err != nil { + panic(err) + } + return client +} + +// New creates a new KubeflowV1alpha2Client for the given RESTClient. +func New(c rest.Interface) *KubeflowV1alpha2Client { + return &KubeflowV1alpha2Client{c} +} + +func setConfigDefaults(config *rest.Config) error { + gv := v1alpha2.SchemeGroupVersion + config.GroupVersion = &gv + config.APIPath = "/apis" + config.NegotiatedSerializer = serializer.DirectCodecFactory{CodecFactory: scheme.Codecs} + + if config.UserAgent == "" { + config.UserAgent = rest.DefaultKubernetesUserAgent() + } + + return nil +} + +// RESTClient returns a RESTClient that is used to communicate +// with API server by this client implementation. +func (c *KubeflowV1alpha2Client) RESTClient() rest.Interface { + if c == nil { + return nil + } + return c.restClient +} diff --git a/pkg/client/clientset/versioned/typed/kubeflow/v1alpha2/tfjob.go b/pkg/client/clientset/versioned/typed/kubeflow/v1alpha2/tfjob.go new file mode 100644 index 0000000000..5a07e9730d --- /dev/null +++ b/pkg/client/clientset/versioned/typed/kubeflow/v1alpha2/tfjob.go @@ -0,0 +1,155 @@ +/* +Copyright 2018 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package v1alpha2 + +import ( + v1alpha2 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha2" + scheme "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned/scheme" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + types "k8s.io/apimachinery/pkg/types" + watch "k8s.io/apimachinery/pkg/watch" + rest "k8s.io/client-go/rest" +) + +// TFJobsGetter has a method to return a TFJobInterface. +// A group's client should implement this interface. +type TFJobsGetter interface { + TFJobs(namespace string) TFJobInterface +} + +// TFJobInterface has methods to work with TFJob resources. +type TFJobInterface interface { + Create(*v1alpha2.TFJob) (*v1alpha2.TFJob, error) + Update(*v1alpha2.TFJob) (*v1alpha2.TFJob, error) + Delete(name string, options *v1.DeleteOptions) error + DeleteCollection(options *v1.DeleteOptions, listOptions v1.ListOptions) error + Get(name string, options v1.GetOptions) (*v1alpha2.TFJob, error) + List(opts v1.ListOptions) (*v1alpha2.TFJobList, error) + Watch(opts v1.ListOptions) (watch.Interface, error) + Patch(name string, pt types.PatchType, data []byte, subresources ...string) (result *v1alpha2.TFJob, err error) + TFJobExpansion +} + +// tFJobs implements TFJobInterface +type tFJobs struct { + client rest.Interface + ns string +} + +// newTFJobs returns a TFJobs +func newTFJobs(c *KubeflowV1alpha2Client, namespace string) *tFJobs { + return &tFJobs{ + client: c.RESTClient(), + ns: namespace, + } +} + +// Get takes name of the tFJob, and returns the corresponding tFJob object, and an error if there is any. 
+func (c *tFJobs) Get(name string, options v1.GetOptions) (result *v1alpha2.TFJob, err error) { + result = &v1alpha2.TFJob{} + err = c.client.Get(). + Namespace(c.ns). + Resource("tfjobs"). + Name(name). + VersionedParams(&options, scheme.ParameterCodec). + Do(). + Into(result) + return +} + +// List takes label and field selectors, and returns the list of TFJobs that match those selectors. +func (c *tFJobs) List(opts v1.ListOptions) (result *v1alpha2.TFJobList, err error) { + result = &v1alpha2.TFJobList{} + err = c.client.Get(). + Namespace(c.ns). + Resource("tfjobs"). + VersionedParams(&opts, scheme.ParameterCodec). + Do(). + Into(result) + return +} + +// Watch returns a watch.Interface that watches the requested tFJobs. +func (c *tFJobs) Watch(opts v1.ListOptions) (watch.Interface, error) { + opts.Watch = true + return c.client.Get(). + Namespace(c.ns). + Resource("tfjobs"). + VersionedParams(&opts, scheme.ParameterCodec). + Watch() +} + +// Create takes the representation of a tFJob and creates it. Returns the server's representation of the tFJob, and an error, if there is any. +func (c *tFJobs) Create(tFJob *v1alpha2.TFJob) (result *v1alpha2.TFJob, err error) { + result = &v1alpha2.TFJob{} + err = c.client.Post(). + Namespace(c.ns). + Resource("tfjobs"). + Body(tFJob). + Do(). + Into(result) + return +} + +// Update takes the representation of a tFJob and updates it. Returns the server's representation of the tFJob, and an error, if there is any. +func (c *tFJobs) Update(tFJob *v1alpha2.TFJob) (result *v1alpha2.TFJob, err error) { + result = &v1alpha2.TFJob{} + err = c.client.Put(). + Namespace(c.ns). + Resource("tfjobs"). + Name(tFJob.Name). + Body(tFJob). + Do(). + Into(result) + return +} + +// Delete takes name of the tFJob and deletes it. Returns an error if one occurs. +func (c *tFJobs) Delete(name string, options *v1.DeleteOptions) error { + return c.client.Delete(). + Namespace(c.ns). + Resource("tfjobs"). + Name(name). + Body(options). + Do(). 
+ Error() +} + +// DeleteCollection deletes a collection of objects. +func (c *tFJobs) DeleteCollection(options *v1.DeleteOptions, listOptions v1.ListOptions) error { + return c.client.Delete(). + Namespace(c.ns). + Resource("tfjobs"). + VersionedParams(&listOptions, scheme.ParameterCodec). + Body(options). + Do(). + Error() +} + +// Patch applies the patch and returns the patched tFJob. +func (c *tFJobs) Patch(name string, pt types.PatchType, data []byte, subresources ...string) (result *v1alpha2.TFJob, err error) { + result = &v1alpha2.TFJob{} + err = c.client.Patch(pt). + Namespace(c.ns). + Resource("tfjobs"). + SubResource(subresources...). + Name(name). + Body(data). + Do(). + Into(result) + return +} diff --git a/pkg/client/clientset/versioned/typed/tensorflow/v1alpha2/doc.go b/pkg/client/clientset/versioned/typed/tensorflow/v1alpha2/doc.go new file mode 100644 index 0000000000..ef161aeae6 --- /dev/null +++ b/pkg/client/clientset/versioned/typed/tensorflow/v1alpha2/doc.go @@ -0,0 +1,20 @@ +/* +Copyright 2018 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// This package is generated by client-gen with custom arguments. + +// This package has the automatically generated typed clients. 
+package v1alpha2 diff --git a/pkg/client/clientset/versioned/typed/tensorflow/v1alpha2/fake/doc.go b/pkg/client/clientset/versioned/typed/tensorflow/v1alpha2/fake/doc.go new file mode 100644 index 0000000000..d4003d501b --- /dev/null +++ b/pkg/client/clientset/versioned/typed/tensorflow/v1alpha2/fake/doc.go @@ -0,0 +1,20 @@ +/* +Copyright 2018 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// This package is generated by client-gen with custom arguments. + +// Package fake has the automatically generated clients. +package fake diff --git a/pkg/client/clientset/versioned/typed/tensorflow/v1alpha2/fake/fake_tensorflow_client.go b/pkg/client/clientset/versioned/typed/tensorflow/v1alpha2/fake/fake_tensorflow_client.go new file mode 100644 index 0000000000..b214b508d4 --- /dev/null +++ b/pkg/client/clientset/versioned/typed/tensorflow/v1alpha2/fake/fake_tensorflow_client.go @@ -0,0 +1,38 @@ +/* +Copyright 2018 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package fake + +import ( + v1alpha2 "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned/typed/tensorflow/v1alpha2" + rest "k8s.io/client-go/rest" + testing "k8s.io/client-go/testing" +) + +type FakeTensorflowV1alpha2 struct { + *testing.Fake +} + +func (c *FakeTensorflowV1alpha2) TFJobs(namespace string) v1alpha2.TFJobInterface { + return &FakeTFJobs{c, namespace} +} + +// RESTClient returns a RESTClient that is used to communicate +// with API server by this client implementation. +func (c *FakeTensorflowV1alpha2) RESTClient() rest.Interface { + var ret *rest.RESTClient + return ret +} diff --git a/pkg/client/clientset/versioned/typed/tensorflow/v1alpha2/fake/fake_tfjob.go b/pkg/client/clientset/versioned/typed/tensorflow/v1alpha2/fake/fake_tfjob.go new file mode 100644 index 0000000000..5b666a9d66 --- /dev/null +++ b/pkg/client/clientset/versioned/typed/tensorflow/v1alpha2/fake/fake_tfjob.go @@ -0,0 +1,126 @@ +/* +Copyright 2018 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package fake + +import ( + v1alpha2 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha2" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + labels "k8s.io/apimachinery/pkg/labels" + schema "k8s.io/apimachinery/pkg/runtime/schema" + types "k8s.io/apimachinery/pkg/types" + watch "k8s.io/apimachinery/pkg/watch" + testing "k8s.io/client-go/testing" +) + +// FakeTFJobs implements TFJobInterface +type FakeTFJobs struct { + Fake *FakeTensorflowV1alpha2 + ns string +} + +var tfjobsResource = schema.GroupVersionResource{Group: "tensorflow", Version: "v1alpha2", Resource: "tfjobs"} + +var tfjobsKind = schema.GroupVersionKind{Group: "tensorflow", Version: "v1alpha2", Kind: "TFJob"} + +// Get takes name of the tFJob, and returns the corresponding tFJob object, and an error if there is any. +func (c *FakeTFJobs) Get(name string, options v1.GetOptions) (result *v1alpha2.TFJob, err error) { + obj, err := c.Fake. + Invokes(testing.NewGetAction(tfjobsResource, c.ns, name), &v1alpha2.TFJob{}) + + if obj == nil { + return nil, err + } + return obj.(*v1alpha2.TFJob), err +} + +// List takes label and field selectors, and returns the list of TFJobs that match those selectors. +func (c *FakeTFJobs) List(opts v1.ListOptions) (result *v1alpha2.TFJobList, err error) { + obj, err := c.Fake. + Invokes(testing.NewListAction(tfjobsResource, tfjobsKind, c.ns, opts), &v1alpha2.TFJobList{}) + + if obj == nil { + return nil, err + } + + label, _, _ := testing.ExtractFromListOptions(opts) + if label == nil { + label = labels.Everything() + } + list := &v1alpha2.TFJobList{} + for _, item := range obj.(*v1alpha2.TFJobList).Items { + if label.Matches(labels.Set(item.Labels)) { + list.Items = append(list.Items, item) + } + } + return list, err +} + +// Watch returns a watch.Interface that watches the requested tFJobs. +func (c *FakeTFJobs) Watch(opts v1.ListOptions) (watch.Interface, error) { + return c.Fake. 
+ InvokesWatch(testing.NewWatchAction(tfjobsResource, c.ns, opts)) + +} + +// Create takes the representation of a tFJob and creates it. Returns the server's representation of the tFJob, and an error, if there is any. +func (c *FakeTFJobs) Create(tFJob *v1alpha2.TFJob) (result *v1alpha2.TFJob, err error) { + obj, err := c.Fake. + Invokes(testing.NewCreateAction(tfjobsResource, c.ns, tFJob), &v1alpha2.TFJob{}) + + if obj == nil { + return nil, err + } + return obj.(*v1alpha2.TFJob), err +} + +// Update takes the representation of a tFJob and updates it. Returns the server's representation of the tFJob, and an error, if there is any. +func (c *FakeTFJobs) Update(tFJob *v1alpha2.TFJob) (result *v1alpha2.TFJob, err error) { + obj, err := c.Fake. + Invokes(testing.NewUpdateAction(tfjobsResource, c.ns, tFJob), &v1alpha2.TFJob{}) + + if obj == nil { + return nil, err + } + return obj.(*v1alpha2.TFJob), err +} + +// Delete takes name of the tFJob and deletes it. Returns an error if one occurs. +func (c *FakeTFJobs) Delete(name string, options *v1.DeleteOptions) error { + _, err := c.Fake. + Invokes(testing.NewDeleteAction(tfjobsResource, c.ns, name), &v1alpha2.TFJob{}) + + return err +} + +// DeleteCollection deletes a collection of objects. +func (c *FakeTFJobs) DeleteCollection(options *v1.DeleteOptions, listOptions v1.ListOptions) error { + action := testing.NewDeleteCollectionAction(tfjobsResource, c.ns, listOptions) + + _, err := c.Fake.Invokes(action, &v1alpha2.TFJobList{}) + return err +} + +// Patch applies the patch and returns the patched tFJob. +func (c *FakeTFJobs) Patch(name string, pt types.PatchType, data []byte, subresources ...string) (result *v1alpha2.TFJob, err error) { + obj, err := c.Fake. 
+ Invokes(testing.NewPatchSubresourceAction(tfjobsResource, c.ns, name, data, subresources...), &v1alpha2.TFJob{}) + + if obj == nil { + return nil, err + } + return obj.(*v1alpha2.TFJob), err +} diff --git a/pkg/client/clientset/versioned/typed/tensorflow/v1alpha2/generated_expansion.go b/pkg/client/clientset/versioned/typed/tensorflow/v1alpha2/generated_expansion.go new file mode 100644 index 0000000000..7e99eae6c8 --- /dev/null +++ b/pkg/client/clientset/versioned/typed/tensorflow/v1alpha2/generated_expansion.go @@ -0,0 +1,19 @@ +/* +Copyright 2018 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package v1alpha2 + +type TFJobExpansion interface{} diff --git a/pkg/client/clientset/versioned/typed/tensorflow/v1alpha2/tensorflow_client.go b/pkg/client/clientset/versioned/typed/tensorflow/v1alpha2/tensorflow_client.go new file mode 100644 index 0000000000..0535a3b66e --- /dev/null +++ b/pkg/client/clientset/versioned/typed/tensorflow/v1alpha2/tensorflow_client.go @@ -0,0 +1,88 @@ +/* +Copyright 2018 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ + +package v1alpha2 + +import ( + v1alpha2 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha2" + "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned/scheme" + serializer "k8s.io/apimachinery/pkg/runtime/serializer" + rest "k8s.io/client-go/rest" +) + +type TensorflowV1alpha2Interface interface { + RESTClient() rest.Interface + TFJobsGetter +} + +// TensorflowV1alpha2Client is used to interact with features provided by the tensorflow group. +type TensorflowV1alpha2Client struct { + restClient rest.Interface +} + +func (c *TensorflowV1alpha2Client) TFJobs(namespace string) TFJobInterface { + return newTFJobs(c, namespace) +} + +// NewForConfig creates a new TensorflowV1alpha2Client for the given config. +func NewForConfig(c *rest.Config) (*TensorflowV1alpha2Client, error) { + config := *c + if err := setConfigDefaults(&config); err != nil { + return nil, err + } + client, err := rest.RESTClientFor(&config) + if err != nil { + return nil, err + } + return &TensorflowV1alpha2Client{client}, nil +} + +// NewForConfigOrDie creates a new TensorflowV1alpha2Client for the given config and +// panics if there is an error in the config. +func NewForConfigOrDie(c *rest.Config) *TensorflowV1alpha2Client { + client, err := NewForConfig(c) + if err != nil { + panic(err) + } + return client +} + +// New creates a new TensorflowV1alpha2Client for the given RESTClient. 
+func New(c rest.Interface) *TensorflowV1alpha2Client { + return &TensorflowV1alpha2Client{c} +} + +func setConfigDefaults(config *rest.Config) error { + gv := v1alpha2.SchemeGroupVersion + config.GroupVersion = &gv + config.APIPath = "/apis" + config.NegotiatedSerializer = serializer.DirectCodecFactory{CodecFactory: scheme.Codecs} + + if config.UserAgent == "" { + config.UserAgent = rest.DefaultKubernetesUserAgent() + } + + return nil +} + +// RESTClient returns a RESTClient that is used to communicate +// with API server by this client implementation. +func (c *TensorflowV1alpha2Client) RESTClient() rest.Interface { + if c == nil { + return nil + } + return c.restClient +} diff --git a/pkg/client/clientset/versioned/typed/tensorflow/v1alpha2/tfjob.go b/pkg/client/clientset/versioned/typed/tensorflow/v1alpha2/tfjob.go new file mode 100644 index 0000000000..9f66609712 --- /dev/null +++ b/pkg/client/clientset/versioned/typed/tensorflow/v1alpha2/tfjob.go @@ -0,0 +1,155 @@ +/* +Copyright 2018 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package v1alpha2 + +import ( + v1alpha2 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha2" + scheme "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned/scheme" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + types "k8s.io/apimachinery/pkg/types" + watch "k8s.io/apimachinery/pkg/watch" + rest "k8s.io/client-go/rest" +) + +// TFJobsGetter has a method to return a TFJobInterface. 
+// A group's client should implement this interface. +type TFJobsGetter interface { + TFJobs(namespace string) TFJobInterface +} + +// TFJobInterface has methods to work with TFJob resources. +type TFJobInterface interface { + Create(*v1alpha2.TFJob) (*v1alpha2.TFJob, error) + Update(*v1alpha2.TFJob) (*v1alpha2.TFJob, error) + Delete(name string, options *v1.DeleteOptions) error + DeleteCollection(options *v1.DeleteOptions, listOptions v1.ListOptions) error + Get(name string, options v1.GetOptions) (*v1alpha2.TFJob, error) + List(opts v1.ListOptions) (*v1alpha2.TFJobList, error) + Watch(opts v1.ListOptions) (watch.Interface, error) + Patch(name string, pt types.PatchType, data []byte, subresources ...string) (result *v1alpha2.TFJob, err error) + TFJobExpansion +} + +// tFJobs implements TFJobInterface +type tFJobs struct { + client rest.Interface + ns string +} + +// newTFJobs returns a TFJobs +func newTFJobs(c *TensorflowV1alpha2Client, namespace string) *tFJobs { + return &tFJobs{ + client: c.RESTClient(), + ns: namespace, + } +} + +// Get takes name of the tFJob, and returns the corresponding tFJob object, and an error if there is any. +func (c *tFJobs) Get(name string, options v1.GetOptions) (result *v1alpha2.TFJob, err error) { + result = &v1alpha2.TFJob{} + err = c.client.Get(). + Namespace(c.ns). + Resource("tfjobs"). + Name(name). + VersionedParams(&options, scheme.ParameterCodec). + Do(). + Into(result) + return +} + +// List takes label and field selectors, and returns the list of TFJobs that match those selectors. +func (c *tFJobs) List(opts v1.ListOptions) (result *v1alpha2.TFJobList, err error) { + result = &v1alpha2.TFJobList{} + err = c.client.Get(). + Namespace(c.ns). + Resource("tfjobs"). + VersionedParams(&opts, scheme.ParameterCodec). + Do(). + Into(result) + return +} + +// Watch returns a watch.Interface that watches the requested tFJobs. 
+func (c *tFJobs) Watch(opts v1.ListOptions) (watch.Interface, error) { + opts.Watch = true + return c.client.Get(). + Namespace(c.ns). + Resource("tfjobs"). + VersionedParams(&opts, scheme.ParameterCodec). + Watch() +} + +// Create takes the representation of a tFJob and creates it. Returns the server's representation of the tFJob, and an error, if there is any. +func (c *tFJobs) Create(tFJob *v1alpha2.TFJob) (result *v1alpha2.TFJob, err error) { + result = &v1alpha2.TFJob{} + err = c.client.Post(). + Namespace(c.ns). + Resource("tfjobs"). + Body(tFJob). + Do(). + Into(result) + return +} + +// Update takes the representation of a tFJob and updates it. Returns the server's representation of the tFJob, and an error, if there is any. +func (c *tFJobs) Update(tFJob *v1alpha2.TFJob) (result *v1alpha2.TFJob, err error) { + result = &v1alpha2.TFJob{} + err = c.client.Put(). + Namespace(c.ns). + Resource("tfjobs"). + Name(tFJob.Name). + Body(tFJob). + Do(). + Into(result) + return +} + +// Delete takes name of the tFJob and deletes it. Returns an error if one occurs. +func (c *tFJobs) Delete(name string, options *v1.DeleteOptions) error { + return c.client.Delete(). + Namespace(c.ns). + Resource("tfjobs"). + Name(name). + Body(options). + Do(). + Error() +} + +// DeleteCollection deletes a collection of objects. +func (c *tFJobs) DeleteCollection(options *v1.DeleteOptions, listOptions v1.ListOptions) error { + return c.client.Delete(). + Namespace(c.ns). + Resource("tfjobs"). + VersionedParams(&listOptions, scheme.ParameterCodec). + Body(options). + Do(). + Error() +} + +// Patch applies the patch and returns the patched tFJob. +func (c *tFJobs) Patch(name string, pt types.PatchType, data []byte, subresources ...string) (result *v1alpha2.TFJob, err error) { + result = &v1alpha2.TFJob{} + err = c.client.Patch(pt). + Namespace(c.ns). + Resource("tfjobs"). + SubResource(subresources...). + Name(name). + Body(data). + Do(). 
+ Into(result) + return +} diff --git a/pkg/client/informers/externalversions/generic.go b/pkg/client/informers/externalversions/generic.go index 470a80e618..f2671fdceb 100644 --- a/pkg/client/informers/externalversions/generic.go +++ b/pkg/client/informers/externalversions/generic.go @@ -18,7 +18,7 @@ package externalversions import ( "fmt" - v1alpha1 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha1" + v1alpha2 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha2" schema "k8s.io/apimachinery/pkg/runtime/schema" cache "k8s.io/client-go/tools/cache" ) @@ -49,9 +49,9 @@ func (f *genericInformer) Lister() cache.GenericLister { // TODO extend this to unknown resources with a client pool func (f *sharedInformerFactory) ForResource(resource schema.GroupVersionResource) (GenericInformer, error) { switch resource { - // Group=Kubeflow, Version=V1alpha1 - case v1alpha1.SchemeGroupVersion.WithResource("tfjobs"): - return &genericInformer{resource: resource.GroupResource(), informer: f.Kubeflow().V1alpha1().TFJobs().Informer()}, nil + // Group=Kubeflow, Version=V1alpha2 + case v1alpha2.SchemeGroupVersion.WithResource("tfjobs"): + return &genericInformer{resource: resource.GroupResource(), informer: f.Kubeflow().V1alpha2().TFJobs().Informer()}, nil } diff --git a/pkg/client/informers/externalversions/kubeflow/interface.go b/pkg/client/informers/externalversions/kubeflow/interface.go index 859091fe81..1753f9cb56 100644 --- a/pkg/client/informers/externalversions/kubeflow/interface.go +++ b/pkg/client/informers/externalversions/kubeflow/interface.go @@ -18,13 +18,13 @@ package kubeflow import ( internalinterfaces "github.com/kubeflow/tf-operator/pkg/client/informers/externalversions/internalinterfaces" - v1alpha1 "github.com/kubeflow/tf-operator/pkg/client/informers/externalversions/kubeflow/v1alpha1" + v1alpha2 "github.com/kubeflow/tf-operator/pkg/client/informers/externalversions/kubeflow/v1alpha2" ) // Interface provides access to each of this group's 
versions. type Interface interface { - // V1alpha1 provides access to shared informers for resources in V1alpha1. - V1alpha1() v1alpha1.Interface + // V1alpha2 provides access to shared informers for resources in V1alpha2. + V1alpha2() v1alpha2.Interface } type group struct { @@ -36,7 +36,7 @@ func New(f internalinterfaces.SharedInformerFactory) Interface { return &group{f} } -// V1alpha1 returns a new v1alpha1.Interface. -func (g *group) V1alpha1() v1alpha1.Interface { - return v1alpha1.New(g.SharedInformerFactory) +// V1alpha2 returns a new v1alpha2.Interface. +func (g *group) V1alpha2() v1alpha2.Interface { + return v1alpha2.New(g.SharedInformerFactory) } diff --git a/pkg/client/informers/externalversions/kubeflow/v1alpha2/interface.go b/pkg/client/informers/externalversions/kubeflow/v1alpha2/interface.go new file mode 100644 index 0000000000..f137397993 --- /dev/null +++ b/pkg/client/informers/externalversions/kubeflow/v1alpha2/interface.go @@ -0,0 +1,43 @@ +/* +Copyright 2018 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// This file was automatically generated by informer-gen + +package v1alpha2 + +import ( + internalinterfaces "github.com/kubeflow/tf-operator/pkg/client/informers/externalversions/internalinterfaces" +) + +// Interface provides access to all the informers in this group version. +type Interface interface { + // TFJobs returns a TFJobInformer. 
+ TFJobs() TFJobInformer +} + +type version struct { + internalinterfaces.SharedInformerFactory +} + +// New returns a new Interface. +func New(f internalinterfaces.SharedInformerFactory) Interface { + return &version{f} +} + +// TFJobs returns a TFJobInformer. +func (v *version) TFJobs() TFJobInformer { + return &tFJobInformer{factory: v.SharedInformerFactory} +} diff --git a/pkg/client/informers/externalversions/kubeflow/v1alpha2/tfjob.go b/pkg/client/informers/externalversions/kubeflow/v1alpha2/tfjob.go new file mode 100644 index 0000000000..d91c0d7335 --- /dev/null +++ b/pkg/client/informers/externalversions/kubeflow/v1alpha2/tfjob.go @@ -0,0 +1,73 @@ +/* +Copyright 2018 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// This file was automatically generated by informer-gen + +package v1alpha2 + +import ( + tensorflow_v1alpha2 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha2" + versioned "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned" + internalinterfaces "github.com/kubeflow/tf-operator/pkg/client/informers/externalversions/internalinterfaces" + v1alpha2 "github.com/kubeflow/tf-operator/pkg/client/listers/kubeflow/v1alpha2" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + runtime "k8s.io/apimachinery/pkg/runtime" + watch "k8s.io/apimachinery/pkg/watch" + cache "k8s.io/client-go/tools/cache" + time "time" +) + +// TFJobInformer provides access to a shared informer and lister for +// TFJobs. 
+type TFJobInformer interface { + Informer() cache.SharedIndexInformer + Lister() v1alpha2.TFJobLister +} + +type tFJobInformer struct { + factory internalinterfaces.SharedInformerFactory +} + +// NewTFJobInformer constructs a new informer for TFJob type. +// Always prefer using an informer factory to get a shared informer instead of getting an independent +// one. This reduces memory footprint and number of connections to the server. +func NewTFJobInformer(client versioned.Interface, namespace string, resyncPeriod time.Duration, indexers cache.Indexers) cache.SharedIndexInformer { + return cache.NewSharedIndexInformer( + &cache.ListWatch{ + ListFunc: func(options v1.ListOptions) (runtime.Object, error) { + return client.KubeflowV1alpha2().TFJobs(namespace).List(options) + }, + WatchFunc: func(options v1.ListOptions) (watch.Interface, error) { + return client.KubeflowV1alpha2().TFJobs(namespace).Watch(options) + }, + }, + &tensorflow_v1alpha2.TFJob{}, + resyncPeriod, + indexers, + ) +} + +func defaultTFJobInformer(client versioned.Interface, resyncPeriod time.Duration) cache.SharedIndexInformer { + return NewTFJobInformer(client, v1.NamespaceAll, resyncPeriod, cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc}) +} + +func (f *tFJobInformer) Informer() cache.SharedIndexInformer { + return f.factory.InformerFor(&tensorflow_v1alpha2.TFJob{}, defaultTFJobInformer) +} + +func (f *tFJobInformer) Lister() v1alpha2.TFJobLister { + return v1alpha2.NewTFJobLister(f.Informer().GetIndexer()) +} diff --git a/pkg/client/informers/externalversions/tensorflow/interface.go b/pkg/client/informers/externalversions/tensorflow/interface.go new file mode 100644 index 0000000000..c6dd1e50d0 --- /dev/null +++ b/pkg/client/informers/externalversions/tensorflow/interface.go @@ -0,0 +1,44 @@ +/* +Copyright 2018 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// This file was automatically generated by informer-gen + +package tensorflow + +import ( + internalinterfaces "github.com/kubeflow/tf-operator/pkg/client/informers/externalversions/internalinterfaces" + v1alpha2 "github.com/kubeflow/tf-operator/pkg/client/informers/externalversions/tensorflow/v1alpha2" +) + +// Interface provides access to each of this group's versions. +type Interface interface { + // V1alpha2 provides access to shared informers for resources in V1alpha2. + V1alpha2() v1alpha2.Interface +} + +type group struct { + internalinterfaces.SharedInformerFactory +} + +// New returns a new Interface. +func New(f internalinterfaces.SharedInformerFactory) Interface { + return &group{f} +} + +// V1alpha2 returns a new v1alpha2.Interface. +func (g *group) V1alpha2() v1alpha2.Interface { + return v1alpha2.New(g.SharedInformerFactory) +} diff --git a/pkg/client/informers/externalversions/tensorflow/v1alpha2/interface.go b/pkg/client/informers/externalversions/tensorflow/v1alpha2/interface.go new file mode 100644 index 0000000000..f137397993 --- /dev/null +++ b/pkg/client/informers/externalversions/tensorflow/v1alpha2/interface.go @@ -0,0 +1,43 @@ +/* +Copyright 2018 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// This file was automatically generated by informer-gen + +package v1alpha2 + +import ( + internalinterfaces "github.com/kubeflow/tf-operator/pkg/client/informers/externalversions/internalinterfaces" +) + +// Interface provides access to all the informers in this group version. +type Interface interface { + // TFJobs returns a TFJobInformer. + TFJobs() TFJobInformer +} + +type version struct { + internalinterfaces.SharedInformerFactory +} + +// New returns a new Interface. +func New(f internalinterfaces.SharedInformerFactory) Interface { + return &version{f} +} + +// TFJobs returns a TFJobInformer. +func (v *version) TFJobs() TFJobInformer { + return &tFJobInformer{factory: v.SharedInformerFactory} +} diff --git a/pkg/client/informers/externalversions/tensorflow/v1alpha2/tfjob.go b/pkg/client/informers/externalversions/tensorflow/v1alpha2/tfjob.go new file mode 100644 index 0000000000..d0bdd3f9ed --- /dev/null +++ b/pkg/client/informers/externalversions/tensorflow/v1alpha2/tfjob.go @@ -0,0 +1,73 @@ +/* +Copyright 2018 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ + +// This file was automatically generated by informer-gen + +package v1alpha2 + +import ( + tensorflow_v1alpha2 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha2" + versioned "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned" + internalinterfaces "github.com/kubeflow/tf-operator/pkg/client/informers/externalversions/internalinterfaces" + v1alpha2 "github.com/kubeflow/tf-operator/pkg/client/listers/tensorflow/v1alpha2" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + runtime "k8s.io/apimachinery/pkg/runtime" + watch "k8s.io/apimachinery/pkg/watch" + cache "k8s.io/client-go/tools/cache" + time "time" +) + +// TFJobInformer provides access to a shared informer and lister for +// TFJobs. +type TFJobInformer interface { + Informer() cache.SharedIndexInformer + Lister() v1alpha2.TFJobLister +} + +type tFJobInformer struct { + factory internalinterfaces.SharedInformerFactory +} + +// NewTFJobInformer constructs a new informer for TFJob type. +// Always prefer using an informer factory to get a shared informer instead of getting an independent +// one. This reduces memory footprint and number of connections to the server. 
+func NewTFJobInformer(client versioned.Interface, namespace string, resyncPeriod time.Duration, indexers cache.Indexers) cache.SharedIndexInformer { + return cache.NewSharedIndexInformer( + &cache.ListWatch{ + ListFunc: func(options v1.ListOptions) (runtime.Object, error) { + return client.TensorflowV1alpha2().TFJobs(namespace).List(options) + }, + WatchFunc: func(options v1.ListOptions) (watch.Interface, error) { + return client.TensorflowV1alpha2().TFJobs(namespace).Watch(options) + }, + }, + &tensorflow_v1alpha2.TFJob{}, + resyncPeriod, + indexers, + ) +} + +func defaultTFJobInformer(client versioned.Interface, resyncPeriod time.Duration) cache.SharedIndexInformer { + return NewTFJobInformer(client, v1.NamespaceAll, resyncPeriod, cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc}) +} + +func (f *tFJobInformer) Informer() cache.SharedIndexInformer { + return f.factory.InformerFor(&tensorflow_v1alpha2.TFJob{}, defaultTFJobInformer) +} + +func (f *tFJobInformer) Lister() v1alpha2.TFJobLister { + return v1alpha2.NewTFJobLister(f.Informer().GetIndexer()) +} diff --git a/pkg/client/listers/kubeflow/v1alpha2/expansion_generated.go b/pkg/client/listers/kubeflow/v1alpha2/expansion_generated.go new file mode 100644 index 0000000000..6880fff70c --- /dev/null +++ b/pkg/client/listers/kubeflow/v1alpha2/expansion_generated.go @@ -0,0 +1,27 @@ +/* +Copyright 2018 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +// This file was automatically generated by lister-gen + +package v1alpha2 + +// TFJobListerExpansion allows custom methods to be added to +// TFJobLister. +type TFJobListerExpansion interface{} + +// TFJobNamespaceListerExpansion allows custom methods to be added to +// TFJobNamespaceLister. +type TFJobNamespaceListerExpansion interface{} diff --git a/pkg/client/listers/kubeflow/v1alpha2/tfjob.go b/pkg/client/listers/kubeflow/v1alpha2/tfjob.go new file mode 100644 index 0000000000..601b03a7d8 --- /dev/null +++ b/pkg/client/listers/kubeflow/v1alpha2/tfjob.go @@ -0,0 +1,94 @@ +/* +Copyright 2018 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// This file was automatically generated by lister-gen + +package v1alpha2 + +import ( + v1alpha2 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha2" + "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/client-go/tools/cache" +) + +// TFJobLister helps list TFJobs. +type TFJobLister interface { + // List lists all TFJobs in the indexer. + List(selector labels.Selector) (ret []*v1alpha2.TFJob, err error) + // TFJobs returns an object that can list and get TFJobs. + TFJobs(namespace string) TFJobNamespaceLister + TFJobListerExpansion +} + +// tFJobLister implements the TFJobLister interface. +type tFJobLister struct { + indexer cache.Indexer +} + +// NewTFJobLister returns a new TFJobLister. 
+func NewTFJobLister(indexer cache.Indexer) TFJobLister { + return &tFJobLister{indexer: indexer} +} + +// List lists all TFJobs in the indexer. +func (s *tFJobLister) List(selector labels.Selector) (ret []*v1alpha2.TFJob, err error) { + err = cache.ListAll(s.indexer, selector, func(m interface{}) { + ret = append(ret, m.(*v1alpha2.TFJob)) + }) + return ret, err +} + +// TFJobs returns an object that can list and get TFJobs. +func (s *tFJobLister) TFJobs(namespace string) TFJobNamespaceLister { + return tFJobNamespaceLister{indexer: s.indexer, namespace: namespace} +} + +// TFJobNamespaceLister helps list and get TFJobs. +type TFJobNamespaceLister interface { + // List lists all TFJobs in the indexer for a given namespace. + List(selector labels.Selector) (ret []*v1alpha2.TFJob, err error) + // Get retrieves the TFJob from the indexer for a given namespace and name. + Get(name string) (*v1alpha2.TFJob, error) + TFJobNamespaceListerExpansion +} + +// tFJobNamespaceLister implements the TFJobNamespaceLister +// interface. +type tFJobNamespaceLister struct { + indexer cache.Indexer + namespace string +} + +// List lists all TFJobs in the indexer for a given namespace. +func (s tFJobNamespaceLister) List(selector labels.Selector) (ret []*v1alpha2.TFJob, err error) { + err = cache.ListAllByNamespace(s.indexer, s.namespace, selector, func(m interface{}) { + ret = append(ret, m.(*v1alpha2.TFJob)) + }) + return ret, err +} + +// Get retrieves the TFJob from the indexer for a given namespace and name. 
+func (s tFJobNamespaceLister) Get(name string) (*v1alpha2.TFJob, error) { + obj, exists, err := s.indexer.GetByKey(s.namespace + "/" + name) + if err != nil { + return nil, err + } + if !exists { + return nil, errors.NewNotFound(v1alpha2.Resource("tfjob"), name) + } + return obj.(*v1alpha2.TFJob), nil +} diff --git a/pkg/client/listers/tensorflow/v1alpha2/expansion_generated.go b/pkg/client/listers/tensorflow/v1alpha2/expansion_generated.go new file mode 100644 index 0000000000..6880fff70c --- /dev/null +++ b/pkg/client/listers/tensorflow/v1alpha2/expansion_generated.go @@ -0,0 +1,27 @@ +/* +Copyright 2018 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// This file was automatically generated by lister-gen + +package v1alpha2 + +// TFJobListerExpansion allows custom methods to be added to +// TFJobLister. +type TFJobListerExpansion interface{} + +// TFJobNamespaceListerExpansion allows custom methods to be added to +// TFJobNamespaceLister. +type TFJobNamespaceListerExpansion interface{} diff --git a/pkg/client/listers/tensorflow/v1alpha2/tfjob.go b/pkg/client/listers/tensorflow/v1alpha2/tfjob.go new file mode 100644 index 0000000000..601b03a7d8 --- /dev/null +++ b/pkg/client/listers/tensorflow/v1alpha2/tfjob.go @@ -0,0 +1,94 @@ +/* +Copyright 2018 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// This file was automatically generated by lister-gen + +package v1alpha2 + +import ( + v1alpha2 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha2" + "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/client-go/tools/cache" +) + +// TFJobLister helps list TFJobs. +type TFJobLister interface { + // List lists all TFJobs in the indexer. + List(selector labels.Selector) (ret []*v1alpha2.TFJob, err error) + // TFJobs returns an object that can list and get TFJobs. + TFJobs(namespace string) TFJobNamespaceLister + TFJobListerExpansion +} + +// tFJobLister implements the TFJobLister interface. +type tFJobLister struct { + indexer cache.Indexer +} + +// NewTFJobLister returns a new TFJobLister. +func NewTFJobLister(indexer cache.Indexer) TFJobLister { + return &tFJobLister{indexer: indexer} +} + +// List lists all TFJobs in the indexer. +func (s *tFJobLister) List(selector labels.Selector) (ret []*v1alpha2.TFJob, err error) { + err = cache.ListAll(s.indexer, selector, func(m interface{}) { + ret = append(ret, m.(*v1alpha2.TFJob)) + }) + return ret, err +} + +// TFJobs returns an object that can list and get TFJobs. +func (s *tFJobLister) TFJobs(namespace string) TFJobNamespaceLister { + return tFJobNamespaceLister{indexer: s.indexer, namespace: namespace} +} + +// TFJobNamespaceLister helps list and get TFJobs. +type TFJobNamespaceLister interface { + // List lists all TFJobs in the indexer for a given namespace. 
+ List(selector labels.Selector) (ret []*v1alpha2.TFJob, err error) + // Get retrieves the TFJob from the indexer for a given namespace and name. + Get(name string) (*v1alpha2.TFJob, error) + TFJobNamespaceListerExpansion +} + +// tFJobNamespaceLister implements the TFJobNamespaceLister +// interface. +type tFJobNamespaceLister struct { + indexer cache.Indexer + namespace string +} + +// List lists all TFJobs in the indexer for a given namespace. +func (s tFJobNamespaceLister) List(selector labels.Selector) (ret []*v1alpha2.TFJob, err error) { + err = cache.ListAllByNamespace(s.indexer, s.namespace, selector, func(m interface{}) { + ret = append(ret, m.(*v1alpha2.TFJob)) + }) + return ret, err +} + +// Get retrieves the TFJob from the indexer for a given namespace and name. +func (s tFJobNamespaceLister) Get(name string) (*v1alpha2.TFJob, error) { + obj, exists, err := s.indexer.GetByKey(s.namespace + "/" + name) + if err != nil { + return nil, err + } + if !exists { + return nil, errors.NewNotFound(v1alpha2.Resource("tfjob"), name) + } + return obj.(*v1alpha2.TFJob), nil +} From e559cea720054438a23e928e97d829705b8d4211 Mon Sep 17 00:00:00 2001 From: Penghao Cen Date: Thu, 15 Mar 2018 11:50:59 +0800 Subject: [PATCH 02/24] Add options kubeconfig/master/threadiness and remove deprecated options --- cmd/tf-operator/app/options/options.go | 33 +++++++++++++++----------- cmd/tf-operator/main.go | 2 +- 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/cmd/tf-operator/app/options/options.go b/cmd/tf-operator/app/options/options.go index 5f7cb90e43..f7e152a13d 100644 --- a/cmd/tf-operator/app/options/options.go +++ b/cmd/tf-operator/app/options/options.go @@ -16,17 +16,15 @@ package options import ( "flag" - "time" ) // ServerOption is the main context object for the controller manager. 
type ServerOption struct { - ChaosLevel int - ControllerConfigFile string - PrintVersion bool - GCInterval time.Duration - JsonLogFormat bool - EnableGangScheduling bool + Kubeconfig string + MasterURL string + Threadiness int + PrintVersion bool + JSONLogFormat bool } // NewServerOption creates a new CMServer with a default config. @@ -35,13 +33,20 @@ func NewServerOption() *ServerOption { return &s } -// AddFlags adds flags for a specific CMServer to the specified FlagSet +// AddFlags adds flags for a specific CMServer to the specified FlagSet. func (s *ServerOption) AddFlags(fs *flag.FlagSet) { - // chaos level will be removed once we have a formal tool to inject failures. - fs.IntVar(&s.ChaosLevel, "chaos-level", -1, "DO NOT USE IN PRODUCTION - level of chaos injected into the TFJob created by the operator.") + fs.StringVar(&s.Kubeconfig, "kubeconfig", "~/.kube/config", + `Path to a kubeconfig, only required if out-of-cluster.`) + + fs.StringVar(&s.MasterURL, "master", "", + `The url of the Kubernetes API server, + will overrides any value in kubeconfig, only required if out-of-cluster.`) + + fs.IntVar(&s.Threadiness, "threadiness", 2, + `How many threads to process the main logic`) + fs.BoolVar(&s.PrintVersion, "version", false, "Show version and quit") - fs.DurationVar(&s.GCInterval, "gc-interval", 10*time.Minute, "GC interval") - fs.StringVar(&s.ControllerConfigFile, "controller-config-file", "", "Path to file containing the controller config.") - fs.BoolVar(&s.JsonLogFormat, "json-log-format", true, "Set true to use json style log format. Set false to use plaintext style log format") - fs.BoolVar(&s.EnableGangScheduling, "enable-gang-scheduling", false, "Set true to enable gang scheduling by kube-arbitrator.") + + fs.BoolVar(&s.JSONLogFormat, "json-log-format", true, + "Set true to use json style log format. 
Set false to use plaintext style log format") } diff --git a/cmd/tf-operator/main.go b/cmd/tf-operator/main.go index d01b288526..a91fe1aa2f 100644 --- a/cmd/tf-operator/main.go +++ b/cmd/tf-operator/main.go @@ -37,7 +37,7 @@ func main() { flag.Parse() - if s.JsonLogFormat { + if s.JSONLogFormat { // Output logs in a json format so that it can be parsed by services like Stackdriver log.SetFormatter(&log.JSONFormatter{}) } From 278ceb7685f157c77e9e86d27f6a7c31ed425f36 Mon Sep 17 00:00:00 2001 From: Penghao Cen Date: Thu, 15 Mar 2018 10:30:29 +0800 Subject: [PATCH 03/24] Update copyright with new boilerplate --- .../v1alpha2/zz_generated.deepcopy.go | 28 +++++++++--------- .../versioned/typed/kubeflow/v1alpha2/doc.go | 28 +++++++++--------- .../typed/kubeflow/v1alpha2/fake/doc.go | 28 +++++++++--------- .../v1alpha2/fake/fake_kubeflow_client.go | 29 +++++++++---------- .../kubeflow/v1alpha2/fake/fake_tfjob.go | 29 +++++++++---------- .../kubeflow/v1alpha2/generated_expansion.go | 29 +++++++++---------- .../kubeflow/v1alpha2/kubeflow_client.go | 29 +++++++++---------- .../typed/kubeflow/v1alpha2/tfjob.go | 29 +++++++++---------- .../kubeflow/v1alpha2/interface.go | 28 +++++++++--------- .../kubeflow/v1alpha2/tfjob.go | 28 +++++++++--------- .../kubeflow/v1alpha2/expansion_generated.go | 28 +++++++++--------- pkg/client/listers/kubeflow/v1alpha2/tfjob.go | 28 +++++++++--------- 12 files changed, 156 insertions(+), 185 deletions(-) diff --git a/pkg/apis/tensorflow/v1alpha2/zz_generated.deepcopy.go b/pkg/apis/tensorflow/v1alpha2/zz_generated.deepcopy.go index cae276f299..47cc880955 100644 --- a/pkg/apis/tensorflow/v1alpha2/zz_generated.deepcopy.go +++ b/pkg/apis/tensorflow/v1alpha2/zz_generated.deepcopy.go @@ -1,20 +1,18 @@ // +build !ignore_autogenerated -/* -Copyright 2018 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. // This file was autogenerated by deepcopy-gen. Do not edit it manually! diff --git a/pkg/client/clientset/versioned/typed/kubeflow/v1alpha2/doc.go b/pkg/client/clientset/versioned/typed/kubeflow/v1alpha2/doc.go index ef161aeae6..a74a243e30 100644 --- a/pkg/client/clientset/versioned/typed/kubeflow/v1alpha2/doc.go +++ b/pkg/client/clientset/versioned/typed/kubeflow/v1alpha2/doc.go @@ -1,18 +1,16 @@ -/* -Copyright 2018 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. // This package is generated by client-gen with custom arguments. diff --git a/pkg/client/clientset/versioned/typed/kubeflow/v1alpha2/fake/doc.go b/pkg/client/clientset/versioned/typed/kubeflow/v1alpha2/fake/doc.go index d4003d501b..41d860c548 100644 --- a/pkg/client/clientset/versioned/typed/kubeflow/v1alpha2/fake/doc.go +++ b/pkg/client/clientset/versioned/typed/kubeflow/v1alpha2/fake/doc.go @@ -1,18 +1,16 @@ -/* -Copyright 2018 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. // This package is generated by client-gen with custom arguments. diff --git a/pkg/client/clientset/versioned/typed/kubeflow/v1alpha2/fake/fake_kubeflow_client.go b/pkg/client/clientset/versioned/typed/kubeflow/v1alpha2/fake/fake_kubeflow_client.go index 7338a5d7da..bb1d2e3d0a 100644 --- a/pkg/client/clientset/versioned/typed/kubeflow/v1alpha2/fake/fake_kubeflow_client.go +++ b/pkg/client/clientset/versioned/typed/kubeflow/v1alpha2/fake/fake_kubeflow_client.go @@ -1,19 +1,16 @@ -/* -Copyright 2018 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. package fake import ( diff --git a/pkg/client/clientset/versioned/typed/kubeflow/v1alpha2/fake/fake_tfjob.go b/pkg/client/clientset/versioned/typed/kubeflow/v1alpha2/fake/fake_tfjob.go index 796e1dfff0..acf7b2b10f 100644 --- a/pkg/client/clientset/versioned/typed/kubeflow/v1alpha2/fake/fake_tfjob.go +++ b/pkg/client/clientset/versioned/typed/kubeflow/v1alpha2/fake/fake_tfjob.go @@ -1,19 +1,16 @@ -/* -Copyright 2018 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
package fake import ( diff --git a/pkg/client/clientset/versioned/typed/kubeflow/v1alpha2/generated_expansion.go b/pkg/client/clientset/versioned/typed/kubeflow/v1alpha2/generated_expansion.go index 7e99eae6c8..328c1bbb10 100644 --- a/pkg/client/clientset/versioned/typed/kubeflow/v1alpha2/generated_expansion.go +++ b/pkg/client/clientset/versioned/typed/kubeflow/v1alpha2/generated_expansion.go @@ -1,19 +1,16 @@ -/* -Copyright 2018 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
package v1alpha2 type TFJobExpansion interface{} diff --git a/pkg/client/clientset/versioned/typed/kubeflow/v1alpha2/kubeflow_client.go b/pkg/client/clientset/versioned/typed/kubeflow/v1alpha2/kubeflow_client.go index 5d9041f2b3..5f8b649e49 100644 --- a/pkg/client/clientset/versioned/typed/kubeflow/v1alpha2/kubeflow_client.go +++ b/pkg/client/clientset/versioned/typed/kubeflow/v1alpha2/kubeflow_client.go @@ -1,19 +1,16 @@ -/* -Copyright 2018 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
package v1alpha2 import ( diff --git a/pkg/client/clientset/versioned/typed/kubeflow/v1alpha2/tfjob.go b/pkg/client/clientset/versioned/typed/kubeflow/v1alpha2/tfjob.go index 5a07e9730d..273e9e1bdf 100644 --- a/pkg/client/clientset/versioned/typed/kubeflow/v1alpha2/tfjob.go +++ b/pkg/client/clientset/versioned/typed/kubeflow/v1alpha2/tfjob.go @@ -1,19 +1,16 @@ -/* -Copyright 2018 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. package v1alpha2 import ( diff --git a/pkg/client/informers/externalversions/kubeflow/v1alpha2/interface.go b/pkg/client/informers/externalversions/kubeflow/v1alpha2/interface.go index f137397993..8238b87109 100644 --- a/pkg/client/informers/externalversions/kubeflow/v1alpha2/interface.go +++ b/pkg/client/informers/externalversions/kubeflow/v1alpha2/interface.go @@ -1,18 +1,16 @@ -/* -Copyright 2018 The Kubernetes Authors. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. // This file was automatically generated by informer-gen diff --git a/pkg/client/informers/externalversions/kubeflow/v1alpha2/tfjob.go b/pkg/client/informers/externalversions/kubeflow/v1alpha2/tfjob.go index d91c0d7335..f270ffbbe6 100644 --- a/pkg/client/informers/externalversions/kubeflow/v1alpha2/tfjob.go +++ b/pkg/client/informers/externalversions/kubeflow/v1alpha2/tfjob.go @@ -1,18 +1,16 @@ -/* -Copyright 2018 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. // This file was automatically generated by informer-gen diff --git a/pkg/client/listers/kubeflow/v1alpha2/expansion_generated.go b/pkg/client/listers/kubeflow/v1alpha2/expansion_generated.go index 6880fff70c..133df30978 100644 --- a/pkg/client/listers/kubeflow/v1alpha2/expansion_generated.go +++ b/pkg/client/listers/kubeflow/v1alpha2/expansion_generated.go @@ -1,18 +1,16 @@ -/* -Copyright 2018 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. // This file was automatically generated by lister-gen diff --git a/pkg/client/listers/kubeflow/v1alpha2/tfjob.go b/pkg/client/listers/kubeflow/v1alpha2/tfjob.go index 601b03a7d8..eb0d0202eb 100644 --- a/pkg/client/listers/kubeflow/v1alpha2/tfjob.go +++ b/pkg/client/listers/kubeflow/v1alpha2/tfjob.go @@ -1,18 +1,16 @@ -/* -Copyright 2018 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. // This file was automatically generated by lister-gen From de18f5287e2dd7bc45fd3ec8d1fb947946aa1d02 Mon Sep 17 00:00:00 2001 From: Penghao Cen Date: Thu, 15 Mar 2018 16:32:15 +0800 Subject: [PATCH 04/24] Cleanup v1alpha1 code --- pkg/apis/tensorflow/helper/helpers.go | 119 ---- pkg/apis/tensorflow/helper/helpers_test.go | 248 --------- pkg/apis/tensorflow/validation/validation.go | 79 --- .../tensorflow/validation/validation_test.go | 113 ---- pkg/trainer/labels.go | 33 -- pkg/trainer/replicas.go | 509 ------------------ pkg/trainer/replicas_test.go | 368 ------------- pkg/trainer/training.go | 468 ---------------- pkg/trainer/training_test.go | 489 ----------------- pkg/util/k8sutil/k8sutil.go | 120 ----- pkg/util/util.go | 74 --- 11 files changed, 2620 deletions(-) delete mode 100644 pkg/apis/tensorflow/helper/helpers.go delete mode 100644 pkg/apis/tensorflow/helper/helpers_test.go delete mode 100644 pkg/apis/tensorflow/validation/validation.go delete mode 100644 pkg/apis/tensorflow/validation/validation_test.go delete mode 100644 pkg/trainer/labels.go delete mode 100644 pkg/trainer/replicas.go delete mode 100644 pkg/trainer/replicas_test.go delete mode 100644 pkg/trainer/training.go delete mode 100644 pkg/trainer/training_test.go delete mode 100644 pkg/util/k8sutil/k8sutil.go delete mode 100644 pkg/util/util.go diff --git a/pkg/apis/tensorflow/helper/helpers.go b/pkg/apis/tensorflow/helper/helpers.go deleted file mode 100644 index 632194e2c4..0000000000 --- a/pkg/apis/tensorflow/helper/helpers.go +++ /dev/null @@ -1,119 +0,0 @@ -// Copyright 
2018 The Kubeflow Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package helper - -import ( - "fmt" - - tfv1 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha1" - "github.com/kubeflow/tf-operator/pkg/util" - "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime/schema" -) - -var ( - groupVersionKind = schema.GroupVersionKind{ - Group: tfv1.GroupName, - Version: tfv1.GroupVersion, - Kind: tfv1.TFJobResourceKind, - } -) - -// AsOwner make OwnerReference according to the parameter -func AsOwner(tfJob *tfv1.TFJob) metav1.OwnerReference { - trueVar := true - // Both api.OwnerReference and metatypes.OwnerReference are combined into that. - return metav1.OwnerReference{ - APIVersion: groupVersionKind.GroupVersion().String(), - Kind: groupVersionKind.Kind, - Name: tfJob.ObjectMeta.Name, - UID: tfJob.ObjectMeta.UID, - Controller: &trueVar, - BlockOwnerDeletion: &trueVar, - } -} - -// ConfigureAcceleratorsForTFJobSpec adds any accelerator specific configuration to the pods. -func ConfigureAcceleratorsForTFJobSpec(c *tfv1.TFJobSpec, accelerators map[string]tfv1.AcceleratorConfig) error { - for _, r := range c.ReplicaSpecs { - if r.Template == nil { - return fmt.Errorf("Replica is missing Template; %v", util.Pformat(r)) - } - for i, c := range r.Template.Spec.Containers { - if c.Name == tfv1.DefaultTFContainer { - // Identify the accelerators attached to this container. 
- a := map[string]tfv1.AcceleratorConfig{} - - lists := []v1.ResourceList{c.Resources.Limits, c.Resources.Requests} - for _, resources := range lists { - for name, _ := range resources { - - if _, ok := accelerators[string(name)]; !ok { - continue - } - - // Add the expected mounts to the pods. - a[string(name)] = accelerators[string(name)] - } - } - - // Add accelerator information to the pod. - for _, config := range a { - for _, v := range config.Volumes { - r.Template.Spec.Volumes = append(r.Template.Spec.Volumes, - v1.Volume{ - Name: v.Name, - VolumeSource: v1.VolumeSource{ - HostPath: &v1.HostPathVolumeSource{ - Path: v.HostPath, - }, - }, - }) - c.VolumeMounts = append(c.VolumeMounts, v1.VolumeMount{ - Name: v.Name, - MountPath: v.MountPath, - }) - } - - for _, envVar := range config.EnvVars { - c.Env = append(c.Env, v1.EnvVar{ - Name: envVar.Name, - Value: envVar.Value, - }) - } - } - r.Template.Spec.Containers[i] = c - break - } - } - } - return nil -} - -// Cleanup cleans up user passed spec, e.g. defaulting, transforming fields. -// TODO: move this to admission controller -func Cleanup(c *tfv1.TFJobSpec) { - // TODO(jlewi): Add logic to cleanup user provided spec; e.g. by filling in defaults. - // We should have default container images so user doesn't have to provide these. -} - -func CRDName() string { - return fmt.Sprintf("%s.%s", tfv1.CRDKindPlural, tfv1.CRDGroup) -} - -func scalingReason(from, to int) string { - return fmt.Sprintf("Current cluster size: %d, desired cluster size: %d", from, to) -} diff --git a/pkg/apis/tensorflow/helper/helpers_test.go b/pkg/apis/tensorflow/helper/helpers_test.go deleted file mode 100644 index 9380555eaa..0000000000 --- a/pkg/apis/tensorflow/helper/helpers_test.go +++ /dev/null @@ -1,248 +0,0 @@ -// Copyright 2018 The Kubeflow Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package helper - -import ( - "reflect" - "testing" - - "github.com/gogo/protobuf/proto" - tfv1 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha1" - "github.com/kubeflow/tf-operator/pkg/util" - "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/resource" -) - -func TestAddAccelertor(t *testing.T) { - type testCase struct { - in *tfv1.TFJobSpec - expected *tfv1.TFJobSpec - config map[string]tfv1.AcceleratorConfig - } - - testCases := []testCase{ - // Case 1 checks that we look at requests. - { - in: &tfv1.TFJobSpec{ - ReplicaSpecs: []*tfv1.TFReplicaSpec{ - { - Replicas: proto.Int32(2), - TFPort: proto.Int32(10), - Template: &v1.PodTemplateSpec{ - Spec: v1.PodSpec{ - Containers: []v1.Container{ - { - Name: "tensorflow", - Resources: v1.ResourceRequirements{ - Requests: map[v1.ResourceName]resource.Quantity{ - "nvidia-gpu": resource.MustParse("1"), - }, - }, - }, - }, - }, - }, - TFReplicaType: tfv1.PS, - }, - }, - }, - expected: &tfv1.TFJobSpec{ - ReplicaSpecs: []*tfv1.TFReplicaSpec{ - { - Replicas: proto.Int32(2), - TFPort: proto.Int32(10), - Template: &v1.PodTemplateSpec{ - Spec: v1.PodSpec{ - Containers: []v1.Container{ - { - Name: "tensorflow", - Resources: v1.ResourceRequirements{ - Requests: map[v1.ResourceName]resource.Quantity{ - "nvidia-gpu": resource.MustParse("1"), - }, - }, - VolumeMounts: []v1.VolumeMount{ - { - Name: "cuda-lib", - MountPath: "/usr/local/cuda", - }, - }, - }, - }, - Volumes: []v1.Volume{ - { - Name: "cuda-lib", - VolumeSource: v1.VolumeSource{ - HostPath: &v1.HostPathVolumeSource{ - Path: "/home/cuda", - 
}, - }, - }, - }, - }, - }, - TFReplicaType: tfv1.PS, - }, - }, - }, - config: map[string]tfv1.AcceleratorConfig{ - "nvidia-gpu": tfv1.AcceleratorConfig{ - Volumes: []tfv1.AcceleratorVolume{ - { - Name: "cuda-lib", - HostPath: "/home/cuda", - MountPath: "/usr/local/cuda", - }, - }, - }, - }, - }, - // Case 2 checks that we look at limit. - { - in: &tfv1.TFJobSpec{ - ReplicaSpecs: []*tfv1.TFReplicaSpec{ - { - Replicas: proto.Int32(2), - TFPort: proto.Int32(10), - Template: &v1.PodTemplateSpec{ - Spec: v1.PodSpec{ - Containers: []v1.Container{ - { - Name: "tensorflow", - Resources: v1.ResourceRequirements{ - Limits: map[v1.ResourceName]resource.Quantity{ - "nvidia-gpu": resource.MustParse("1"), - }, - }, - }, - }, - }, - }, - TFReplicaType: tfv1.PS, - }, - }, - }, - expected: &tfv1.TFJobSpec{ - ReplicaSpecs: []*tfv1.TFReplicaSpec{ - { - Replicas: proto.Int32(2), - TFPort: proto.Int32(10), - Template: &v1.PodTemplateSpec{ - Spec: v1.PodSpec{ - Containers: []v1.Container{ - { - Name: "tensorflow", - Resources: v1.ResourceRequirements{ - Limits: map[v1.ResourceName]resource.Quantity{ - "nvidia-gpu": resource.MustParse("1"), - }, - }, - VolumeMounts: []v1.VolumeMount{ - { - Name: "cuda-lib", - MountPath: "/usr/local/cuda", - }, - }, - }, - }, - Volumes: []v1.Volume{ - { - Name: "cuda-lib", - VolumeSource: v1.VolumeSource{ - HostPath: &v1.HostPathVolumeSource{ - Path: "/home/cuda", - }, - }, - }, - }, - }, - }, - TFReplicaType: tfv1.PS, - }, - }, - }, - config: map[string]tfv1.AcceleratorConfig{ - "nvidia-gpu": tfv1.AcceleratorConfig{ - Volumes: []tfv1.AcceleratorVolume{ - { - Name: "cuda-lib", - HostPath: "/home/cuda", - MountPath: "/usr/local/cuda", - }, - }, - }, - }, - }, - // Case 3 no GPUs - { - in: &tfv1.TFJobSpec{ - ReplicaSpecs: []*tfv1.TFReplicaSpec{ - { - Replicas: proto.Int32(2), - TFPort: proto.Int32(10), - Template: &v1.PodTemplateSpec{ - Spec: v1.PodSpec{ - Containers: []v1.Container{ - { - Name: "tensorflow", - }, - }, - }, - }, - TFReplicaType: tfv1.PS, - 
}, - }, - }, - expected: &tfv1.TFJobSpec{ - ReplicaSpecs: []*tfv1.TFReplicaSpec{ - { - Replicas: proto.Int32(2), - TFPort: proto.Int32(10), - Template: &v1.PodTemplateSpec{ - Spec: v1.PodSpec{ - Containers: []v1.Container{ - { - Name: "tensorflow", - }, - }, - }, - }, - TFReplicaType: tfv1.PS, - }, - }, - }, - config: map[string]tfv1.AcceleratorConfig{ - "nvidia-gpu": tfv1.AcceleratorConfig{ - Volumes: []tfv1.AcceleratorVolume{ - { - Name: "cuda-lib", - HostPath: "/home/cuda", - MountPath: "/usr/local/cuda", - }, - }, - }, - }, - }, - } - - for _, c := range testCases { - if err := ConfigureAcceleratorsForTFJobSpec(c.in, c.config); err != nil { - t.Errorf("ConfigureAccelerators error; %v", err) - } - if !reflect.DeepEqual(c.in, c.expected) { - t.Errorf("Want\n%v; Got\n %v", util.Pformat(c.expected), util.Pformat(c.in)) - } - } -} diff --git a/pkg/apis/tensorflow/validation/validation.go b/pkg/apis/tensorflow/validation/validation.go deleted file mode 100644 index fa0f012319..0000000000 --- a/pkg/apis/tensorflow/validation/validation.go +++ /dev/null @@ -1,79 +0,0 @@ -// Copyright 2018 The Kubeflow Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package validation - -import ( - "errors" - "fmt" - - tfv1 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha1" - "github.com/kubeflow/tf-operator/pkg/util" -) - -// ValidateTFJobSpec checks that the TFJobSpec is valid. 
-func ValidateTFJobSpec(c *tfv1.TFJobSpec) error { - if c.TerminationPolicy == nil || c.TerminationPolicy.Chief == nil { - return fmt.Errorf("invalid termination policy: %v", c.TerminationPolicy) - } - - chiefExists := false - - // Check that each replica has a TensorFlow container and a chief. - for _, r := range c.ReplicaSpecs { - found := false - if r.Template == nil { - return fmt.Errorf("Replica is missing Template; %v", util.Pformat(r)) - } - - if r.TFReplicaType == tfv1.TFReplicaType(c.TerminationPolicy.Chief.ReplicaName) { - chiefExists = true - } - - if r.TFPort == nil { - return errors.New("tfReplicaSpec.TFPort can't be nil.") - } - - // Make sure the replica type is valid. - validReplicaTypes := []tfv1.TFReplicaType{tfv1.MASTER, tfv1.PS, tfv1.WORKER} - - isValidReplicaType := false - for _, t := range validReplicaTypes { - if t == r.TFReplicaType { - isValidReplicaType = true - break - } - } - - if !isValidReplicaType { - return fmt.Errorf("tfReplicaSpec.TFReplicaType is %v but must be one of %v", r.TFReplicaType, validReplicaTypes) - } - - for _, c := range r.Template.Spec.Containers { - if c.Name == tfv1.DefaultTFContainer { - found = true - break - } - } - if !found { - return fmt.Errorf("Replica type %v is missing a container named %s", r.TFReplicaType, tfv1.DefaultTFContainer) - } - } - - if !chiefExists { - return fmt.Errorf("Missing ReplicaSpec for chief: %v", c.TerminationPolicy.Chief.ReplicaName) - } - - return nil -} diff --git a/pkg/apis/tensorflow/validation/validation_test.go b/pkg/apis/tensorflow/validation/validation_test.go deleted file mode 100644 index 670c0b4517..0000000000 --- a/pkg/apis/tensorflow/validation/validation_test.go +++ /dev/null @@ -1,113 +0,0 @@ -// Copyright 2018 The Kubeflow Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package validation - -import ( - "testing" - - tfv1 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha1" - - "github.com/gogo/protobuf/proto" - "k8s.io/api/core/v1" -) - -func TestValidate(t *testing.T) { - type testCase struct { - in *tfv1.TFJobSpec - expectingError bool - } - - testCases := []testCase{ - { - in: &tfv1.TFJobSpec{ - ReplicaSpecs: []*tfv1.TFReplicaSpec{ - { - Template: &v1.PodTemplateSpec{ - Spec: v1.PodSpec{ - Containers: []v1.Container{ - { - Name: "tensorflow", - }, - }, - }, - }, - TFReplicaType: tfv1.MASTER, - Replicas: proto.Int32(1), - }, - }, - TFImage: "tensorflow/tensorflow:1.3.0", - }, - expectingError: false, - }, - { - in: &tfv1.TFJobSpec{ - ReplicaSpecs: []*tfv1.TFReplicaSpec{ - { - Template: &v1.PodTemplateSpec{ - Spec: v1.PodSpec{ - Containers: []v1.Container{ - { - Name: "tensorflow", - }, - }, - }, - }, - TFReplicaType: tfv1.WORKER, - Replicas: proto.Int32(1), - }, - }, - TFImage: "tensorflow/tensorflow:1.3.0", - }, - expectingError: true, - }, - { - in: &tfv1.TFJobSpec{ - ReplicaSpecs: []*tfv1.TFReplicaSpec{ - { - Template: &v1.PodTemplateSpec{ - Spec: v1.PodSpec{ - Containers: []v1.Container{ - { - Name: "tensorflow", - }, - }, - }, - }, - TFReplicaType: tfv1.WORKER, - Replicas: proto.Int32(1), - }, - }, - TFImage: "tensorflow/tensorflow:1.3.0", - TerminationPolicy: &tfv1.TerminationPolicySpec{ - Chief: &tfv1.ChiefSpec{ - ReplicaName: "WORKER", - ReplicaIndex: 0, - }, - }, - }, - expectingError: false, - }, - } - - for _, c := range testCases { - job := &tfv1.TFJob{ - Spec: *c.in, - } - 
tfv1.SetObjectDefaults_TFJob(job) - if err := ValidateTFJobSpec(&job.Spec); (err != nil) != c.expectingError { - t.Errorf("unexpected validation result: %v", err) - } - } -} diff --git a/pkg/trainer/labels.go b/pkg/trainer/labels.go deleted file mode 100644 index 1e1a698f32..0000000000 --- a/pkg/trainer/labels.go +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright 2018 The Kubeflow Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package trainer - -import ( - "fmt" - "strings" -) - -// KubernetesLabels represents a set of labels to apply to a Kubernetes resources. -type KubernetesLabels map[string]string - -// ToSelector converts the labels to a selector matching the labels. -func (l KubernetesLabels) ToSelector() (string, error) { - pieces := make([]string, 0, len(l)) - for k, v := range l { - pieces = append(pieces, fmt.Sprintf("%v=%v", k, v)) - } - - return strings.Join(pieces, ","), nil -} diff --git a/pkg/trainer/replicas.go b/pkg/trainer/replicas.go deleted file mode 100644 index dbb4c95a6d..0000000000 --- a/pkg/trainer/replicas.go +++ /dev/null @@ -1,509 +0,0 @@ -// Copyright 2018 The Kubeflow Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package trainer - -import ( - "encoding/json" - "errors" - "fmt" - "strings" - - log "github.com/golang/glog" - "k8s.io/api/core/v1" - k8s_errors "k8s.io/apimachinery/pkg/api/errors" - meta_v1 "k8s.io/apimachinery/pkg/apis/meta/v1" - k8sErrors "k8s.io/apimachinery/pkg/util/errors" - "k8s.io/client-go/kubernetes" - "k8s.io/client-go/tools/record" - - tfv1alpha1 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha1" - "github.com/kubeflow/tf-operator/pkg/util/k8sutil" - // TOOO(jlewi): Rename to apiErrors - "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/helper" - "github.com/kubeflow/tf-operator/pkg/util" -) - -const ( - SuccessfulCreateReason = "SuccessfulCreate" - FailedCreateReason = "FailedCreate" -) - -// TFReplicaSet is a set of TF processes all acting as the same role (e.g. worker -type TFReplicaSet struct { - ClientSet kubernetes.Interface - recorder record.EventRecorder - // Job is a pointer to the TrainingJob to which this replica belongs. - Job *TrainingJob - Spec tfv1alpha1.TFReplicaSpec -} - -// TFReplicas is an interface for managing a set of replicas. -type TFReplicaSetInterface interface { - Create() error - Delete() error - GetStatus() (tfv1alpha1.TFReplicaStatus, error) -} - -// TFConfig is a struct representing the TensorFlow config. This struct is turned into an environment -// which is used by TensorFlow processes to configure themselves. -type TFConfig struct { - // Cluster represents a TensorFlow ClusterSpec. 
- // See: https://www.tensorflow.org/api_docs/python/tf/train/ClusterSpechttps://www.tensorflow.org/api_docs/python/tf/train/ClusterSpec - Cluster ClusterSpec `json:"cluster"` - Task TaskSpec `json:"task"` - // Environment is used by tensorflow.contrib.learn.python.learn in versions <= 1.3 - // TODO(jlewi): I don't think it is used in versions TF >- 1.4. So we can eventually get rid of it. - Environment string `json:"environment"` -} - -func NewTFReplicaSet(clientSet kubernetes.Interface, recorder record.EventRecorder, tfReplicaSpec tfv1alpha1.TFReplicaSpec, job *TrainingJob) (*TFReplicaSet, error) { - if tfReplicaSpec.TFReplicaType == tfv1alpha1.MASTER && *tfReplicaSpec.Replicas != 1 { - return nil, errors.New("The MASTER must have Replicas = 1") - } - - if tfReplicaSpec.TFPort == nil { - return nil, errors.New("tfReplicaSpec.TFPort can't be nil.") - } - - if tfReplicaSpec.Template == nil && tfReplicaSpec.TFReplicaType != tfv1alpha1.PS { - return nil, fmt.Errorf("tfReplicatfv1alpha1.Template can't be nil for replica type %v.", tfReplicaSpec.TFReplicaType) - } - - // Make sure the replica type is valid. - validReplicaTypes := []tfv1alpha1.TFReplicaType{tfv1alpha1.MASTER, tfv1alpha1.PS, tfv1alpha1.WORKER} - - isValidReplicaType := false - for _, t := range validReplicaTypes { - if t == tfReplicaSpec.TFReplicaType { - isValidReplicaType = true - break - } - } - - if !isValidReplicaType { - return nil, fmt.Errorf("tfReplicaSpec.TFReplicaType is %v but must be one of %v", tfReplicaSpec.TFReplicaType, validReplicaTypes) - } - - return &TFReplicaSet{ - ClientSet: clientSet, - recorder: recorder, - Job: job, - Spec: tfReplicaSpec, - }, nil -} - -// Labels returns the labels for this replica set. -func (s *TFReplicaSet) Labels() KubernetesLabels { - return KubernetesLabels(map[string]string{ - "kubeflow.org": "", - "job_type": string(s.Spec.TFReplicaType), - // runtime_id is set by Job.setup, which is called after the TFReplicaSet is created. 
- // this is why labels aren't a member variable. - "runtime_id": s.Job.job.Spec.RuntimeId, - "tf_job_name": s.Job.job.ObjectMeta.Name}) -} - -// LabelsByIndex returns the labels for a pod in this replica set. -func (s *TFReplicaSet) LabelsByIndex(index int32) KubernetesLabels { - labels := s.Labels() - labels["task_index"] = fmt.Sprintf("%v", index) - return labels -} - -// CreateServiceWithIndex will create a new service with specify index -func (s *TFReplicaSet) CreateServiceWithIndex(index int32) (*v1.Service, error) { - taskLabels := s.LabelsByIndex(index) - - // Create the service. - service := &v1.Service{ - ObjectMeta: meta_v1.ObjectMeta{ - Name: s.genName(index), - Labels: taskLabels, - OwnerReferences: []meta_v1.OwnerReference{ - helper.AsOwner(s.Job.job), - }, - }, - Spec: v1.ServiceSpec{ - Selector: taskLabels, - // We use headless services here, because we don't need load balancing - // since there is a single pod that is the backend for each service. - ClusterIP: "None", - Ports: []v1.ServicePort{ - { - Name: "tf-port", - Port: *s.Spec.TFPort, - }, - }, - }, - } - - log.Infof("Creating service: %v", service.ObjectMeta.Name) - return s.ClientSet.CoreV1().Services(s.Job.job.ObjectMeta.Namespace).Create(service) -} - -// CreatePodWithIndex will create a new pod with specify index -func (s *TFReplicaSet) CreatePodWithIndex(index int32) (*v1.Pod, error) { - taskLabels := s.LabelsByIndex(index) - - pod := &v1.Pod{ - ObjectMeta: meta_v1.ObjectMeta{ - Name: s.genPodName(index), - Labels: taskLabels, - OwnerReferences: []meta_v1.OwnerReference{ - helper.AsOwner(s.Job.job), - }, - }, - Spec: *s.Spec.Template.Spec.DeepCopy(), - } - - pod.Spec.SchedulerName = s.Job.SchedulerName() - - // Configure the TFCONFIG environment variable. 
- tfConfig := TFConfig{ - Cluster: s.Job.ClusterSpec(), - Task: TaskSpec{ - Type: strings.ToLower(string(s.Spec.TFReplicaType)), - Index: int(index), - }, - // We need to set environment to cloud otherwise it will default to local which isn't what we want. - Environment: "cloud", - } - - tfConfigJson, err := json.Marshal(tfConfig) - if err != nil { - log.Errorf("Job: %v serializing tfConfig: %v return error; %v", s.Job.job.ObjectMeta.Name, util.Pformat(tfConfig), err) - return nil, err - } - - // Add TF_CONFIG environment variable. - for i, _ := range pod.Spec.Containers { - // We can't get c in the loop variable because that would be by value so our modifications - // wouldn't have any effect. - c := &pod.Spec.Containers[i] - if c.Name != tfv1alpha1.DefaultTFContainer { - continue - } - if len(c.Env) == 0 { - c.Env = make([]v1.EnvVar, 0) - } - c.Env = append(c.Env, v1.EnvVar{ - Name: "TF_CONFIG", - Value: string(tfConfigJson), - }) - } - - log.Infof("Creating pod: %v", pod.ObjectMeta.Name) - return s.ClientSet.CoreV1().Pods(s.Job.job.ObjectMeta.Namespace).Create(pod) -} - -// Delete deletes the replicas -func (s *TFReplicaSet) Delete() error { - selector, err := s.Labels().ToSelector() - if err != nil { - return err - } - - failures := false - - options := meta_v1.ListOptions{ - LabelSelector: selector, - } - - log.V(1).Infof("Deleting Jobs namespace=%v selector=%v", s.Job.job.ObjectMeta.Namespace, selector) - err = s.ClientSet.CoreV1().Pods(s.Job.job.ObjectMeta.Namespace).DeleteCollection(&meta_v1.DeleteOptions{}, options) - - if err != nil { - log.Errorf("There was a problem deleting the jobs; %v", err) - failures = true - } - - // We need to delete the completed pods. 
- log.Infof("Deleting Pods namespace=%v selector=%v", s.Job.job.ObjectMeta.Namespace, selector) - err = s.ClientSet.CoreV1().Pods(s.Job.job.ObjectMeta.Namespace).DeleteCollection(&meta_v1.DeleteOptions{}, options) - - if err != nil { - log.Errorf("There was a problem deleting the pods; %v", err) - failures = true - } - - // Services doesn't support DeleteCollection so we delete them individually. - // TODO(jlewi): We should check if this has changed with K8s 1.8 or other releases. - for index := int32(0); index < *s.Spec.Replicas; index++ { - log.V(1).Infof("Deleting Service %v:%v", s.Job.job.ObjectMeta.Namespace, s.genName((index))) - err = s.ClientSet.CoreV1().Services(s.Job.job.ObjectMeta.Namespace).Delete(s.genName(index), &meta_v1.DeleteOptions{}) - - if err != nil { - log.Errorf("Error deleting service %v; %v", s.genName(index), err) - failures = true - } - } - - // If the ConfigMap for the default parameter server exists, we delete it - log.Infof("Get ConfigMaps %v:%v", s.Job.job.ObjectMeta.Namespace, s.defaultPSConfigMapName()) - _, err = s.ClientSet.CoreV1().ConfigMaps(s.Job.job.ObjectMeta.Namespace).Get(s.defaultPSConfigMapName(), meta_v1.GetOptions{}) - if err != nil { - if !k8sutil.IsKubernetesResourceNotFoundError(err) { - log.Errorf("Error deleting ConfigMap %v; %v", s.defaultPSConfigMapName(), err) - failures = true - } - } else { - log.Infof("Delete ConfigMaps %v:%v", s.Job.job.ObjectMeta.Namespace, s.defaultPSConfigMapName()) - err = s.ClientSet.CoreV1().ConfigMaps(s.Job.job.ObjectMeta.Namespace).Delete(s.defaultPSConfigMapName(), &meta_v1.DeleteOptions{}) - if err != nil { - log.Errorf("There was a problem deleting the ConfigMaps; %v", err) - failures = true - } - } - - if failures { - return errors.New("Some of the replicas resources could not be deleted") - } - return nil -} - -// replicaStatusFromPodList returns a status from a list of pods for a job. 
-func replicaStatusFromPodList(l v1.PodList, name string) tfv1alpha1.ReplicaState { - var latest *v1.Pod - for _, i := range l.Items { - if latest == nil { - latest = &i - continue - } - if latest.Status.StartTime.Before(i.Status.StartTime) { - latest = &i - } - } - - if latest == nil { - return tfv1alpha1.ReplicaStateRunning - } - - var tfState v1.ContainerState - - for _, i := range latest.Status.ContainerStatuses { - if i.Name != name { - continue - } - - // We need to decide whether to use the current state or the previous termination state. - tfState = i.State - - // If the container previously terminated we will look at the termination to decide whether it is a retryable - // or permanenent error. - if i.LastTerminationState.Terminated != nil { - tfState = i.LastTerminationState - } - } - - if tfState.Running != nil || tfState.Waiting != nil { - return tfv1alpha1.ReplicaStateRunning - } - - if tfState.Terminated != nil { - if tfState.Terminated.ExitCode == 0 { - return tfv1alpha1.ReplicaStateSucceeded - } - - if isRetryableTerminationState(tfState.Terminated) { - // Since its a retryable error just return RUNNING. - // We can just let Kubernetes restart the container to retry. - return tfv1alpha1.ReplicaStateRunning - } - - return tfv1alpha1.ReplicaStateFailed - } - - return tfv1alpha1.ReplicaStateUnknown -} - -func (s *TFReplicaSet) GetSingleReplicaStatus(index int32) tfv1alpha1.ReplicaState { - p, err := s.ClientSet.CoreV1().Pods(s.Job.job.ObjectMeta.Namespace).Get(s.genName(index), meta_v1.GetOptions{}) - - if err != nil { - return tfv1alpha1.ReplicaStateUnknown - } - - if v1.PodSucceeded == p.Status.Phase { - return tfv1alpha1.ReplicaStateSucceeded - } - - labels := s.LabelsByIndex(index) - selector, err := labels.ToSelector() - if err != nil { - log.Errorf("labels.ToSelector() error; %v", err) - return tfv1alpha1.ReplicaStateFailed - } - - // TODO(jlewi): Handle errors. We need to get the pod and looking at recent container exits. 
- l, err := s.ClientSet.CoreV1().Pods(s.Job.job.ObjectMeta.Namespace).List(meta_v1.ListOptions{ - // TODO(jlewi): Why isn't the label selector working? - LabelSelector: selector, - }) - - if err != nil { - // TODO(jlewi): Are there errors that should be treated as retryable errors? - return tfv1alpha1.ReplicaStateFailed - } - - status := replicaStatusFromPodList(*l, tfv1alpha1.DefaultTFContainer) - return status -} - -// Status returns the status of the replica set. -func (s *TFReplicaSet) GetStatus() (tfv1alpha1.TFReplicaStatus, error) { - status := tfv1alpha1.TFReplicaStatus{ - TFReplicaType: s.Spec.TFReplicaType, - State: tfv1alpha1.ReplicaStateUnknown, - ReplicasStates: make(map[tfv1alpha1.ReplicaState]int), - } - - increment := func(state tfv1alpha1.ReplicaState) { - v, ok := status.ReplicasStates[state] - if ok { - status.ReplicasStates[state] = v + 1 - } else { - status.ReplicasStates[state] = 1 - } - } - - for index := int32(0); index < *s.Spec.Replicas; index++ { - increment(s.GetSingleReplicaStatus(index)) - } - - // Determine the overall status for the replica set based on the status of the individual - // replicas. - // If any of the replicas failed mark the set as failed. - if _, ok := status.ReplicasStates[tfv1alpha1.ReplicaStateFailed]; ok { - status.State = tfv1alpha1.ReplicaStateFailed - return status, nil - } - - // If any replicas are RUNNING mark it as RUNNING. - if _, ok := status.ReplicasStates[tfv1alpha1.ReplicaStateRunning]; ok { - status.State = tfv1alpha1.ReplicaStateRunning - return status, nil - } - - // If all of the replicas succeeded consider it success. - if v, ok := status.ReplicasStates[tfv1alpha1.ReplicaStateSucceeded]; ok && int32(v) == *s.Spec.Replicas { - status.State = tfv1alpha1.ReplicaStateSucceeded - return status, nil - } - - return status, nil -} - -// SyncPods will try to check current pods for this TFReplicaSet and try to make it as desired. 
-func (s *TFReplicaSet) SyncPods() error { - for index := int32(0); index < *s.Spec.Replicas; index++ { - - // Label to get all pods of this TFReplicaType + index - labels := s.LabelsByIndex(index) - - labelSelector, err := labels.ToSelector() - if err != nil { - return err - } - - // Filter the unactive pods - fieldSelector := fmt.Sprintf("status.phase!=%s", string(v1.PodFailed)) - - options := meta_v1.ListOptions{ - LabelSelector: labelSelector, - FieldSelector: fieldSelector, - } - - // List to get pods - pl, err := s.ClientSet.CoreV1().Pods(s.Job.job.ObjectMeta.Namespace).List(options) - if err != nil { - return err - } - - if len(pl.Items) == 0 { - log.Infof("No pod found for job %s, creating a new one.", s.Job.name) - // Create the pod - createdPod, err := s.CreatePodWithIndex(index) - - // If the pod already exists do nothing. - if err != nil { - if k8s_errors.IsAlreadyExists(err) { - log.Infof("Pod: %v already exists.", createdPod.ObjectMeta.Name) - continue - } - s.recorder.Eventf(s.Job.job, v1.EventTypeWarning, FailedCreateReason, "Error creating: %v", err) - return k8sErrors.NewAggregate([]error{fmt.Errorf("Creating pod %v returned error.", createdPod.ObjectMeta.Name), err}) - } - - s.recorder.Eventf(s.Job.job, v1.EventTypeNormal, SuccessfulCreateReason, "Created pod: %v", createdPod.Name) - continue - } - - if err != nil { - // TODO: handing this error - continue - } - } - - return nil -} - -// SyncServices will try to check current services for this TFReplicaSet and try to make it as desired. 
-func (s *TFReplicaSet) SyncServices() error { - for index := int32(0); index < *s.Spec.Replicas; index++ { - _, err := s.ClientSet.CoreV1().Services(s.Job.job.ObjectMeta.Namespace).Get(s.genName(index), meta_v1.GetOptions{}) - if err != nil && k8s_errors.IsNotFound(err) { - log.Infof("Service: %v not found, create new one.", s.genName(index)) - // Create the service - createdService, err := s.CreateServiceWithIndex(index) - - // If the service already exists do nothing. - if err != nil { - if k8s_errors.IsAlreadyExists(err) { - log.Infof("Service: %v already exists.", s.genName(index)) - continue - } - s.recorder.Eventf(s.Job.job, v1.EventTypeWarning, FailedCreateReason, "Error creating: %v", err) - return k8sErrors.NewAggregate([]error{fmt.Errorf("Creating Service %v returned error.", createdService.ObjectMeta.Name), err}) - } - - s.recorder.Eventf(s.Job.job, v1.EventTypeNormal, SuccessfulCreateReason, "Created Service: %v", createdService.Name) - continue - } - - if err != nil { - // TODO: handing this error - continue - } - } - - return nil -} - -func (s *TFReplicaSet) genName(index int32) string { - // Truncate tfjob name to 40 characters - // The whole job name should be compliant with the DNS_LABEL spec, up to a max length of 63 characters - // Thus genName(40 chars)-replicaType(6 chars)-runtimeId(4 chars)-index(4 chars), also leaving some spaces - // See https://github.com/kubernetes/community/blob/master/contributors/design-proposals/architecture/identifiers.md - return fmt.Sprintf("%v-%v-%v-%v", fmt.Sprintf("%.40s", s.Job.job.ObjectMeta.Name), strings.ToLower(string(s.Spec.TFReplicaType)), s.Job.job.Spec.RuntimeId, index) -} - -func (s *TFReplicaSet) genPodName(index int32) string { - // Generate a new pod name with random string - return s.genName(index) + "-" + util.RandString(5) -} - -func (s *TFReplicaSet) defaultPSConfigMapName() string { - return fmt.Sprintf("cm-ps-%v", s.Job.job.Spec.RuntimeId) -} diff --git a/pkg/trainer/replicas_test.go 
b/pkg/trainer/replicas_test.go deleted file mode 100644 index d2638e2f37..0000000000 --- a/pkg/trainer/replicas_test.go +++ /dev/null @@ -1,368 +0,0 @@ -// Copyright 2018 The Kubeflow Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package trainer - -import ( - "encoding/json" - "fmt" - "reflect" - "strings" - "testing" - "time" - - "github.com/golang/protobuf/proto" - "k8s.io/api/core/v1" - meta_v1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime/schema" - "k8s.io/client-go/kubernetes/fake" - "k8s.io/client-go/tools/record" - - tfv1alpha1 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha1" - tfJobFake "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned/fake" - "github.com/kubeflow/tf-operator/pkg/util" -) - -var ( - groupVersionKind = schema.GroupVersionKind{ - Group: tfv1alpha1.GroupName, - Version: tfv1alpha1.GroupVersion, - Kind: tfv1alpha1.TFJobResourceKind, - } -) - -func TestTFReplicaSet(t *testing.T) { - clientSet := fake.NewSimpleClientset() - - testSchedulerName := "test-scheduler" - - jobSpec := &tfv1alpha1.TFJob{ - ObjectMeta: meta_v1.ObjectMeta{ - Name: "some-job", - UID: "some-uid", - }, - Spec: tfv1alpha1.TFJobSpec{ - RuntimeId: "some-runtime", - ReplicaSpecs: []*tfv1alpha1.TFReplicaSpec{ - { - Replicas: proto.Int32(2), - TFPort: proto.Int32(10), - Template: &v1.PodTemplateSpec{ - Spec: v1.PodSpec{ - Containers: []v1.Container{ - { - Name: "tensorflow", - }, - }, - }, - }, - 
TFReplicaType: tfv1alpha1.PS, - }, - }, - SchedulerName: testSchedulerName, - }, - } - - recorder := record.NewFakeRecorder(100) - job, err := initJob(clientSet, &tfJobFake.Clientset{}, recorder, jobSpec) - - if err != nil { - t.Fatalf("initJob failed: %v", err) - } - - replica, err := NewTFReplicaSet(clientSet, recorder, *jobSpec.Spec.ReplicaSpecs[0], job) - - if err != nil { - t.Fatalf("NewTFReplicaSet failed: %v", err) - } - - if err := replica.SyncPods(); err != nil { - t.Fatalf("replica.SyncPods() error; %v", err) - } - - if err := replica.SyncServices(); err != nil { - t.Fatalf("replica.SyncServices() error; %v", err) - } - - trueVal := true - expectedOwnerReference := meta_v1.OwnerReference{ - APIVersion: groupVersionKind.GroupVersion().String(), - Kind: groupVersionKind.Kind, - Name: "some-job", - UID: "some-uid", - Controller: &trueVal, - BlockOwnerDeletion: &trueVal, - } - - for index := 0; index < 2; index++ { - // Expected labels - expectedLabels := map[string]string{ - "kubeflow.org": "", - "task_index": fmt.Sprintf("%v", index), - "job_type": "PS", - "runtime_id": "some-runtime", - "tf_job_name": "some-job", - } - - // Check that a service was created. 
- sList, err := clientSet.CoreV1().Services(replica.Job.job.ObjectMeta.Namespace).List(meta_v1.ListOptions{}) - if err != nil { - t.Fatalf("List services error; %v", err) - } - - if len(sList.Items) != 2 { - t.Fatalf("Expected 2 services got %v", len(sList.Items)) - } - - s := sList.Items[index] - - if !reflect.DeepEqual(expectedLabels, s.ObjectMeta.Labels) { - t.Fatalf("Service Labels; Got %v Want: %v", s.ObjectMeta.Labels, expectedLabels) - } - - name := fmt.Sprintf("some-job-ps-some-runtime-%v", index) - if s.ObjectMeta.Name != name { - t.Fatalf("Job.ObjectMeta.Name = %v; want %v", s.ObjectMeta.Name, name) - } - - if len(s.ObjectMeta.OwnerReferences) != 1 { - t.Fatalf("Expected 1 owner reference got %v", len(s.ObjectMeta.OwnerReferences)) - } - - if !reflect.DeepEqual(s.ObjectMeta.OwnerReferences[0], expectedOwnerReference) { - t.Fatalf("Service.Metadata.OwnerReferences; Got %v; want %v", util.Pformat(s.ObjectMeta.OwnerReferences[0]), util.Pformat(expectedOwnerReference)) - } - - // Check that a pod was created. 
- l, err := clientSet.CoreV1().Pods(replica.Job.job.ObjectMeta.Namespace).List(meta_v1.ListOptions{}) - if err != nil { - t.Fatalf("List pods error; %v", err) - } - - if len(l.Items) != 2 { - t.Fatalf("Expected 1 pod got %v", len(l.Items)) - } - - p := l.Items[index] - - if !reflect.DeepEqual(expectedLabels, p.ObjectMeta.Labels) { - t.Fatalf("Pod Labels; Got %v Want: %v", expectedLabels, p.ObjectMeta.Labels) - } - - if len(p.Spec.Containers) != 1 { - t.Fatalf("Expected 1 container got %v", len(p.Spec.Containers)) - } - - if len(p.ObjectMeta.OwnerReferences) != 1 { - t.Fatalf("Expected 1 owner reference got %v", len(p.ObjectMeta.OwnerReferences)) - } - - if !reflect.DeepEqual(p.ObjectMeta.OwnerReferences[0], expectedOwnerReference) { - t.Fatalf("Pod.Metadata.OwnerReferences; Got %v; want %v", util.Pformat(p.ObjectMeta.OwnerReferences[0]), util.Pformat(expectedOwnerReference)) - } - - c := p.Spec.Containers[0] - if len(c.Env) != 1 { - t.Fatalf("Expected 1 environment variable got %v", len(c.Env)) - } - - if strings.Compare(p.Spec.SchedulerName, testSchedulerName) != 0 { - t.Fatalf("p.Spec.Template.Spec.SchedulerName; Got %v; want %v", p.Spec.SchedulerName, testSchedulerName) - } - - actualTFConfig := &TFConfig{} - if err := json.Unmarshal([]byte(c.Env[0].Value), actualTFConfig); err != nil { - t.Fatalf("Could not unmarshal TFConfig %v", err) - } - - expectedTFConfig := &TFConfig{ - Cluster: ClusterSpec{}, - Task: TaskSpec{ - Type: "ps", - Index: index, - }, - Environment: "cloud", - } - - if !reflect.DeepEqual(expectedTFConfig, actualTFConfig) { - t.Fatalf("Got %v, Want %v", actualTFConfig, expectedTFConfig) - } - } - // Delete the job. - // N.B it doesn't look like the Fake clientset is sophisticated enough to delete jobs in response to a - // DeleteCollection request (deleting individual jobs does appear to work with the Fake). So if we were to list - // the jobs after calling Delete we'd still see the job. 
So we will rely on E2E tests to verify Delete works - // correctly. - if err := replica.Delete(); err != nil { - t.Fatalf("replica.Delete() error; %v", err) - } -} - -func TestTFReplicaSetStatusFromPodList(t *testing.T) { - type TestCase struct { - PodList v1.PodList - Name string - Expected tfv1alpha1.ReplicaState - } - - cases := []TestCase{ - { - PodList: v1.PodList{ - Items: []v1.Pod{ - { - Status: v1.PodStatus{ - ContainerStatuses: []v1.ContainerStatus{ - { - Name: "master", - State: v1.ContainerState{ - Running: &v1.ContainerStateRunning{}, - }, - }, - }, - }, - }, - }, - }, - Name: "master", - Expected: tfv1alpha1.ReplicaStateRunning, - }, - { - PodList: v1.PodList{ - Items: []v1.Pod{ - { - Status: v1.PodStatus{ - ContainerStatuses: []v1.ContainerStatus{ - { - Name: "master", - State: v1.ContainerState{ - Terminated: &v1.ContainerStateTerminated{ - ExitCode: 0, - }, - }, - }, - }, - }, - }, - }, - }, - Name: "master", - Expected: tfv1alpha1.ReplicaStateSucceeded, - }, - { - // Multiple containers; make sure we match by name. - PodList: v1.PodList{ - Items: []v1.Pod{ - { - Status: v1.PodStatus{ - ContainerStatuses: []v1.ContainerStatus{ - { - Name: "other", - State: v1.ContainerState{ - Running: &v1.ContainerStateRunning{}, - }, - }, - { - Name: "master", - State: v1.ContainerState{ - Terminated: &v1.ContainerStateTerminated{ - ExitCode: 0, - }, - }, - }, - }, - }, - }, - }, - }, - Name: "master", - Expected: tfv1alpha1.ReplicaStateSucceeded, - }, - { - // Container failed with permanent error and then got restarted. 
- PodList: v1.PodList{ - Items: []v1.Pod{ - { - Status: v1.PodStatus{ - ContainerStatuses: []v1.ContainerStatus{ - { - Name: "master", - State: v1.ContainerState{ - Running: &v1.ContainerStateRunning{}, - }, - LastTerminationState: v1.ContainerState{ - Terminated: &v1.ContainerStateTerminated{ - ExitCode: 100, - Message: "some reason", - }, - }, - }, - }, - }, - }, - }, - }, - Name: "master", - Expected: tfv1alpha1.ReplicaStateFailed, - }, - { - // Multiple Pods; check we get the most recent. - PodList: v1.PodList{ - Items: []v1.Pod{ - { - Status: v1.PodStatus{ - ContainerStatuses: []v1.ContainerStatus{ - { - Name: "master", - State: v1.ContainerState{ - Running: &v1.ContainerStateRunning{}, - }, - }, - }, - StartTime: &meta_v1.Time{ - Time: time.Date(2017, 0, 0, 0, 0, 0, 0, time.UTC), - }, - }, - }, - { - Status: v1.PodStatus{ - ContainerStatuses: []v1.ContainerStatus{ - { - Name: "master", - State: v1.ContainerState{ - Terminated: &v1.ContainerStateTerminated{ - ExitCode: 100, - Message: "some reason", - }, - }, - }, - }, - StartTime: &meta_v1.Time{ - Time: time.Date(2018, 0, 0, 0, 0, 0, 0, time.UTC), - }, - }, - }, - }, - }, - Name: "master", - Expected: tfv1alpha1.ReplicaStateFailed, - }, - } - - for _, c := range cases { - status := replicaStatusFromPodList(c.PodList, c.Name) - if status != c.Expected { - t.Errorf("replicaStatusFromPodList(%+v, %v)=%v ; want %v", c.PodList, c.Name, status, c.Expected) - } - } -} diff --git a/pkg/trainer/training.go b/pkg/trainer/training.go deleted file mode 100644 index 39a365e423..0000000000 --- a/pkg/trainer/training.go +++ /dev/null @@ -1,468 +0,0 @@ -// Copyright 2018 The Kubeflow Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package trainer is to manage TensorFlow training jobs. -package trainer - -import ( - "fmt" - "reflect" - "strings" - - log "github.com/sirupsen/logrus" - "k8s.io/api/core/v1" - "k8s.io/api/policy/v1beta1" - k8s_errors "k8s.io/apimachinery/pkg/api/errors" - meta_v1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/types" - "k8s.io/apimachinery/pkg/util/intstr" - "k8s.io/client-go/kubernetes" - "k8s.io/client-go/tools/record" - - "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/helper" - tfv1alpha1 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha1" - "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/validation" - tfjobclient "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned" - "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned/scheme" - "github.com/kubeflow/tf-operator/pkg/util" -) - -// TODO(jlewi): We should switch a New pattern and make trainingJob private so we can -// ensure correctness on creation. -type TrainingJob struct { - job *tfv1alpha1.TFJob - - KubeCli kubernetes.Interface - - recorder record.EventRecorder - - Replicas []*TFReplicaSet - - tfJobClient tfjobclient.Interface - - // in memory state of the job. - // status is the source of truth after job struct is materialized. Changes to the status to be persisted - // should be made here. - status tfv1alpha1.TFJobStatus - - memberCounter int - - pdb *v1beta1.PodDisruptionBudget -} - -// ClusterSpec represents a cluster TensorFlow specification. 
-// https://www.tensorflow.org/deploy/distributed#create_a_tftrainclusterspec_to_describe_the_cluster -// It is a map from job names to network addresses. -type ClusterSpec map[string][]string - -type TaskSpec struct { - Type string `json:"type"` - Index int `json:"index"` -} - -func initJob(kubeCli kubernetes.Interface, tfJobClient tfjobclient.Interface, recorder record.EventRecorder, job *tfv1alpha1.TFJob) (*TrainingJob, error) { - j := &TrainingJob{ - KubeCli: kubeCli, - tfJobClient: tfJobClient, - recorder: recorder, - Replicas: make([]*TFReplicaSet, 0), - job: job, - status: *job.Status.DeepCopy(), - } - - return j, nil -} - -func NewJob(kubeCli kubernetes.Interface, tfJobClient tfjobclient.Interface, recorder record.EventRecorder, job *tfv1alpha1.TFJob, config *tfv1alpha1.ControllerConfig) (*TrainingJob, error) { - j, err := initJob(kubeCli, tfJobClient, recorder, job) - if err != nil { - return nil, err - } - - return j, nil -} - -func (j *TrainingJob) UID() types.UID { - return j.job.ObjectMeta.UID -} - -func (j *TrainingJob) ClusterSpec() ClusterSpec { - clusterSpec := make(ClusterSpec) - - for _, p := range j.Replicas { - replicaNames := make([]string, 0, *p.Spec.Replicas) - - for i := int32(0); i < *p.Spec.Replicas; i++ { - replicaNames = append(replicaNames, fmt.Sprintf("%v:%v", p.genName(i), *p.Spec.TFPort)) - } - - clusterSpec[strings.ToLower(string(p.Spec.TFReplicaType))] = replicaNames - } - - return clusterSpec -} - -// deleteResources deletes the replicas it it was created -func (j *TrainingJob) deleteResources() error { - for _, r := range j.Replicas { - if err := r.Delete(); err != nil { - return err - } - } - - return nil -} - -func (j *TrainingJob) GetStatus() (tfv1alpha1.State, []*tfv1alpha1.TFReplicaStatus, error) { - chief := j.job.Spec.TerminationPolicy.Chief - chiefState := tfv1alpha1.ReplicaStateUnknown - - state := tfv1alpha1.StateUnknown - replicaStatuses := make([]*tfv1alpha1.TFReplicaStatus, 0) - - // The state for each replica. 
- // TODO(jlewi): We will need to modify this code if we want to allow multiples of a given type of replica. - replicaSetStates := make(map[tfv1alpha1.TFReplicaType]tfv1alpha1.ReplicaState) - - for _, r := range j.Replicas { - rStatus, err := r.GetStatus() - if err != nil { - log.Errorf("GetStatus() for %v returned error; %v", r.Spec.TFReplicaType, err) - } - - replicaSetStates[r.Spec.TFReplicaType] = rStatus.State - - replicaStatuses = append(replicaStatuses, &rStatus) - - if string(r.Spec.TFReplicaType) == chief.ReplicaName { - chiefState = r.GetSingleReplicaStatus(int32(chief.ReplicaIndex)) - } - } - - if chiefState == tfv1alpha1.ReplicaStateRunning { - state = tfv1alpha1.StateRunning - } else if chiefState == tfv1alpha1.ReplicaStateFailed { - state = tfv1alpha1.StateFailed - } else if chiefState == tfv1alpha1.ReplicaStateSucceeded { - state = tfv1alpha1.StateSucceeded - } - - return state, replicaStatuses, nil -} - -// isRetryableTerminationState returns true if a container terminated in a state -// that we consider retryable. -func isRetryableTerminationState(s *v1.ContainerStateTerminated) bool { - // TODO(jlewi): Need to match logic in - // https://cs.corp.google.com/piper///depot/google3/cloud/ml/beta/job/training_job_state_util.cc?l=88 - if s.Reason == "OOMKilled" { - // If the user's process causes an OOM and Docker kills the container, - // the termination reason of ContainerState will be specified to - // 'OOMKilled'. In this case, we can't assume this to be a retryable error. - // - // This check should happen before checking the termination log, since - // if the container terminated with an OOM, the termination log may not - // be written. - return false - } - - // TODO(jlewi): Should we use the exit code reported in the termination - // log message and not the ExitCode reported by the container. - - if s.ExitCode >= 0 && s.ExitCode <= 127 { - // For the exit_code in [0, 127]: - // 0 means success, - // 1 - 127 corresponds to permanent user errors. 
- // We don't want to retry for both cases. - // More info about exit status can be found in: - // https://www.gnu.org/software/bash/manual/html_node/Exit-Status.html - return false - } - - // For the remaining cases that exit_code from workers that doesn't - // fall into [0, 127]. They can be: - // 137 corresponds to SIGKILL, - // 143 corresponds to SIGTERM, - // other values that have undefined behavior. - // We treat them as internal errors for now and all the internal errors - // will be retired. - return true -} - -func (j *TrainingJob) masterName() string { - return fmt.Sprintf("master-%v-0", j.job.Spec.RuntimeId) -} - -// setup the training job. -func (j *TrainingJob) setup(config *tfv1alpha1.ControllerConfig) { - err := func() error { - // If the job has already started we shouldn't set it up again. - if j.status.Phase != tfv1alpha1.TFJobPhaseNone { - log.Warningf("Job %v has already been setup.", j.name()) - return nil - } - - // Set defaults. - scheme.Scheme.Default(j.job) - - err := validation.ValidateTFJobSpec(&j.job.Spec) - if err != nil { - return fmt.Errorf("invalid job spec: %v", err) - } - - if err := helper.ConfigureAcceleratorsForTFJobSpec(&j.job.Spec, config.Accelerators); err != nil { - return fmt.Errorf("ConfigureAccelerators(...) error; %v", err) - } - - if j.job.Spec.RuntimeId == "" { - j.job.Spec.RuntimeId = util.RandString(4) - } - return nil - }() - - if err != nil { - j.status.Reason = err.Error() - j.status.Phase = tfv1alpha1.TFJobPhaseFailed - j.status.State = tfv1alpha1.StateFailed - } else { - j.status.Phase = tfv1alpha1.TFJobPhaseCreating - j.status.State = tfv1alpha1.StateRunning - } -} - -// setup Replicas. This creates in memory data structures corresponding to the replicas. 
-func (j *TrainingJob) setupReplicas() error { - if len(j.Replicas) != len(j.job.Spec.ReplicaSpecs) { - j.Replicas = make([]*TFReplicaSet, 0, len(j.job.Spec.ReplicaSpecs)) - for _, t := range j.job.Spec.ReplicaSpecs { - r, err := NewTFReplicaSet(j.KubeCli, j.recorder, *t, j) - if err != nil { - return err - } - j.Replicas = append(j.Replicas, r) - } - } - - return nil -} - -func (j *TrainingJob) Delete() { - // TODO(jlewi): Delete is what should cause us to delete the Pods. - // we shouldn't delete the pods when the jobs finish because leaving the pods - // allows us to get the logs from the pods after the job finishes. - // - log.Infof("TFJob %v deleted by the user", j.fullname()) - // TODO(jlewi): This logic is probably insufficient. - if j.job.Status.Phase != tfv1alpha1.TFJobPhaseCleanUp { - j.status.Phase = tfv1alpha1.TFJobPhaseCleanUp - } - - // TODO(jlewi): Does it make sense to explicitly delete the resources? Should - // we just rely on K8s garbage collection to delete the resources before - // deleting TFJob? - if cErr := j.deleteResources(); cErr != nil { - log.Errorf("trainingJob.deleteResources() error; %v", cErr) - } - - if j.pdb != nil { - // if the job has PDB for gang scheduling, delete it - err := j.KubeCli.PolicyV1beta1().PodDisruptionBudgets(j.job.ObjectMeta.Namespace).Delete(j.pdb.ObjectMeta.Name, &meta_v1.DeleteOptions{}) - if err != nil { - log.Errorf("Error deleting PDB %v; %v", j.pdb.ObjectMeta.Name, err) - } - } -} - -// updateCRDStatus updates the job status based on TraingingJob.status. -func (j *TrainingJob) updateCRDStatus() error { - // If the status hasn't changed then there's no reason to update the CRD. 
- if reflect.DeepEqual(j.job.Status, j.status) { - return nil - } - - newJob := j.job - newJob.Status = j.status - newJob, err := j.tfJobClient.KubeflowV1alpha1().TFJobs(j.job.ObjectMeta.Namespace).Update(newJob) - if err != nil { - return err - } - - j.job = newJob - - return nil -} - -// reconcile tries to get the job into the desired state. -func (j *TrainingJob) Reconcile(config *tfv1alpha1.ControllerConfig, enableGangScheduling bool) error { - if j.job.Status.Phase == tfv1alpha1.TFJobPhaseNone { - // The job hasn't been setup. - j.setup(config) - - if err := j.updateCRDStatus(); err != nil { - log.Warningf("failed to update CRD status: %v", err) - return err - } - } - - // setupreplicas initializes data structures inside TrainingJob representing the replicas. - // These are go-lang structures which aren't preserved in the APIServer. So we always need to call setupReplicas - // unlike setup which only needs to be called once during the lifecycle of the job. - if err := j.setupReplicas(); err != nil { - log.Errorf("failed to create replicas: %v", err) - j.status.Reason = fmt.Sprintf("Could not create in memory datastructures; %v", err) - if uErr := j.updateCRDStatus(); err != nil { - log.Warningf("Job %v; failed to update status error: %v", j.job.ObjectMeta.Name, uErr) - } - return err - } - - // sync PDB for gang scheduling - // TODO(mitake): replace PDB with a newer mechanism if it is replaced - if enableGangScheduling { - err := j.syncPdb() - if err != nil { - log.Errorf("SyncPdb error: %v", err) - } - } - - // sync pods - for _, rc := range j.Replicas { - err := rc.SyncPods() - if err != nil { - log.Errorf("SyncPods error: %v", err) - } - } - - // sync services - for _, rc := range j.Replicas { - err := rc.SyncServices() - if err != nil { - log.Errorf("SyncServices error: %v", err) - } - } - - if err := j.updateCRDStatus(); err != nil { - log.Warningf("Job %v; failed to update status error: %v", j.job.ObjectMeta.Name, err) - return err - } - - // Call 
GetStatus in each reconcile loop - state, replicaStatuses, err := j.GetStatus() - - j.status.ReplicaStatuses = replicaStatuses - if err != nil { - log.Errorf("GetStatus() for job %v returned error: %v", j.job.ObjectMeta.Name, err) - return err - } - - // TODO(jlewi): We should update the Phase if we detect the job is done. - if state == tfv1alpha1.StateFailed { - log.Errorf("Master failed Job: %v.", j.job.ObjectMeta.Name) - j.status.Phase = tfv1alpha1.TFJobPhaseDone - j.status.State = tfv1alpha1.StateFailed - } else if state == tfv1alpha1.StateSucceeded { - log.Infof("Master succeeded Job: %v.", j.job.ObjectMeta.Name) - j.status.Phase = tfv1alpha1.TFJobPhaseDone - j.status.State = tfv1alpha1.StateSucceeded - } else if state == tfv1alpha1.StateRunning { - log.Infof("Master running Job: %v.", j.job.ObjectMeta.Name) - j.status.Phase = tfv1alpha1.TFJobPhaseRunning - j.status.State = tfv1alpha1.StateRunning - } else { - log.Infof("Job %v status=%v", j.job.ObjectMeta.Name, util.Pformat(j.status)) - } - - // If the phase changed we should update the CRD. - if err := j.updateCRDStatus(); err != nil { - log.Warningf("Job %v, failed to update CRD status error: %v", j.job.ObjectMeta.Name, err) - return err - } - - if j.job.Status.Phase == tfv1alpha1.TFJobPhaseCleanUp { - if cErr := j.deleteResources(); cErr != nil { - log.Errorf("Job %v trainingJob.Delete() error; %v", j.job.ObjectMeta.Name, cErr) - } - // j.status.SetPhase(spec.TFJobPhaseDone) - // Return from run because we want to stop reconciling the object. - return nil - } - - // updateCRDStatus will update the status of the CRD with c.Status if c.Status - // doesn't match c.Cluster.status. So you can change c.Status in order to propagate - // changes to the CRD status. 
- if err := j.updateCRDStatus(); err != nil { - log.Warningf("Job %v; failed to update CRD status error: %v", j.job.ObjectMeta.Name, err) - return err - } - - return nil -} - -func (j *TrainingJob) name() string { - return j.job.ObjectMeta.GetName() -} - -// fullname returns the namespace and name for the job. -func (j *TrainingJob) fullname() string { - return j.job.ObjectMeta.GetNamespace() + ":" + j.job.ObjectMeta.GetName() -} - -func (j *TrainingJob) SchedulerName() string { - return j.job.Spec.SchedulerName -} - -// SyncPdb will create a PDB for gang scheduling by kube-arbitrator. -func (j *TrainingJob) syncPdb() error { - nrReplicas := int32(0) - for _, r := range j.Replicas { - nrReplicas += *r.Spec.Replicas - } - - if nrReplicas == 1 { - // gang scheduling isn't required by a non distributed training process - return nil - } - - minAvailable := intstr.FromInt(int(nrReplicas)) - pdb := &v1beta1.PodDisruptionBudget{ - ObjectMeta: meta_v1.ObjectMeta{ - GenerateName: "tf-job-pdb-", - }, - Spec: v1beta1.PodDisruptionBudgetSpec{ - MinAvailable: &minAvailable, - Selector: &meta_v1.LabelSelector{ - MatchLabels: map[string]string{ - "runtime_id": j.job.Spec.RuntimeId, - "tf_job_name": j.job.ObjectMeta.Name, - }, - }, - }, - } - - createdPdb, err := j.KubeCli.PolicyV1beta1().PodDisruptionBudgets(j.job.ObjectMeta.Namespace).Create(pdb) - if err != nil { - if k8s_errors.IsAlreadyExists(err) { - log.Infof("PDB: %v already exists.", j.job.ObjectMeta.Name) - return nil - } - - j.recorder.Eventf(j.job, v1.EventTypeWarning, FailedCreateReason, "Error creating: %v", err) - return err - } - - j.pdb = createdPdb - - j.recorder.Eventf(j.job, v1.EventTypeNormal, SuccessfulCreateReason, "Created PDB: %v", createdPdb.Name) - return nil -} diff --git a/pkg/trainer/training_test.go b/pkg/trainer/training_test.go deleted file mode 100644 index 5c9718c613..0000000000 --- a/pkg/trainer/training_test.go +++ /dev/null @@ -1,489 +0,0 @@ -// Copyright 2018 The Kubeflow Authors -// -// 
Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package trainer - -import ( - "reflect" - "testing" - - "github.com/gogo/protobuf/proto" - tfv1alpha1 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha1" - tfJobFake "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned/fake" - "k8s.io/api/core/v1" - "k8s.io/api/policy/v1beta1" - "k8s.io/apimachinery/pkg/api/resource" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/util/intstr" - "k8s.io/client-go/kubernetes/fake" - "k8s.io/client-go/tools/record" -) - -func TestIsRetryableTerminationState(t *testing.T) { - type TestCase struct { - State v1.ContainerStateTerminated - Expected bool - } - - cases := []TestCase{ - { - // Since reason is empty we don't trust the exit code. 
- State: v1.ContainerStateTerminated{ - ExitCode: 0, - }, - Expected: false, - }, - { - State: v1.ContainerStateTerminated{ - ExitCode: 0, - Message: "some reason", - }, - Expected: false, - }, - { - State: v1.ContainerStateTerminated{ - ExitCode: 1, - Message: "some reason", - }, - Expected: false, - }, - { - State: v1.ContainerStateTerminated{ - ExitCode: 1, - }, - Expected: false, - }, - { - State: v1.ContainerStateTerminated{ - ExitCode: 244, - Message: "some reason", - }, - Expected: true, - }, - { - State: v1.ContainerStateTerminated{ - ExitCode: 244, - Reason: "OOMKilled", - }, - Expected: false, - }, - } - - for _, c := range cases { - actual := isRetryableTerminationState(&c.State) - if actual != c.Expected { - t.Errorf("isRetryableTerminationState(%+v)=%v want %v", c.State, actual, c.Expected) - } - } -} - -func TestClusterSpec(t *testing.T) { - type TestCase struct { - Spec *tfv1alpha1.TFJob - Expected map[string][]string - } - - cases := []TestCase{ - { - Spec: &tfv1alpha1.TFJob{ - ObjectMeta: metav1.ObjectMeta{ - Name: "myjob", - }, - Spec: tfv1alpha1.TFJobSpec{ - RuntimeId: "runtime", - ReplicaSpecs: []*tfv1alpha1.TFReplicaSpec{ - { - Replicas: proto.Int32(2), - TFPort: proto.Int32(22), - Template: &v1.PodTemplateSpec{ - Spec: v1.PodSpec{ - Containers: []v1.Container{ - { - Name: "tensorflow", - }, - }, - }, - }, - TFReplicaType: tfv1alpha1.PS, - }, - { - Replicas: proto.Int32(1), - TFPort: proto.Int32(42), - Template: &v1.PodTemplateSpec{ - Spec: v1.PodSpec{ - Containers: []v1.Container{ - { - Name: "tensorflow", - }, - }, - }, - }, - TFReplicaType: tfv1alpha1.MASTER, - }, - { - Replicas: proto.Int32(3), - TFPort: proto.Int32(40), - Template: &v1.PodTemplateSpec{ - Spec: v1.PodSpec{ - Containers: []v1.Container{ - { - Name: "tensorflow", - }, - }, - }, - }, - TFReplicaType: tfv1alpha1.WORKER, - }, - }, - }, - }, - - Expected: map[string][]string{ - "ps": []string{"myjob-ps-runtime-0:22", "myjob-ps-runtime-1:22"}, - "master": 
[]string{"myjob-master-runtime-0:42"}, - "worker": []string{"myjob-worker-runtime-0:40", "myjob-worker-runtime-1:40", "myjob-worker-runtime-2:40"}, - }, - }, - } - - for _, c := range cases { - - clientSet := fake.NewSimpleClientset() - - recorder := record.NewFakeRecorder(100) - job, err := initJob(clientSet, &tfJobFake.Clientset{}, recorder, c.Spec) - - if err != nil { - t.Fatalf("initJob failed: %v", err) - } - - job.setup(&tfv1alpha1.ControllerConfig{}) - job.setupReplicas() - actual := job.ClusterSpec() - - for k, v := range c.Expected { - actualV, ok := actual[k] - if !ok { - t.Errorf("Actual cluster spec is missing key: %v", k) - continue - } - if !reflect.DeepEqual(actualV, v) { - t.Errorf("Key %v got %v want %v", k, actualV, v) - } - } - } -} - -func TestJobSetup(t *testing.T) { - // Verify the setup will fill in the RuntimeId. - clientSet := fake.NewSimpleClientset() - - type testCase struct { - jobSpec *tfv1alpha1.TFJob - expectMounts int - expectPhase tfv1alpha1.TFJobPhase - expectReason string - expectState tfv1alpha1.State - } - - testCases := []testCase{ - { - jobSpec: &tfv1alpha1.TFJob{ - Spec: tfv1alpha1.TFJobSpec{ - ReplicaSpecs: []*tfv1alpha1.TFReplicaSpec{ - { - Replicas: proto.Int32(1), - TFPort: proto.Int32(10), - Template: &v1.PodTemplateSpec{ - Spec: v1.PodSpec{ - Containers: []v1.Container{ - { - Name: "tensorflow", - }, - }, - }, - }, - TFReplicaType: tfv1alpha1.MASTER, - }, - }, - }, - }, - expectMounts: 0, - expectPhase: tfv1alpha1.TFJobPhaseCreating, - expectState: tfv1alpha1.StateRunning, - }, - { - jobSpec: &tfv1alpha1.TFJob{ - Spec: tfv1alpha1.TFJobSpec{ - ReplicaSpecs: []*tfv1alpha1.TFReplicaSpec{ - { - Replicas: proto.Int32(2), - TFPort: proto.Int32(10), - Template: &v1.PodTemplateSpec{ - Spec: v1.PodSpec{ - Containers: []v1.Container{ - { - Name: "tensorflow", - Resources: v1.ResourceRequirements{ - Requests: map[v1.ResourceName]resource.Quantity{ - "nvidia-gpu": resource.MustParse("1"), - }, - }, - }, - }, - }, - }, - 
TFReplicaType: tfv1alpha1.WORKER, - }, - }, - TerminationPolicy: &tfv1alpha1.TerminationPolicySpec{ - Chief: &tfv1alpha1.ChiefSpec{ - ReplicaName: string(tfv1alpha1.WORKER), - ReplicaIndex: 0, - }, - }, - }, - }, - expectMounts: 1, - expectPhase: tfv1alpha1.TFJobPhaseCreating, - expectState: tfv1alpha1.StateRunning, - }, - { - // The job should fail setup because the spec is invalid. - jobSpec: &tfv1alpha1.TFJob{ - Spec: tfv1alpha1.TFJobSpec{ - ReplicaSpecs: []*tfv1alpha1.TFReplicaSpec{ - { - Replicas: proto.Int32(2), - TFPort: proto.Int32(10), - Template: &v1.PodTemplateSpec{ - Spec: v1.PodSpec{ - Containers: []v1.Container{ - { - Name: "tensorflow", - Resources: v1.ResourceRequirements{ - Requests: map[v1.ResourceName]resource.Quantity{ - "nvidia-gpu": resource.MustParse("1"), - }, - }, - }, - }, - }, - }, - TFReplicaType: tfv1alpha1.WORKER, - }, - }, - }, - }, - expectMounts: 0, - expectPhase: tfv1alpha1.TFJobPhaseFailed, - expectState: tfv1alpha1.StateFailed, - expectReason: "invalid job spec: Missing ReplicaSpec for chief: MASTER", - }, - } - - config := &tfv1alpha1.ControllerConfig{ - Accelerators: map[string]tfv1alpha1.AcceleratorConfig{ - "nvidia-gpu": tfv1alpha1.AcceleratorConfig{ - Volumes: []tfv1alpha1.AcceleratorVolume{ - { - Name: "cuda-lib", - HostPath: "/home/cuda", - MountPath: "/usr/local/cuda", - }, - }, - }, - }, - } - - for _, c := range testCases { - - recorder := record.NewFakeRecorder(100) - job, err := initJob(clientSet, &tfJobFake.Clientset{}, recorder, c.jobSpec) - - job.setup(config) - - if err != nil { - t.Errorf("j.setup error: %v", err) - } - - if job.status.Phase != c.expectPhase { - t.Errorf("job.job.Status.Phase Want: %v Got:%v ", c.expectPhase, job.status.Phase) - } - - if job.status.Reason != c.expectReason { - t.Errorf("job.job.Status.Reason Want: %v Got:%v ", c.expectReason, job.status.Reason) - } - - if job.status.State != c.expectState { - t.Errorf("job.job.Status.State Want: %v Got:%v ", c.expectState, job.status.State) - } - 
- // Make sure the runtime id is set if the job didn't fail. - if c.expectState != tfv1alpha1.StateFailed && job.job.Spec.RuntimeId == "" { - t.Errorf("RuntimeId should not be empty after calling setup.") - } - - if len(job.job.Spec.ReplicaSpecs[0].Template.Spec.Volumes) != c.expectMounts { - t.Errorf("Expect %v Volumes got %v", c.expectMounts, len(job.job.Spec.ReplicaSpecs[0].Template.Spec.Volumes)) - } - - if len(job.job.Spec.ReplicaSpecs[0].Template.Spec.Containers[0].VolumeMounts) != c.expectMounts { - t.Errorf("Expect %v VolumeMounts got %v", c.expectMounts, len(job.job.Spec.ReplicaSpecs[0].Template.Spec.Containers[0].VolumeMounts)) - } - } -} - -func TestPDBForGangScheduling(t *testing.T) { - clientSet := fake.NewSimpleClientset() - - type testCase struct { - jobSpec *tfv1alpha1.TFJob - expectPdb *v1beta1.PodDisruptionBudget - } - - minAvailable3 := intstr.FromInt(3) - - testCases := []testCase{ - { - jobSpec: &tfv1alpha1.TFJob{ - ObjectMeta: metav1.ObjectMeta{ - Name: "some-meta-name", - }, - Spec: tfv1alpha1.TFJobSpec{ - RuntimeId: "some-runtime-id", - ReplicaSpecs: []*tfv1alpha1.TFReplicaSpec{ - { - Replicas: proto.Int32(1), - TFPort: proto.Int32(10), - Template: &v1.PodTemplateSpec{ - Spec: v1.PodSpec{ - Containers: []v1.Container{ - { - Name: "tensorflow", - }, - }, - }, - }, - TFReplicaType: tfv1alpha1.WORKER, - }, - }, - }, - }, - expectPdb: nil, - }, - - { - jobSpec: &tfv1alpha1.TFJob{ - ObjectMeta: metav1.ObjectMeta{ - Name: "some-meta-name", - }, - Spec: tfv1alpha1.TFJobSpec{ - RuntimeId: "some-runtime-id", - ReplicaSpecs: []*tfv1alpha1.TFReplicaSpec{ - { - Replicas: proto.Int32(1), - TFPort: proto.Int32(10), - Template: &v1.PodTemplateSpec{ - Spec: v1.PodSpec{ - Containers: []v1.Container{ - { - Name: "tensorflow", - }, - }, - }, - }, - TFReplicaType: tfv1alpha1.MASTER, - }, - { - Replicas: proto.Int32(1), - TFPort: proto.Int32(10), - Template: &v1.PodTemplateSpec{ - Spec: v1.PodSpec{ - Containers: []v1.Container{ - { - Name: "tensorflow", - }, - 
}, - }, - }, - TFReplicaType: tfv1alpha1.PS, - }, - { - Replicas: proto.Int32(1), - TFPort: proto.Int32(10), - Template: &v1.PodTemplateSpec{ - Spec: v1.PodSpec{ - Containers: []v1.Container{ - { - Name: "tensorflow", - }, - }, - }, - }, - TFReplicaType: tfv1alpha1.WORKER, - }, - }, - }, - }, - expectPdb: &v1beta1.PodDisruptionBudget{ - Spec: v1beta1.PodDisruptionBudgetSpec{ - MinAvailable: &minAvailable3, - Selector: &metav1.LabelSelector{ - MatchLabels: map[string]string{ - "runtime_id": "some-runtime-id", - "tf_job_name": "some-meta-name", - }, - }, - }, - }, - }, - } - - for _, c := range testCases { - recorder := record.NewFakeRecorder(100) - job, err := initJob(clientSet, &tfJobFake.Clientset{}, recorder, c.jobSpec) - if err != nil { - t.Errorf("j.initJob() error: %v", err) - } - - err = job.setupReplicas() - if err != nil { - t.Errorf("j.setupReplicas() error: %v", err) - } - - err = job.syncPdb() - if err != nil { - t.Errorf("j.Reconcile() error: %v", err) - } - - actualPdbList, err := clientSet.PolicyV1beta1().PodDisruptionBudgets(job.job.ObjectMeta.Namespace).List(metav1.ListOptions{}) - if err != nil { - t.Fatalf("Could not get PDB List: %v", err) - } - if len(actualPdbList.Items) != 1 && c.expectPdb != nil { - t.Fatalf("k8s should have one PDB but the length of actually created PDB isn't 1, Got %d", len(actualPdbList.Items)) - } - - if c.expectPdb == nil { - // non distributed training job, shouldn't have PDB - continue - } - - actualPdb := actualPdbList.Items[0] - if !reflect.DeepEqual(c.expectPdb.Spec, actualPdb.Spec) { - t.Fatalf("Got %v, Want %v", actualPdb.Spec, c.expectPdb.Spec) - } - } -} diff --git a/pkg/util/k8sutil/k8sutil.go b/pkg/util/k8sutil/k8sutil.go deleted file mode 100644 index 9421504ca6..0000000000 --- a/pkg/util/k8sutil/k8sutil.go +++ /dev/null @@ -1,120 +0,0 @@ -// Copyright 2018 The Kubeflow Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the 
License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package k8sutil - -import ( - "net" - "os" - - log "github.com/sirupsen/logrus" - apierrors "k8s.io/apimachinery/pkg/api/errors" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/labels" - "k8s.io/client-go/kubernetes" - _ "k8s.io/client-go/plugin/pkg/client/auth/gcp" // for gcp auth - "k8s.io/client-go/rest" - "k8s.io/client-go/tools/clientcmd" - - tfv1alpha1 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha1" -) - -const RecommendedConfigPathEnvVar = "KUBECONFIG" - -// TODO(jlewi): I think this function is used to add an owner to a resource. I think we we should use this -// method to ensure all resources created for the TFJob are owned by the TFJob. -func addOwnerRefToObject(o metav1.Object, r metav1.OwnerReference) { - o.SetOwnerReferences(append(o.GetOwnerReferences(), r)) -} - -func MustNewKubeClient() kubernetes.Interface { - cfg, err := GetClusterConfig() - if err != nil { - log.Fatal(err) - } - return kubernetes.NewForConfigOrDie(cfg) -} - -// Obtain the config from the Kube configuration used by kubeconfig, or from k8s cluster. -func GetClusterConfig() (*rest.Config, error) { - if len(os.Getenv(RecommendedConfigPathEnvVar)) > 0 { - // use the current context in kubeconfig - // This is very useful for running locally. 
- return clientcmd.BuildConfigFromFlags("", os.Getenv(RecommendedConfigPathEnvVar)) - } - - // Work around https://github.com/kubernetes/kubernetes/issues/40973 - // See https://github.com/coreos/etcd-operator/issues/731#issuecomment-283804819 - if len(os.Getenv("KUBERNETES_SERVICE_HOST")) == 0 { - addrs, err := net.LookupHost("kubernetes.default.svc") - if err != nil { - panic(err) - } - if err := os.Setenv("KUBERNETES_SERVICE_HOST", addrs[0]); err != nil { - return nil, err - } - } - if len(os.Getenv("KUBERNETES_SERVICE_PORT")) == 0 { - if err := os.Setenv("KUBERNETES_SERVICE_PORT", "443"); err != nil { - panic(err) - } - } - return rest.InClusterConfig() -} - -func IsKubernetesResourceAlreadyExistError(err error) bool { - return apierrors.IsAlreadyExists(err) -} - -func IsKubernetesResourceNotFoundError(err error) bool { - return apierrors.IsNotFound(err) -} - -// We are using internal api types for cluster related. -func JobListOpt(clusterName string) metav1.ListOptions { - return metav1.ListOptions{ - LabelSelector: labels.SelectorFromSet(LabelsForJob(clusterName)).String(), - } -} - -func LabelsForJob(jobName string) map[string]string { - return map[string]string{ - // TODO(jlewi): Need to set appropriate labels for TF. - "tf_job": jobName, - "app": tfv1alpha1.AppLabel, - } -} - -// TODO(jlewi): CascadeDeletOptions are part of garbage collection policy. -// Do we want to use this? See -// https://kubernetes.io/docs/concepts/workloads/controllers/garbage-collection/ -func CascadeDeleteOptions(gracePeriodSeconds int64) *metav1.DeleteOptions { - return &metav1.DeleteOptions{ - GracePeriodSeconds: func(t int64) *int64 { return &t }(gracePeriodSeconds), - PropagationPolicy: func() *metav1.DeletionPropagation { - foreground := metav1.DeletePropagationForeground - return &foreground - }(), - } -} - -// mergeLabels merges l2 into l1. Conflicting labels will be skipped. 
-func mergeLabels(l1, l2 map[string]string) { - for k, v := range l2 { - if _, ok := l1[k]; ok { - continue - } - l1[k] = v - } -} diff --git a/pkg/util/util.go b/pkg/util/util.go deleted file mode 100644 index fd623b5a89..0000000000 --- a/pkg/util/util.go +++ /dev/null @@ -1,74 +0,0 @@ -// Copyright 2018 The Kubeflow Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package util provides various helper routines. -package util - -import ( - "encoding/json" - "fmt" - "math/rand" - "time" - - log "github.com/sirupsen/logrus" -) - -const ( - // Environment variable for namespace when deployed on kubernetes - EnvKubeflowNamespace = "KUBEFLOW_NAMESPACE" -) - -// Pformat returns a pretty format output of any value that can be marshalled to JSON. 
-func Pformat(value interface{}) string { - if s, ok := value.(string); ok { - return s - } - valueJSON, err := json.MarshalIndent(value, "", " ") - if err != nil { - log.Warningf("Couldn't pretty format %v, error: %v", value, err) - return fmt.Sprintf("%v", value) - } - return string(valueJSON) -} - -var src = rand.NewSource(time.Now().UnixNano()) - -const letterBytes = "0123456789abcdefghijklmnopqrstuvwxyz" -const ( - letterIdxBits = 6 // 6 bits to represent a letter index - letterIdxMask = 1<= 0; { - if remain == 0 { - cache, remain = src.Int63(), letterIdxMax - } - if idx := int(cache & letterIdxMask); idx < len(letterBytes) { - b[i] = letterBytes[idx] - i-- - } - cache >>= letterIdxBits - remain-- - } - - return string(b) -} From 4e1dd8d297baadd2a4a66af252b6112fa510635a Mon Sep 17 00:00:00 2001 From: Penghao Cen Date: Thu, 15 Mar 2018 17:07:58 +0800 Subject: [PATCH 05/24] Import v1alpha2 logic code --- cmd/tf-operator/app/server.go | 99 ++- cmd/tf-operator/main.go | 4 +- pkg/apis/tensorflow/v1alpha2/constants.go | 20 + pkg/controller/controller.go | 466 ++++++++++---- pkg/controller/controller_pod.go | 273 ++++++++ pkg/controller/controller_ref_manager.go | 399 ++++++++++++ pkg/controller/controller_service.go | 258 ++++++++ pkg/controller/controller_tensorflow.go | 109 ++++ pkg/controller/controller_utils.go | 719 ++++++++++++++++++++++ pkg/util/signals/signal.go | 43 ++ pkg/util/signals/signal_posix.go | 26 + pkg/util/signals/signal_windows.go | 23 + 12 files changed, 2253 insertions(+), 186 deletions(-) create mode 100644 pkg/apis/tensorflow/v1alpha2/constants.go create mode 100644 pkg/controller/controller_pod.go create mode 100644 pkg/controller/controller_ref_manager.go create mode 100644 pkg/controller/controller_service.go create mode 100644 pkg/controller/controller_tensorflow.go create mode 100644 pkg/controller/controller_utils.go create mode 100644 pkg/util/signals/signal.go create mode 100644 pkg/util/signals/signal_posix.go create mode 100644 
pkg/util/signals/signal_windows.go diff --git a/cmd/tf-operator/app/server.go b/cmd/tf-operator/app/server.go index f4794d3dcb..811cebfb4d 100644 --- a/cmd/tf-operator/app/server.go +++ b/cmd/tf-operator/app/server.go @@ -16,37 +16,40 @@ package app import ( "fmt" - "io/ioutil" "os" "time" - "github.com/ghodss/yaml" log "github.com/sirupsen/logrus" + "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - clientset "k8s.io/client-go/kubernetes" - "k8s.io/client-go/rest" + kubeinformers "k8s.io/client-go/informers" + kubeclientset "k8s.io/client-go/kubernetes" + restclientset "k8s.io/client-go/rest" + "k8s.io/client-go/tools/clientcmd" election "k8s.io/client-go/tools/leaderelection" "k8s.io/client-go/tools/leaderelection/resourcelock" "k8s.io/client-go/tools/record" "github.com/kubeflow/tf-operator/cmd/tf-operator/app/options" - "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha1" - tfjobclient "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned" + "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha2" + tfjobclientset "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned" "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned/scheme" - informers "github.com/kubeflow/tf-operator/pkg/client/informers/externalversions" + tfjobinformers "github.com/kubeflow/tf-operator/pkg/client/informers/externalversions" "github.com/kubeflow/tf-operator/pkg/controller" - "github.com/kubeflow/tf-operator/pkg/util" - "github.com/kubeflow/tf-operator/pkg/util/k8sutil" + "github.com/kubeflow/tf-operator/pkg/util/signals" "github.com/kubeflow/tf-operator/version" ) var ( + // leader election config leaseDuration = 15 * time.Second renewDuration = 5 * time.Second retryPeriod = 3 * time.Second ) +const RecommendedKubeConfigPathEnv = "KUBECONFIG" + func Run(opt *options.ServerOption) error { // Check if the -version flag was passed and, if so, print the version and exit. 
@@ -54,40 +57,51 @@ func Run(opt *options.ServerOption) error { version.PrintVersionAndExit() } - namespace := os.Getenv(util.EnvKubeflowNamespace) + namespace := os.Getenv(v1alpha2.EnvKubeflowNamespace) if len(namespace) == 0 { log.Infof("KUBEFLOW_NAMESPACE not set, using default namespace") namespace = metav1.NamespaceDefault } - // To help debugging, immediately log version + // To help debugging, immediately log version. log.Infof("%+v", version.Info()) - config, err := k8sutil.GetClusterConfig() - if err != nil { - return err + // Set up signals so we handle the first shutdown signal gracefully. + stopCh := signals.SetupSignalHandler() + + // Note: ENV KUBECONFIG will overwrite user defined Kubeconfig option. + if len(os.Getenv(RecommendedKubeConfigPathEnv)) > 0 { + // use the current context in kubeconfig + // This is very useful for running locally. + opt.Kubeconfig = os.Getenv(RecommendedKubeConfigPathEnv) } - kubeClient, leaderElectionClient, tfJobClient, err := createClients(config) + // Get kubernetes config. + kcfg, err := clientcmd.BuildConfigFromFlags(opt.MasterURL, opt.Kubeconfig) if err != nil { - return err + log.Fatalf("Error building kubeconfig: %s", err.Error()) } - controllerConfig := readControllerConfig(opt.ControllerConfigFile) - - neverStop := make(chan struct{}) - defer close(neverStop) - - tfJobInformerFactory := informers.NewSharedInformerFactory(tfJobClient, time.Second*30) - controller, err := controller.New(kubeClient, tfJobClient, *controllerConfig, tfJobInformerFactory, opt.EnableGangScheduling) + // Create clients. + kubeClientSet, leaderElectionClientSet, tfJobClientSet, err := createClientSets(kcfg) if err != nil { return err } - go tfJobInformerFactory.Start(neverStop) + // Create informer factory. + kubeInformerFactory := kubeinformers.NewSharedInformerFactory(kubeClientSet, time.Second*30) + tfJobInformerFactory := tfjobinformers.NewSharedInformerFactory(tfJobClientSet, time.Second*30) + + // Create tf controller. 
+ tc := controller.NewTFJobController(kubeClientSet, tfJobClientSet, kubeInformerFactory, tfJobInformerFactory) + + // Start informer goroutines. + go kubeInformerFactory.Start(stopCh) + go tfJobInformerFactory.Start(stopCh) - run := func(stopCh <-chan struct{}) { - controller.Run(1, stopCh) + // Set leader election start function. + run := func(<-chan struct{}) { + tc.Run(opt.Threadiness, stopCh) } id, err := os.Hostname() @@ -104,13 +118,14 @@ func Run(opt *options.ServerOption) error { Namespace: namespace, Name: "tf-operator", }, - Client: leaderElectionClient.CoreV1(), + Client: leaderElectionClientSet.CoreV1(), LockConfig: resourcelock.ResourceLockConfig{ Identity: id, EventRecorder: recorder, }, } + // Start leader election. election.RunOrDie(election.LeaderElectionConfig{ Lock: rl, LeaseDuration: leaseDuration, @@ -127,41 +142,21 @@ func Run(opt *options.ServerOption) error { return nil } -func readControllerConfig(controllerConfigFile string) *v1alpha1.ControllerConfig { - controllerConfig := &v1alpha1.ControllerConfig{} - if controllerConfigFile != "" { - log.Infof("Loading controller config from %v.", controllerConfigFile) - data, err := ioutil.ReadFile(controllerConfigFile) - if err != nil { - log.Fatalf("Could not read file: %v. 
Error: %v", controllerConfigFile, err) - return controllerConfig - } - err = yaml.Unmarshal(data, controllerConfig) - if err != nil { - log.Fatalf("Could not parse controller config; Error: %v\n", err) - } - log.Infof("ControllerConfig: %v", util.Pformat(controllerConfig)) - } else { - log.Info("No controller_config_file provided; using empty config.") - } - return controllerConfig -} - -func createClients(config *rest.Config) (clientset.Interface, clientset.Interface, tfjobclient.Interface, error) { - kubeClient, err := clientset.NewForConfig(rest.AddUserAgent(config, "tfjob_operator")) +func createClientSets(config *restclientset.Config) (kubeclientset.Interface, kubeclientset.Interface, tfjobclientset.Interface, error) { + kubeClientSet, err := kubeclientset.NewForConfig(restclientset.AddUserAgent(config, "tf-operator")) if err != nil { return nil, nil, nil, err } - leaderElectionClient, err := clientset.NewForConfig(rest.AddUserAgent(config, "leader-election")) + leaderElectionClientSet, err := kubeclientset.NewForConfig(restclientset.AddUserAgent(config, "leader-election")) if err != nil { return nil, nil, nil, err } - tfJobClient, err := tfjobclient.NewForConfig(config) + tfJobClientSet, err := tfjobclientset.NewForConfig(config) if err != nil { return nil, nil, nil, err } - return kubeClient, leaderElectionClient, tfJobClient, nil + return kubeClientSet, leaderElectionClientSet, tfJobClientSet, nil } diff --git a/cmd/tf-operator/main.go b/cmd/tf-operator/main.go index a91fe1aa2f..d4fcfe4bef 100644 --- a/cmd/tf-operator/main.go +++ b/cmd/tf-operator/main.go @@ -25,7 +25,7 @@ import ( ) func init() { - // Add filename as one of the fields of the structured log message + // Add filename as one of the fields of the structured log message. 
filenameHook := filename.NewHook() filenameHook.Field = "filename" log.AddHook(filenameHook) @@ -38,7 +38,7 @@ func main() { flag.Parse() if s.JSONLogFormat { - // Output logs in a json format so that it can be parsed by services like Stackdriver + // Output logs in a json format so that it can be parsed by services like Stackdriver. log.SetFormatter(&log.JSONFormatter{}) } diff --git a/pkg/apis/tensorflow/v1alpha2/constants.go b/pkg/apis/tensorflow/v1alpha2/constants.go new file mode 100644 index 0000000000..2f712de58d --- /dev/null +++ b/pkg/apis/tensorflow/v1alpha2/constants.go @@ -0,0 +1,20 @@ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package v1alpha2 + +const ( + // EnvKubeflowNamespace is ENV for kubeflow namespace specified by user. + EnvKubeflowNamespace = "KUBEFLOW_NAMESPACE" +) diff --git a/pkg/controller/controller.go b/pkg/controller/controller.go index 130e5d3ffa..c3db9c398d 100644 --- a/pkg/controller/controller.go +++ b/pkg/controller/controller.go @@ -12,152 +12,259 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Package controller provides a Kubernetes controller for a TensorFlow job resource. +// Package controller provides a Kubernetes controller for a TFJob resource. 
+ package controller import ( - "errors" "fmt" + "strings" "time" log "github.com/sirupsen/logrus" + "k8s.io/api/core/v1" - apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/apimachinery/pkg/util/runtime" utilruntime "k8s.io/apimachinery/pkg/util/runtime" "k8s.io/apimachinery/pkg/util/wait" - "k8s.io/client-go/kubernetes" + kubeinformers "k8s.io/client-go/informers" + kubeclientset "k8s.io/client-go/kubernetes" "k8s.io/client-go/kubernetes/scheme" typedcorev1 "k8s.io/client-go/kubernetes/typed/core/v1" + corelisters "k8s.io/client-go/listers/core/v1" "k8s.io/client-go/tools/cache" "k8s.io/client-go/tools/record" "k8s.io/client-go/util/workqueue" - tfv1alpha1 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha1" - tfjobclient "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned" - kubeflowscheme "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned/scheme" - informers "github.com/kubeflow/tf-operator/pkg/client/informers/externalversions" - listers "github.com/kubeflow/tf-operator/pkg/client/listers/kubeflow/v1alpha1" - "github.com/kubeflow/tf-operator/pkg/trainer" + tfv1alpha2 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha2" + tfjobclientset "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned" + tfjobscheme "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned/scheme" + tfjobinformers "github.com/kubeflow/tf-operator/pkg/client/informers/externalversions" + tfjoblisters "github.com/kubeflow/tf-operator/pkg/client/listers/kubeflow/v1alpha2" ) const ( - controllerName = "kubeflow" -) + controllerName = "tf-operator" -var ( - ErrVersionOutdated = errors.New("requested version is outdated in apiserver") + // labels for pods and servers. 
+ tfReplicaTypeLabel = "tf-replica-type" + tfReplicaIndexLabel = "tf-replica-index" - // IndexerInformer uses a delta queue, therefore for deletes we have to use this - // key function but it should be just fine for non delete events. - keyFunc = cache.DeletionHandlingMetaNamespaceKeyFunc + hit = "hit" + noHit = "no-hit" - // DefaultJobBackOff is the max backoff period, exported for the e2e test - DefaultJobBackOff = 10 * time.Second - // MaxJobBackOff is the max backoff period, exported for the e2e test - MaxJobBackOff = 360 * time.Second + defaultPort = 2222 + defaultPortStr = "2222" ) -type Controller struct { - KubeClient kubernetes.Interface - TFJobClient tfjobclient.Interface +// controllerKind contains the schema.GroupVersionKind for this controller type. +var controllerKind = tfv1alpha2.SchemeGroupVersion.WithKind("TFJob") + +var groupVersionKind = schema.GroupVersionKind{ + Group: tfv1alpha2.GroupName, + Version: tfv1alpha2.GroupVersion, + Kind: tfv1alpha2.TFJobResourceKind, +} + +// TFJobControllerConfiguration contains configuration of tf-operator. +// DefaultTimerConfig is the suggested tf-operator configuration for production. +type TFJobControllerConfiguration struct { + // ReconcilerSyncLoopPeriod is the amount of time the reconciler sync states loop + // wait between two reconciler sync. + // It is set to 15 sec by default. + // TODO(cph): maybe we can let it grows by multiple in the future + // and up to 5 minutes to reduce idle loop. + // e.g. 15s, 30s, 60s, 120s... + ReconcilerSyncLoopPeriod metav1.Duration +} + +// DefaultTFJobControllerConfiguration is the suggested tf-operator configuration for production. +var DefaultTFJobControllerConfiguration TFJobControllerConfiguration = TFJobControllerConfiguration{ + ReconcilerSyncLoopPeriod: metav1.Duration{Duration: 15 * time.Second}, +} + +type TFJobController struct { + config TFJobControllerConfiguration + + // podControl is used to add or delete pods. 
+ podControl PodControlInterface + + // serviceControl is used to add or delete services. + serviceControl ServiceControlInterface - config tfv1alpha1.ControllerConfig - jobs map[string]*trainer.TrainingJob + // kubeClientSet is a standard kubernetes clientset. + kubeClientSet kubeclientset.Interface - TFJobLister listers.TFJobLister - TFJobSynced cache.InformerSynced + // tfJobClientSet is a clientset for CRD TFJob. + tfJobClientSet tfjobclientset.Interface - // WorkQueue is a rate limited work queue. This is used to queue work to be + // To allow injection of syncTFJob for testing. + syncHandler func(tfJobKey string) (bool, error) + + // Listers for TFJob, Pod and Service + // tfJobLister can list/get tfjobs from the shared informer's store. + tfJobLister tfjoblisters.TFJobLister + + // podLister can list/get pods from the shared informer's store. + podLister corelisters.PodLister + + // serviceLister can list/get services from the shared informer's store. + serviceLister corelisters.ServiceLister + + // tfJobListerSynced returns true if the tfjob store has been synced at least once. + tfJobListerSynced cache.InformerSynced + + // podListerSynced returns true if the pod store has been synced at least once. + podListerSynced cache.InformerSynced + + // serviceListerSynced returns true if the service store has been synced at least once. + serviceListerSynced cache.InformerSynced + + // A TTLCache of pod/services creates/deletes each tfjob expects to see + // We use TFJob namespace/name + TFReplicaType + pods/services as an expectation key, + // For example, there is a TFJob with namespace "tf-operator" and name "tfjob-abc": + // { + // "PS": { + // "Replicas": 2, + // }, + // "Worker": { + // "Replicas": 4, + // } + // } + // We will create 4 expectations: + // - "tf-operator/tfjob-abc/ps/services", expects 2 adds. + // - "tf-operator/tfjob-abc/ps/pods", expects 2 adds. + // - "tf-operator/tfjob-abc/worker/services", expects 4 adds. 
+ // - "tf-operator/tfjob-abc/worker/pods", expects 4 adds. + expectations ControllerExpectationsInterface + + // workQueue is a rate limited work queue. This is used to queue work to be // processed instead of performing it as soon as a change happens. This // means we can ensure we only process a fixed amount of resources at a // time, and makes it easy to ensure we are never processing the same item // simultaneously in two different workers. - WorkQueue workqueue.RateLimitingInterface + workQueue workqueue.RateLimitingInterface // recorder is an event recorder for recording Event resources to the // Kubernetes API. recorder record.EventRecorder - - syncHandler func(jobKey string) (bool, error) - - enableGangScheduling bool } -func New(kubeClient kubernetes.Interface, tfJobClient tfjobclient.Interface, - config tfv1alpha1.ControllerConfig, tfJobInformerFactory informers.SharedInformerFactory, - enableGangScheduling bool) (*Controller, error) { - tfJobInformer := tfJobInformerFactory.Kubeflow().V1alpha1().TFJobs() +// NewTFJobController returns a new TFJob controller. 
+func NewTFJobController( + kubeClientSet kubeclientset.Interface, + tfJobClientSet tfjobclientset.Interface, + kubeInformerFactory kubeinformers.SharedInformerFactory, + tfJobInformerFactory tfjobinformers.SharedInformerFactory) *TFJobController { + + tfjobscheme.AddToScheme(scheme.Scheme) - kubeflowscheme.AddToScheme(scheme.Scheme) log.Debug("Creating event broadcaster") eventBroadcaster := record.NewBroadcaster() eventBroadcaster.StartLogging(log.Infof) - eventBroadcaster.StartRecordingToSink(&typedcorev1.EventSinkImpl{Interface: kubeClient.CoreV1().Events("")}) + eventBroadcaster.StartRecordingToSink(&typedcorev1.EventSinkImpl{Interface: kubeClientSet.CoreV1().Events("")}) recorder := eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: controllerName}) - controller := &Controller{ - KubeClient: kubeClient, - TFJobClient: tfJobClient, - WorkQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "TFjobs"), - recorder: recorder, - // TODO(jlewi)): What to do about cluster.Cluster? 
- jobs: make(map[string]*trainer.TrainingJob), - config: config, - enableGangScheduling: enableGangScheduling, + realPodControl := RealPodControl{ + KubeClient: kubeClientSet, + Recorder: eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "tfjob-controller"}), } - log.Info("Setting up event handlers") - // Set up an event handler for when Foo resources change - tfJobInformer.Informer().AddEventHandler( - cache.FilteringResourceEventHandler{ - FilterFunc: func(obj interface{}) bool { - switch t := obj.(type) { - case *tfv1alpha1.TFJob: - log.Debugf("filter tfjob name: %v", t.Name) - return true - default: - return false - } - }, - Handler: cache.ResourceEventHandlerFuncs{ - AddFunc: controller.enqueueController, - UpdateFunc: func(oldObj, newObj interface{}) { - controller.enqueueController(newObj) - }, - DeleteFunc: controller.enqueueController, - }, - }) - - controller.TFJobLister = tfJobInformer.Lister() - controller.TFJobSynced = tfJobInformer.Informer().HasSynced - controller.syncHandler = controller.syncTFJob - - return controller, nil + realServiceControl := RealServiceControl{ + KubeClient: kubeClientSet, + Recorder: eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "tfjob-controller"}), + } + + // Create new TFJobController. + tc := &TFJobController{ + podControl: realPodControl, + serviceControl: realServiceControl, + kubeClientSet: kubeClientSet, + tfJobClientSet: tfJobClientSet, + expectations: NewControllerExpectations(), + workQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "tfjobs"), + recorder: recorder, + } + + // Set sync handler. + tc.syncHandler = tc.syncTFJob + + // Create tfjob informer. + tfJobInformer := tfJobInformerFactory.Kubeflow().V1alpha2().TFJobs() + + // Set up an event handler for when tfjob resources change. 
+ tfJobInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ + AddFunc: tc.enqueueTFJob, + UpdateFunc: tc.updateTFJob, + // This will enter the sync loop and no-op, + // because the tfjob has been deleted from the store. + DeleteFunc: tc.enqueueTFJob, + }) + + tc.tfJobLister = tfJobInformer.Lister() + tc.tfJobListerSynced = tfJobInformer.Informer().HasSynced + + // Create pod informer. + podInformer := kubeInformerFactory.Core().V1().Pods() + + // Set up an event handler for when pod resources change + podInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ + AddFunc: tc.addPod, + UpdateFunc: tc.updatePod, + DeleteFunc: tc.deletePod, + }) + + tc.podLister = podInformer.Lister() + tc.podListerSynced = podInformer.Informer().HasSynced + + // Create service informer. + serviceInformer := kubeInformerFactory.Core().V1().Services() + + // Set up an event handler for when service resources change. + serviceInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ + AddFunc: tc.addService, + UpdateFunc: tc.updateService, + DeleteFunc: tc.deleteService, + }) + + tc.serviceLister = serviceInformer.Lister() + tc.serviceListerSynced = serviceInformer.Informer().HasSynced + + return tc } // Run will set up the event handlers for types we are interested in, as well // as syncing informer caches and starting workers. It will block until stopCh // is closed, at which point it will shutdown the workqueue and wait for // workers to finish processing their current work items. -func (c *Controller) Run(threadiness int, stopCh <-chan struct{}) error { +func (tc *TFJobController) Run(threadiness int, stopCh <-chan struct{}) error { defer runtime.HandleCrash() - defer c.WorkQueue.ShutDown() + defer tc.workQueue.ShutDown() - // Start the informer factories to begin populating the informer caches + // Start the informer factories to begin populating the informer caches. 
log.Info("Starting TFJob controller") - // Wait for the caches to be synced before starting workers + // Wait for the caches to be synced before starting workers. log.Info("Waiting for informer caches to sync") - if ok := cache.WaitForCacheSync(stopCh, c.TFJobSynced); !ok { - return fmt.Errorf("failed to wait for caches to sync") + if ok := cache.WaitForCacheSync(stopCh, tc.tfJobListerSynced); !ok { + return fmt.Errorf("failed to wait for tfjob caches to sync") + } + + if ok := cache.WaitForCacheSync(stopCh, tc.podListerSynced); !ok { + return fmt.Errorf("failed to wait for pod caches to sync") + } + + if ok := cache.WaitForCacheSync(stopCh, tc.serviceListerSynced); !ok { + return fmt.Errorf("failed to wait for service caches to sync") } log.Infof("Starting %v workers", threadiness) - // Launch workers to process TFJob resources + // Launch workers to process TFJob resources. for i := 0; i < threadiness; i++ { - go wait.Until(c.runWorker, time.Second, stopCh) + go wait.Until(tc.runWorker, time.Second, stopCh) } log.Info("Started workers") @@ -170,103 +277,198 @@ func (c *Controller) Run(threadiness int, stopCh <-chan struct{}) error { // runWorker is a long-running function that will continually call the // processNextWorkItem function in order to read and process a message on the // workqueue. -func (c *Controller) runWorker() { - for c.processNextWorkItem() { +func (tc *TFJobController) runWorker() { + for tc.processNextWorkItem() { } } // processNextWorkItem will read a single work item off the workqueue and // attempt to process it, by calling the syncHandler. 
-func (c *Controller) processNextWorkItem() bool { - key, quit := c.WorkQueue.Get() +func (tc *TFJobController) processNextWorkItem() bool { + key, quit := tc.workQueue.Get() if quit { return false } - defer c.WorkQueue.Done(key) + defer tc.workQueue.Done(key) - forget, err := c.syncHandler(key.(string)) + forget, err := tc.syncHandler(key.(string)) if err == nil { if forget { - c.WorkQueue.Forget(key) + tc.workQueue.Forget(key) } return true } - utilruntime.HandleError(fmt.Errorf("Error syncing job: %v", err)) - c.WorkQueue.AddRateLimited(key) + utilruntime.HandleError(fmt.Errorf("Error syncing tfjob: %v", err)) + tc.workQueue.AddRateLimited(key) return true } -// syncTFJob will sync the job with the given. This function is not meant to be invoked -// concurrently with the same key. -// -// When a job is completely processed it will return true indicating that its ok to forget about this job since -// no more processing will occur for it. -func (c *Controller) syncTFJob(key string) (bool, error) { +func (tc *TFJobController) enqueueTFJob(tfjob interface{}) { + key, err := KeyFunc(tfjob) + if err != nil { + utilruntime.HandleError(fmt.Errorf("Couldn't get key for tfjob object %#v: %v", tfjob, err)) + return + } + + tc.workQueue.Add(key) +} + +// syncTFJob will sync the tfjob with the given key if it has had its expectations fulfilled, meaning +// it did not expect to see any more of its pods/services created or deleted. +// This function is not meant to be invoked concurrently with the same key. 
+func (tc *TFJobController) syncTFJob(key string) (bool, error) { startTime := time.Now() defer func() { - log.Debugf("Finished syncing job %q (%v)", key, time.Since(startTime)) + log.Infof("Finished syncing tfjob %q (%v)", key, time.Since(startTime)) }() - ns, name, err := cache.SplitMetaNamespaceKey(key) + namespace, name, err := cache.SplitMetaNamespaceKey(key) if err != nil { return false, err } - if len(ns) == 0 || len(name) == 0 { - return false, fmt.Errorf("invalid job key %q: either namespace or name is missing", key) - } - - tfJob, err := c.TFJobLister.TFJobs(ns).Get(name) + tfjob, err := tc.tfJobLister.TFJobs(namespace).Get(name) if err != nil { - if apierrors.IsNotFound(err) { - log.Debugf("Job has been deleted: %v", key) + if errors.IsNotFound(err) { + log.Infof("TFJob has been deleted: %v", key) + // jm.expectations.DeleteExpectations(key) return true, nil } return false, err } - // Create a new TrainingJob if there is no TrainingJob stored for it in the jobs map or if the UID's don't match. - // The UID's won't match in the event we deleted the job and then recreated the job with the same name. - if cJob, ok := c.jobs[key]; !ok || cJob.UID() != tfJob.UID { - nc, err := trainer.NewJob(c.KubeClient, c.TFJobClient, c.recorder, tfJob, &c.config) + tfjobNeedsSync := tc.satisfiedExpectations(tfjob) + + var reconcileTFJobsErr error + if tfjobNeedsSync && tfjob.DeletionTimestamp == nil { + reconcileTFJobsErr = tc.reconcileTFJobs(tfjob) + } + + if reconcileTFJobsErr != nil { + return false, reconcileTFJobsErr + } + + return true, err +} + +// reconcileTFJobs checks and updates replicas for each given TFReplicaSpec. +// It will requeue the tfjob in case of an error while creating/deleting pods/services. 
+func (tc *TFJobController) reconcileTFJobs(tfjob *tfv1alpha2.TFJob) error { + pods, err := tc.getPodsForTFJob(tfjob) + + if err != nil { + log.Infof("getPodsForTFJob error %v", err) + return err + } + + services, err := tc.getServicesForTFJob(tfjob) + + if err != nil { + log.Infof("getServicesForTFJob error %v", err) + return err + } + + // Diff current active pods/services with replicas. + for rtype, spec := range tfjob.Spec.TFReplicaSpecs { + err = tc.reconcilePods(tfjob, pods, rtype, spec) if err != nil { - return false, err + log.Infof("reconcilePods error %v", err) + return err } - c.jobs[key] = nc - } - nc := c.jobs[key] + err = tc.reconcileServices(tfjob, services, rtype, spec) - if err := nc.Reconcile(&c.config, c.enableGangScheduling); err != nil { - return false, err + if err != nil { + log.Infof("reconcileServices error %v", err) + return err + } } - tfJob, err = c.TFJobClient.KubeflowV1alpha1().TFJobs(tfJob.ObjectMeta.Namespace).Get(tfJob.ObjectMeta.Name, metav1.GetOptions{}) + return nil +} + +func genGeneralName(tfjobKey, rtype, index string) string { + n := tfjobKey + "-" + rtype + "-" + index + return strings.Replace(n, "/", "-", -1) +} +// satisfiedExpectations returns true if the required adds/dels for the given tfjob have been observed. +// Add/del counts are established by the controller at sync time, and updated as controllees are observed by the controller +// manager. +func (tc *TFJobController) satisfiedExpectations(tfjob *tfv1alpha2.TFJob) bool { + satisfied := false + tfjobKey, err := KeyFunc(tfjob) if err != nil { - return false, err + utilruntime.HandleError(fmt.Errorf("Couldn't get key for tfjob object %#v: %v", tfjob, err)) + return false } - // TODO(jlewi): This logic will need to change when/if we get rid of phases and move to conditions. At that - // case we should forget about a job when the appropriate condition is reached. 
- if tfJob.Status.Phase == tfv1alpha1.TFJobPhaseCleanUp { - return true, nil - } else { - return false, nil + for rtype, _ := range tfjob.Spec.TFReplicaSpecs { + // Check the expectations of the pods. + expectationPodsKey := genExpectationPodsKey(tfjobKey, string(rtype)) + satisfied = satisfied || tc.expectations.SatisfiedExpectations(expectationPodsKey) + + // Check the expectations of the services. + expectationServicesKey := genExpectationServicesKey(tfjobKey, string(rtype)) + satisfied = satisfied || tc.expectations.SatisfiedExpectations(expectationServicesKey) + } + + return satisfied +} + +func genLabels(tfjobKey string) map[string]string { + return map[string]string{ + "group_name": tfv1alpha2.GroupName, + "tf_job_key": strings.Replace(tfjobKey, "/", "-", -1), } +} + +// When a pod is updated, enqueue the current tfjob. +func (tc *TFJobController) updateTFJob(old, cur interface{}) { + oldTFJob := old.(*tfv1alpha2.TFJob) + log.Infof("Updating tfjob: %s", oldTFJob.Name) + tc.enqueueTFJob(cur) +} +func (tc *TFJobController) updateTFJobStatus(tfjob *tfv1alpha2.TFJob) error { + // TODO + return nil } -// obj could be an *batch.Job, or a DeletionFinalStateUnknown marker item. -func (c *Controller) enqueueController(obj interface{}) { - key, err := keyFunc(obj) +// resolveControllerRef returns the tfjob referenced by a ControllerRef, +// or nil if the ControllerRef could not be resolved to a matching tfjob +// of the correct Kind. +func (tc *TFJobController) resolveControllerRef(namespace string, controllerRef *metav1.OwnerReference) *tfv1alpha2.TFJob { + // We can't look up by UID, so look up by Name and then verify UID. + // Don't even try to look up by Name if it's the wrong Kind. 
+ if controllerRef.Kind != controllerKind.Kind { + return nil + } + tfjob, err := tc.tfJobLister.TFJobs(namespace).Get(controllerRef.Name) if err != nil { - utilruntime.HandleError(fmt.Errorf("Couldn't get key for object %+v: %v", obj, err)) - return + return nil + } + if tfjob.UID != controllerRef.UID { + // The controller we found with this Name is not the same one that the + // ControllerRef points to. + return nil + } + return tfjob +} + +func genOwnerReference(tfjob *tfv1alpha2.TFJob) *metav1.OwnerReference { + boolPtr := func(b bool) *bool { return &b } + controllerRef := &metav1.OwnerReference{ + APIVersion: groupVersionKind.GroupVersion().String(), + Kind: groupVersionKind.Kind, + Name: tfjob.Name, + UID: tfjob.UID, + BlockOwnerDeletion: boolPtr(true), + Controller: boolPtr(true), } - c.WorkQueue.AddRateLimited(key) + return controllerRef } diff --git a/pkg/controller/controller_pod.go b/pkg/controller/controller_pod.go new file mode 100644 index 0000000000..94486578a9 --- /dev/null +++ b/pkg/controller/controller_pod.go @@ -0,0 +1,273 @@ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package controller provides a Kubernetes controller for a TFJob resource. 
+ +package controller + +import ( + "fmt" + "strings" + + log "github.com/sirupsen/logrus" + + "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + + tfv1alpha2 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha2" +) + +// reconcilePods checks and updates pods for each given TFReplicaSpec. +// It will requeue the tfjob in case of an error while creating/deleting pods. +func (tc *TFJobController) reconcilePods( + tfjob *tfv1alpha2.TFJob, + pods []*v1.Pod, + rtype tfv1alpha2.TFReplicaType, + spec *tfv1alpha2.TFReplicaSpec) error { + tfjobKey, err := KeyFunc(tfjob) + if err != nil { + utilruntime.HandleError(fmt.Errorf("Couldn't get key for tfjob object %#v: %v", tfjob, err)) + return err + } + + // Convert TFReplicaType to lower string. + rt := strings.ToLower(string(rtype)) + + // Get active pods for this TFReplicaType. + activePods := filterActivePodsForTFReplicaType(pods, rt) + + diff := len(activePods) - int(*(spec.Replicas)) + + if diff < 0 { + // Need to create new pods. + diffIndexes := getDiffPodIndexes(activePods, *spec.Replicas) + if diff+len(diffIndexes) != 0 { + // This should never happened. + return fmt.Errorf("diff is not equal to length of diffIndexes") + } + + expectationPodsKey := genExpectationPodsKey(tfjobKey, rt) + tc.expectations.ExpectCreations(expectationPodsKey, int(diff)) + + for _, index := range diffIndexes { + log.Infof("need to create new pod: %s-%s", rt, index) + + // Create OwnerReference. + controllerRef := genOwnerReference(tfjob) + + // Append tfReplicaTypeLabel and tfReplicaIndexLabel labels. + pTemplate := spec.Template.DeepCopy() + + labels := genLabels(tfjobKey) + labels[tfReplicaTypeLabel] = rt + labels[tfReplicaIndexLabel] = index + + pTemplate.Labels = labels + + // Generate TF_CONFIG JSON string. 
+ tfConfigStr := genTFConfigJSONStr(tfjob, rt, index) + + if tfConfigStr == "" { + return nil + } + + // Add TF_CONFIG environment variable. + for _, c := range pTemplate.Spec.Containers { + if len(c.Env) == 0 { + c.Env = make([]v1.EnvVar, 0) + } + c.Env = append(c.Env, v1.EnvVar{ + Name: "TF_CONFIG", + Value: tfConfigStr, + }) + } + + err := tc.podControl.CreatePodsWithControllerRef(tfjob.Namespace, pTemplate, tfjob, controllerRef) + if err != nil && errors.IsTimeout(err) { + // Pod is created but its initialization has timed out. + // If the initialization is successful eventually, the + // controller will observe the creation via the informer. + // If the initialization fails, or if the pod keeps + // uninitialized for a long time, the informer will not + // receive any update, and the controller will create a new + // pod when the expectation expires. + return nil + } + return err + } + } else if diff > 0 { + // TODO(CPH): Need to delete pods. + } + + return nil +} + +// getDiffPodIndexes checks and gets diff indexes from desired and current. +func getDiffPodIndexes(activePods []*v1.Pod, replicas int32) []string { + desiredIndexes := make(map[string]string) + + for i := int32(0); i < replicas; i++ { + desiredIndexes[fmt.Sprintf("%d", i)] = noHit + } + + for _, pod := range activePods { + if _, ok := pod.Labels[tfReplicaIndexLabel]; !ok { + continue + } + + index := pod.Labels[tfReplicaIndexLabel] + + if _, ok := desiredIndexes[index]; ok { + desiredIndexes[index] = hit + } + } + + diffIndexes := []string{} + for index, hit := range desiredIndexes { + if hit == noHit { + diffIndexes = append(diffIndexes, index) + } + } + + return diffIndexes +} + +// getPodsForTFJob returns the set of pods that this tfjob should manage. +// It also reconciles ControllerRef by adopting/orphaning. +// Note that the returned Pods are pointers into the cache. 
+func (tc *TFJobController) getPodsForTFJob(tfjob *tfv1alpha2.TFJob) ([]*v1.Pod, error) { + tfjobKey, err := KeyFunc(tfjob) + if err != nil { + utilruntime.HandleError(fmt.Errorf("Couldn't get key for tfjob object %#v: %v", tfjob, err)) + return nil, err + } + + // Create selector. + selector, err := metav1.LabelSelectorAsSelector(&metav1.LabelSelector{ + MatchLabels: genLabels(tfjobKey), + }) + + if err != nil { + return nil, fmt.Errorf("couldn't convert Job selector: %v", err) + } + // List all pods to include those that don't match the selector anymore + // but have a ControllerRef pointing to this controller. + pods, err := tc.podLister.Pods(tfjob.Namespace).List(labels.Everything()) + if err != nil { + return nil, err + } + + // If any adoptions are attempted, we should first recheck for deletion + // with an uncached quorum read sometime after listing Pods (see #42639). + canAdoptFunc := RecheckDeletionTimestamp(func() (metav1.Object, error) { + fresh, err := tc.tfJobClientSet.KubeflowV1alpha2().TFJobs(tfjob.Namespace).Get(tfjob.Name, metav1.GetOptions{}) + if err != nil { + return nil, err + } + if fresh.UID != tfjob.UID { + return nil, fmt.Errorf("original TFJob %v/%v is gone: got uid %v, wanted %v", tfjob.Namespace, tfjob.Name, fresh.UID, tfjob.UID) + } + return fresh, nil + }) + cm := NewPodControllerRefManager(tc.podControl, tfjob, selector, controllerKind, canAdoptFunc) + return cm.ClaimPods(pods) +} + +// filterActivePodsForTFReplicaType returns pods that have not terminated, +// and belong to a TFReplicaType. 
+func filterActivePodsForTFReplicaType(pods []*v1.Pod, tfReplicaType string) []*v1.Pod { + activePods := FilterActivePods(pods) + + var result []*v1.Pod + + tfReplicaSelector := &metav1.LabelSelector{ + MatchLabels: make(map[string]string), + } + + tfReplicaSelector.MatchLabels[tfReplicaTypeLabel] = tfReplicaType + + for _, pod := range activePods { + selector, _ := metav1.LabelSelectorAsSelector(tfReplicaSelector) + if !selector.Matches(labels.Set(pod.Labels)) { + continue + } + result = append(result, pod) + } + return result +} + +func genExpectationPodsKey(tfjobKey, replicaType string) string { + return tfjobKey + "/" + strings.ToLower(replicaType) + "/pods" +} + +// When a pod is created, enqueue the tfjob that manages it and update its expectations. +func (tc *TFJobController) addPod(obj interface{}) { + pod := obj.(*v1.Pod) + if pod.DeletionTimestamp != nil { + // on a restart of the controller, it's possible a new pod shows up in a state that + // is already pending deletion. Prevent the pod from being a creation observation. + // tc.deletePod(pod) + return + } + + // If it has a ControllerRef, that's all that matters. + if controllerRef := metav1.GetControllerOf(pod); controllerRef != nil { + tfjob := tc.resolveControllerRef(pod.Namespace, controllerRef) + if tfjob == nil { + return + } + + tfjobKey, err := KeyFunc(tfjob) + if err != nil { + return + } + + if _, ok := pod.Labels[tfReplicaTypeLabel]; !ok { + log.Infof("This pod maybe not created by tf-operator") + return + } + + rtype := pod.Labels[tfReplicaTypeLabel] + expectationPodsKey := genExpectationPodsKey(tfjobKey, rtype) + + tc.expectations.CreationObserved(expectationPodsKey) + tc.enqueueTFJob(tfjob) + + return + } + + // Otherwise, it's an orphan. Get a list of all matching controllers and sync + // them to see if anyone wants to adopt it. + // DO NOT observe creation because no controller should be waiting for an + // orphan. 
+ // for _, tfjob := range tc.getPodJobs(pod) { + // tc.enqueueTFJob(tfjob) + // } +} + +// When a pod is updated, figure out what tfjob/s manage it and wake them up. +// If the labels of the pod have changed we need to awaken both the old +// and new replica set. old and cur must be *v1.Pod types. +func (tc *TFJobController) updatePod(old, cur interface{}) { + // TODO(CPH): handle this gracefully. +} + +// When a pod is deleted, enqueue the tfjob that manages the pod and update its expectations. +// obj could be an *v1.Pod, or a DeletionFinalStateUnknown marker item. +func (tc *TFJobController) deletePod(obj interface{}) { + // TODO(CPH): handle this gracefully. +} diff --git a/pkg/controller/controller_ref_manager.go b/pkg/controller/controller_ref_manager.go new file mode 100644 index 0000000000..acc17b089c --- /dev/null +++ b/pkg/controller/controller_ref_manager.go @@ -0,0 +1,399 @@ +/* +Copyright 2016 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +// Note(CPH): this file is copied from k8s.io/kubernetes/pkg/controller +// We should not import the huge package k8s.io/kubernetes/pkg + +package controller + +import ( + "fmt" + "sync" + + "github.com/golang/glog" + // apps "k8s.io/api/apps/v1" + "k8s.io/api/core/v1" + // extensions "k8s.io/api/extensions/v1beta1" + "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/apimachinery/pkg/runtime/schema" + utilerrors "k8s.io/apimachinery/pkg/util/errors" +) + +type BaseControllerRefManager struct { + Controller metav1.Object + Selector labels.Selector + + canAdoptErr error + canAdoptOnce sync.Once + CanAdoptFunc func() error +} + +func (m *BaseControllerRefManager) CanAdopt() error { + m.canAdoptOnce.Do(func() { + if m.CanAdoptFunc != nil { + m.canAdoptErr = m.CanAdoptFunc() + } + }) + return m.canAdoptErr +} + +// ClaimObject tries to take ownership of an object for this controller. +// +// It will reconcile the following: +// * Adopt orphans if the match function returns true. +// * Release owned objects if the match function returns false. +// +// A non-nil error is returned if some form of reconciliation was attempted and +// failed. Usually, controllers should try again later in case reconciliation +// is still needed. +// +// If the error is nil, either the reconciliation succeeded, or no +// reconciliation was necessary. The returned boolean indicates whether you now +// own the object. +// +// No reconciliation will be attempted if the controller is being deleted. +func (m *BaseControllerRefManager) ClaimObject(obj metav1.Object, match func(metav1.Object) bool, adopt, release func(metav1.Object) error) (bool, error) { + controllerRef := metav1.GetControllerOf(obj) + if controllerRef != nil { + if controllerRef.UID != m.Controller.GetUID() { + // Owned by someone else. Ignore. + return false, nil + } + if match(obj) { + // We already own it and the selector matches. 
+ // Return true (successfully claimed) before checking deletion timestamp. + // We're still allowed to claim things we already own while being deleted + // because doing so requires taking no actions. + return true, nil + } + // Owned by us but selector doesn't match. + // Try to release, unless we're being deleted. + if m.Controller.GetDeletionTimestamp() != nil { + return false, nil + } + if err := release(obj); err != nil { + // If the pod no longer exists, ignore the error. + if errors.IsNotFound(err) { + return false, nil + } + // Either someone else released it, or there was a transient error. + // The controller should requeue and try again if it's still stale. + return false, err + } + // Successfully released. + return false, nil + } + + // It's an orphan. + if m.Controller.GetDeletionTimestamp() != nil || !match(obj) { + // Ignore if we're being deleted or selector doesn't match. + return false, nil + } + if obj.GetDeletionTimestamp() != nil { + // Ignore if the object is being deleted + return false, nil + } + // Selector matches. Try to adopt. + if err := adopt(obj); err != nil { + // If the pod no longer exists, ignore the error. + if errors.IsNotFound(err) { + return false, nil + } + // Either someone else claimed it first, or there was a transient error. + // The controller should requeue and try again if it's still orphaned. + return false, err + } + // Successfully adopted. + return true, nil +} + +type PodControllerRefManager struct { + BaseControllerRefManager + controllerKind schema.GroupVersionKind + podControl PodControlInterface +} + +// NewPodControllerRefManager returns a PodControllerRefManager that exposes +// methods to manage the controllerRef of pods. +// +// The CanAdopt() function can be used to perform a potentially expensive check +// (such as a live GET from the API server) prior to the first adoption. +// It will only be called (at most once) if an adoption is actually attempted. 
+// If CanAdopt() returns a non-nil error, all adoptions will fail. +// +// NOTE: Once CanAdopt() is called, it will not be called again by the same +// PodControllerRefManager instance. Create a new instance if it makes +// sense to check CanAdopt() again (e.g. in a different sync pass). +func NewPodControllerRefManager( + podControl PodControlInterface, + controller metav1.Object, + selector labels.Selector, + controllerKind schema.GroupVersionKind, + canAdopt func() error, +) *PodControllerRefManager { + return &PodControllerRefManager{ + BaseControllerRefManager: BaseControllerRefManager{ + Controller: controller, + Selector: selector, + CanAdoptFunc: canAdopt, + }, + controllerKind: controllerKind, + podControl: podControl, + } +} + +// ClaimPods tries to take ownership of a list of Pods. +// +// It will reconcile the following: +// * Adopt orphans if the selector matches. +// * Release owned objects if the selector no longer matches. +// +// Optional: If one or more filters are specified, a Pod will only be claimed if +// all filters return true. +// +// A non-nil error is returned if some form of reconciliation was attempted and +// failed. Usually, controllers should try again later in case reconciliation +// is still needed. +// +// If the error is nil, either the reconciliation succeeded, or no +// reconciliation was necessary. The list of Pods that you now own is returned. +func (m *PodControllerRefManager) ClaimPods(pods []*v1.Pod, filters ...func(*v1.Pod) bool) ([]*v1.Pod, error) { + var claimed []*v1.Pod + var errlist []error + + match := func(obj metav1.Object) bool { + pod := obj.(*v1.Pod) + // Check selector first so filters only run on potentially matching Pods. 
+ if !m.Selector.Matches(labels.Set(pod.Labels)) { + return false + } + for _, filter := range filters { + if !filter(pod) { + return false + } + } + return true + } + adopt := func(obj metav1.Object) error { + return m.AdoptPod(obj.(*v1.Pod)) + } + release := func(obj metav1.Object) error { + return m.ReleasePod(obj.(*v1.Pod)) + } + + for _, pod := range pods { + ok, err := m.ClaimObject(pod, match, adopt, release) + if err != nil { + errlist = append(errlist, err) + continue + } + if ok { + claimed = append(claimed, pod) + } + } + return claimed, utilerrors.NewAggregate(errlist) +} + +// AdoptPod sends a patch to take control of the pod. It returns the error if +// the patching fails. +func (m *PodControllerRefManager) AdoptPod(pod *v1.Pod) error { + if err := m.CanAdopt(); err != nil { + return fmt.Errorf("can't adopt Pod %v/%v (%v): %v", pod.Namespace, pod.Name, pod.UID, err) + } + // Note that ValidateOwnerReferences() will reject this patch if another + // OwnerReference exists with controller=true. + addControllerPatch := fmt.Sprintf( + `{"metadata":{"ownerReferences":[{"apiVersion":"%s","kind":"%s","name":"%s","uid":"%s","controller":true,"blockOwnerDeletion":true}],"uid":"%s"}}`, + m.controllerKind.GroupVersion(), m.controllerKind.Kind, + m.Controller.GetName(), m.Controller.GetUID(), pod.UID) + return m.podControl.PatchPod(pod.Namespace, pod.Name, []byte(addControllerPatch)) +} + +// ReleasePod sends a patch to free the pod from the control of the controller. +// It returns the error if the patching fails. 404 and 422 errors are ignored. 
+func (m *PodControllerRefManager) ReleasePod(pod *v1.Pod) error { + glog.V(2).Infof("patching pod %s_%s to remove its controllerRef to %s/%s:%s", + pod.Namespace, pod.Name, m.controllerKind.GroupVersion(), m.controllerKind.Kind, m.Controller.GetName()) + deleteOwnerRefPatch := fmt.Sprintf(`{"metadata":{"ownerReferences":[{"$patch":"delete","uid":"%s"}],"uid":"%s"}}`, m.Controller.GetUID(), pod.UID) + err := m.podControl.PatchPod(pod.Namespace, pod.Name, []byte(deleteOwnerRefPatch)) + if err != nil { + if errors.IsNotFound(err) { + // If the pod no longer exists, ignore it. + return nil + } + if errors.IsInvalid(err) { + // Invalid error will be returned in two cases: 1. the pod + // has no owner reference, 2. the uid of the pod doesn't + // match, which means the pod is deleted and then recreated. + // In both cases, the error can be ignored. + + // TODO: If the pod has owner references, but none of them + // has the owner.UID, server will silently ignore the patch. + // Investigate why. + return nil + } + } + return err +} + +type ServiceControllerRefManager struct { + BaseControllerRefManager + + controllerKind schema.GroupVersionKind + serviceControl ServiceControlInterface +} + +// NewServiceControllerRefManager returns a ServiceControllerRefManager that exposes +// methods to manage the controllerRef of services. +// +// The canAdopt() function can be used to perform a potentially expensive check +// (such as a live GET from the API server) prior to the first adoption. +// It will only be called (at most once) if an adoption is actually attempted. +// If canAdopt() returns a non-nil error, all adoptions will fail. +// +// NOTE: Once canAdopt() is called, it will not be called again by the same +// ServiceControllerRefManager instance. Create a new instance if it makes +// sense to check canAdopt() again (e.g. in a different sync pass). 
+func NewServiceControllerRefManager( + serviceControl ServiceControlInterface, + controller metav1.Object, + selector labels.Selector, + controllerKind schema.GroupVersionKind, + canAdopt func() error, +) *ServiceControllerRefManager { + return &ServiceControllerRefManager{ + BaseControllerRefManager: BaseControllerRefManager{ + Controller: controller, + Selector: selector, + CanAdoptFunc: canAdopt, + }, + controllerKind: controllerKind, + serviceControl: serviceControl, + } +} + +// ClaimServices tries to take ownership of a list of Services. +// +// It will reconcile the following: +// * Adopt orphans if the selector matches. +// * Release owned objects if the selector no longer matches. +// +// Optional: If one or more filters are specified, a Service will only be claimed if +// all filters return true. +// +// A non-nil error is returned if some form of reconciliation was attempted and +// failed. Usually, controllers should try again later in case reconciliation +// is still needed. +// +// If the error is nil, either the reconciliation succeeded, or no +// reconciliation was necessary. The list of Services that you now own is returned. +func (m *ServiceControllerRefManager) ClaimServices(services []*v1.Service, filters ...func(*v1.Service) bool) ([]*v1.Service, error) { + var claimed []*v1.Service + var errlist []error + + match := func(obj metav1.Object) bool { + service := obj.(*v1.Service) + // Check selector first so filters only run on potentially matching Services. 
+ if !m.Selector.Matches(labels.Set(service.Labels)) { + return false + } + for _, filter := range filters { + if !filter(service) { + return false + } + } + return true + } + adopt := func(obj metav1.Object) error { + return m.AdoptService(obj.(*v1.Service)) + } + release := func(obj metav1.Object) error { + return m.ReleaseService(obj.(*v1.Service)) + } + + for _, service := range services { + ok, err := m.ClaimObject(service, match, adopt, release) + if err != nil { + errlist = append(errlist, err) + continue + } + if ok { + claimed = append(claimed, service) + } + } + return claimed, utilerrors.NewAggregate(errlist) +} + +// AdoptService sends a patch to take control of the service. It returns the error if +// the patching fails. +func (m *ServiceControllerRefManager) AdoptService(service *v1.Service) error { + if err := m.CanAdopt(); err != nil { + return fmt.Errorf("can't adopt Service %v/%v (%v): %v", service.Namespace, service.Name, service.UID, err) + } + // Note that ValidateOwnerReferences() will reject this patch if another + // OwnerReference exists with controller=true. + addControllerPatch := fmt.Sprintf( + `{"metadata":{"ownerReferences":[{"apiVersion":"%s","kind":"%s","name":"%s","uid":"%s","controller":true,"blockOwnerDeletion":true}],"uid":"%s"}}`, + m.controllerKind.GroupVersion(), m.controllerKind.Kind, + m.Controller.GetName(), m.Controller.GetUID(), service.UID) + return m.serviceControl.PatchService(service.Namespace, service.Name, []byte(addControllerPatch)) +} + +// ReleaseService sends a patch to free the service from the control of the controller. +// It returns the error if the patching fails. 404 and 422 errors are ignored. 
+func (m *ServiceControllerRefManager) ReleaseService(service *v1.Service) error { + glog.V(2).Infof("patching service %s_%s to remove its controllerRef to %s/%s:%s", + service.Namespace, service.Name, m.controllerKind.GroupVersion(), m.controllerKind.Kind, m.Controller.GetName()) + deleteOwnerRefPatch := fmt.Sprintf(`{"metadata":{"ownerReferences":[{"$patch":"delete","uid":"%s"}],"uid":"%s"}}`, m.Controller.GetUID(), service.UID) + err := m.serviceControl.PatchService(service.Namespace, service.Name, []byte(deleteOwnerRefPatch)) + if err != nil { + if errors.IsNotFound(err) { + // If the service no longer exists, ignore it. + return nil + } + if errors.IsInvalid(err) { + // Invalid error will be returned in two cases: 1. the service + // has no owner reference, 2. the uid of the service doesn't + // match, which means the service is deleted and then recreated. + // In both cases, the error can be ignored. + + // TODO: If the service has owner references, but none of them + // has the owner.UID, server will silently ignore the patch. + // Investigate why. + return nil + } + } + return err +} + +// RecheckDeletionTimestamp returns a CanAdopt() function to recheck deletion. +// +// The CanAdopt() function calls getObject() to fetch the latest value, +// and denies adoption attempts if that object has a non-nil DeletionTimestamp. 
+func RecheckDeletionTimestamp(getObject func() (metav1.Object, error)) func() error { + return func() error { + obj, err := getObject() + if err != nil { + return fmt.Errorf("can't recheck DeletionTimestamp: %v", err) + } + if obj.GetDeletionTimestamp() != nil { + return fmt.Errorf("%v/%v has just been deleted at %v", obj.GetNamespace(), obj.GetName(), obj.GetDeletionTimestamp()) + } + return nil + } +} diff --git a/pkg/controller/controller_service.go b/pkg/controller/controller_service.go new file mode 100644 index 0000000000..31c9931386 --- /dev/null +++ b/pkg/controller/controller_service.go @@ -0,0 +1,258 @@ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package controller provides a Kubernetes controller for a TFJob resource. + +package controller + +import ( + "fmt" + "strings" + + log "github.com/sirupsen/logrus" + + "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + + tfv1alpha2 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha2" +) + +// reconcileServices checks and updates services for each given TFReplicaSpec. +// It will requeue the tfjob in case of an error while creating/deleting services. 
+func (tc *TFJobController) reconcileServices( + tfjob *tfv1alpha2.TFJob, + services []*v1.Service, + rtype tfv1alpha2.TFReplicaType, + spec *tfv1alpha2.TFReplicaSpec) error { + tfjobKey, err := KeyFunc(tfjob) + if err != nil { + utilruntime.HandleError(fmt.Errorf("Couldn't get key for tfjob object %#v: %v", tfjob, err)) + return err + } + + // Convert TFReplicaType to lower string. + rt := strings.ToLower(string(rtype)) + + // Get active services for this TFReplicaType. + activeServices := filterActiveServicesForTFReplicaType(services, rt) + + diff := len(activeServices) - int(*(spec.Replicas)) + + if diff < 0 { + // Need to create new services. + diffIndexes := getDiffServiceIndexes(activeServices, *spec.Replicas) + if diff+len(diffIndexes) != 0 { + // This should never happened. + return fmt.Errorf("diff is not equal to length of diffIndexes") + } + + expectationServicesKey := genExpectationServicesKey(tfjobKey, rt) + tc.expectations.ExpectCreations(expectationServicesKey, int(diff)) + + for _, index := range diffIndexes { + log.Infof("need to create new service: %s-%s", rt, index) + + // Create OwnerReference. + controllerRef := genOwnerReference(tfjob) + + // Append tfReplicaTypeLabel and tfReplicaIndexLabel labels. + labels := genLabels(tfjobKey) + labels[tfReplicaTypeLabel] = rt + labels[tfReplicaIndexLabel] = index + + service := &v1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: genGeneralName(tfjobKey, rt, index), + Labels: labels, + }, + Spec: v1.ServiceSpec{ + Selector: labels, + Ports: []v1.ServicePort{ + { + Name: genGeneralName(tfjobKey, rt, index), + Port: 2222, + }, + }, + }, + } + + err := tc.serviceControl.CreateServicesWithControllerRef(tfjob.Namespace, service, tfjob, controllerRef) + if err != nil && errors.IsTimeout(err) { + // Service is created but its initialization has timed out. + // If the initialization is successful eventually, the + // controller will observe the creation via the informer. 
+ // If the initialization fails, or if the pod keeps + // uninitialized for a long time, the informer will not + // receive any update, and the controller will create a new + // pod when the expectation expires. + return nil + } + return err + } + } else if diff > 0 { + // TODO(CPH): Need to delete pods. + } + + return nil +} + +// getDiffServiceIndexes checks and gets diff indexes from desired and current. +func getDiffServiceIndexes(activeServices []*v1.Service, replicas int32) []string { + desiredIndexes := make(map[string]string) + + for i := int32(0); i < replicas; i++ { + desiredIndexes[fmt.Sprintf("%d", i)] = noHit + } + + for _, service := range activeServices { + if _, ok := service.Labels[tfReplicaIndexLabel]; !ok { + continue + } + + index := service.Labels[tfReplicaIndexLabel] + + if _, ok := desiredIndexes[index]; ok { + desiredIndexes[index] = hit + } + } + + diffIndexes := []string{} + for index, hit := range desiredIndexes { + if hit == noHit { + diffIndexes = append(diffIndexes, index) + } + } + + return diffIndexes +} + +// getServicesForTFJob returns the set of services that this tfjob should manage. +// It also reconciles ControllerRef by adopting/orphaning. +// Note that the returned Pods are pointers into the cache. +func (tc *TFJobController) getServicesForTFJob(tfjob *tfv1alpha2.TFJob) ([]*v1.Service, error) { + tfjobKey, err := KeyFunc(tfjob) + if err != nil { + utilruntime.HandleError(fmt.Errorf("Couldn't get key for tfjob object %#v: %v", tfjob, err)) + return nil, err + } + + // Create selector + selector, err := metav1.LabelSelectorAsSelector(&metav1.LabelSelector{ + MatchLabels: genLabels(tfjobKey), + }) + + if err != nil { + return nil, fmt.Errorf("couldn't convert Job selector: %v", err) + } + // List all services to include those that don't match the selector anymore + // but have a ControllerRef pointing to this controller. 
+ services, err := tc.serviceLister.Services(tfjob.Namespace).List(labels.Everything()) + if err != nil { + return nil, err + } + + // If any adoptions are attempted, we should first recheck for deletion + // with an uncached quorum read sometime after listing Pods (see #42639). + canAdoptFunc := RecheckDeletionTimestamp(func() (metav1.Object, error) { + fresh, err := tc.tfJobClientSet.KubeflowV1alpha2().TFJobs(tfjob.Namespace).Get(tfjob.Name, metav1.GetOptions{}) + if err != nil { + return nil, err + } + if fresh.UID != tfjob.UID { + return nil, fmt.Errorf("original TFJob %v/%v is gone: got uid %v, wanted %v", tfjob.Namespace, tfjob.Name, fresh.UID, tfjob.UID) + } + return fresh, nil + }) + cm := NewServiceControllerRefManager(tc.serviceControl, tfjob, selector, controllerKind, canAdoptFunc) + return cm.ClaimServices(services) +} + +// filterActiveServicesForTFReplicaType returns service that have not terminated, +// and belong to a TFReplicaType. +func filterActiveServicesForTFReplicaType(services []*v1.Service, tfReplicaType string) []*v1.Service { + var result []*v1.Service + + tfReplicaSelector := &metav1.LabelSelector{ + MatchLabels: make(map[string]string), + } + + tfReplicaSelector.MatchLabels[tfReplicaTypeLabel] = tfReplicaType + + for _, service := range services { + selector, _ := metav1.LabelSelectorAsSelector(tfReplicaSelector) + if !selector.Matches(labels.Set(service.Labels)) { + continue + } + result = append(result, service) + } + return result +} + +func genExpectationServicesKey(tfjobKey, replicaType string) string { + return tfjobKey + "/" + strings.ToLower(replicaType) + "/services" +} + +// When a service is created, enqueue the controller that manages it and update its expectations. +func (tc *TFJobController) addService(obj interface{}) { + service := obj.(*v1.Service) + if service.DeletionTimestamp != nil { + // on a restart of the controller controller, it's possible a new service shows up in a state that + // is already pending deletion. 
Prevent the service from being a creation observation. + // tc.deleteService(service) + return + } + + // If it has a ControllerRef, that's all that matters. + if controllerRef := metav1.GetControllerOf(service); controllerRef != nil { + tfjob := tc.resolveControllerRef(service.Namespace, controllerRef) + if tfjob == nil { + return + } + + tfjobKey, err := KeyFunc(tfjob) + if err != nil { + return + } + + if _, ok := service.Labels[tfReplicaTypeLabel]; !ok { + log.Infof("This service maybe not created by tf-operator") + return + } + + rtype := service.Labels[tfReplicaTypeLabel] + expectationServicesKey := genExpectationServicesKey(tfjobKey, rtype) + + tc.expectations.CreationObserved(expectationServicesKey) + tc.enqueueTFJob(tfjob) + + return + } + +} + +// When a service is updated, figure out what tfjob/s manage it and wake them up. +// If the labels of the service have changed we need to awaken both the old +// and new replica set. old and cur must be *v1.Service types. +func (tc *TFJobController) updateService(old, cur interface{}) { + // TODO(CPH): handle this gracefully. +} + +// When a service is deleted, enqueue the tfjob that manages the service and update its expectations. +// obj could be an *v1.Service, or a DeletionFinalStateUnknown marker item. +func (tc *TFJobController) deleteService(obj interface{}) { + // TODO(CPH): handle this gracefully. +} diff --git a/pkg/controller/controller_tensorflow.go b/pkg/controller/controller_tensorflow.go new file mode 100644 index 0000000000..98b6296591 --- /dev/null +++ b/pkg/controller/controller_tensorflow.go @@ -0,0 +1,109 @@ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package controller provides a Kubernetes controller for a TFJob resource. + +package controller + +import ( + "encoding/json" + "fmt" + "strconv" + "strings" + + log "github.com/sirupsen/logrus" + + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + + tfv1alpha2 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha2" +) + +// TFConfig is a struct representing the distributed TensorFlow config. +// This struct is turned into an environment variable TF_CONFIG +// which is used by TensorFlow processes to configure themselves. +// https://cloud.google.com/ml-engine/docs/trainer-considerations#use_tf_config +type TFConfig struct { + // Cluster represents a TensorFlow ClusterSpec. + // See: https://www.tensorflow.org/api_docs/python/tf/train/ClusterSpec + Cluster ClusterSpec `json:"cluster"` + Task TaskSpec `json:"task"` +} + +// ClusterSpec represents a cluster TensorFlow specification. +// https://www.tensorflow.org/deploy/distributed#create_a_tftrainclusterspec_to_describe_the_cluster +// It is a map from job names to network addresses. 
+type ClusterSpec map[string][]string + +type TaskSpec struct { + Type string `json:"type"` + Index int `json:"index"` +} + +// genTFConfig will generate the environment variable TF_CONFIG +// { +// "cluster": { +// "ps": ["ps1:2222", "ps2:2222"], +// "worker": ["worker1:2222", "worker2:2222", "worker3:2222"] +// }, +// "task": { +// "type": "ps", +// "index": 1 +// }, +// } +// } +func genTFConfigJSONStr(tfjob *tfv1alpha2.TFJob, rtype, index string) string { + // Configure the TFCONFIG environment variable. + i, _ := strconv.ParseInt(index, 0, 32) + + tfConfig := TFConfig{ + Cluster: genClusterSpec(tfjob), + Task: TaskSpec{ + Type: rtype, + Index: int(i), + }, + } + + tfConfigJSONStr, err := json.Marshal(tfConfig) + if err != nil { + log.Errorf("TFJob: %v serializing tfConfig return error: %v", tfjob.Name, err) + return "" + } + + return string(tfConfigJSONStr) +} + +// genClusterSpec will generate ClusterSpec. +func genClusterSpec(tfjob *tfv1alpha2.TFJob) ClusterSpec { + tfjobKey, err := KeyFunc(tfjob) + if err != nil { + utilruntime.HandleError(fmt.Errorf("Couldn't get key for tfjob object %#v: %v", tfjob, err)) + return nil + } + + clusterSpec := make(ClusterSpec) + + for rtype, spec := range tfjob.Spec.TFReplicaSpecs { + rt := strings.ToLower(string(rtype)) + replicaNames := make([]string, 0, *spec.Replicas) + + for i := int32(0); i < *spec.Replicas; i++ { + host := genGeneralName(tfjobKey, rt, fmt.Sprintf("%d", i)) + ":" + defaultPortStr + replicaNames = append(replicaNames, host) + } + + clusterSpec[rt] = replicaNames + } + + return clusterSpec +} diff --git a/pkg/controller/controller_utils.go b/pkg/controller/controller_utils.go new file mode 100644 index 0000000000..1f541180e8 --- /dev/null +++ b/pkg/controller/controller_utils.go @@ -0,0 +1,719 @@ +/* +Copyright 2014 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Note(CPH): this file is copied form k8s.io/kubernetes/pkg/controller +// We should not import the huge package k8s.io/kubernetes/pkg + +package controller + +import ( + "fmt" + "sync" + "sync/atomic" + "time" + + "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/meta" + apimachineryvalidation "k8s.io/apimachinery/pkg/api/validation" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/clock" + "k8s.io/apimachinery/pkg/util/sets" + "k8s.io/apimachinery/pkg/util/wait" + clientset "k8s.io/client-go/kubernetes" + "k8s.io/client-go/tools/cache" + "k8s.io/client-go/tools/record" + + "github.com/golang/glog" +) + +const ( + // If a watch drops a delete event for a pod, it'll take this long + // before a dormant controller waiting for those packets is woken up anyway. It is + // specifically targeted at the case where some problem prevents an update + // of expectations, without it the controller could stay asleep forever. This should + // be set based on the expected latency of watch events. + // + // Currently a controller can service (create *and* observe the watch events for said + // creation) about 10 pods a second, so it takes about 1 min to service + // 500 pods. Just creation is limited to 20qps, and watching happens with ~10-30s + // latency/pod at the scale of 3000 pods over 100 nodes. + ExpectationsTimeout = 5 * time.Minute + // When batching pod creates, SlowStartInitialBatchSize is the size of the + // initial batch. 
The size of each successive batch is twice the size of + // the previous batch. For example, for a value of 1, batch sizes would be + // 1, 2, 4, 8, ... and for a value of 10, batch sizes would be + // 10, 20, 40, 80, ... Setting the value higher means that quota denials + // will result in more doomed API calls and associated event spam. Setting + // the value lower will result in more API call round trip periods for + // large batches. + // + // Given a number of pods to start "N": + // The number of doomed calls per sync once quota is exceeded is given by: + // min(N,SlowStartInitialBatchSize) + // The number of batches is given by: + // 1+floor(log_2(ceil(N/SlowStartInitialBatchSize))) + SlowStartInitialBatchSize = 1 +) + +var UpdateTaintBackoff = wait.Backoff{ + Steps: 5, + Duration: 100 * time.Millisecond, + Jitter: 1.0, +} + +var ( + KeyFunc = cache.DeletionHandlingMetaNamespaceKeyFunc +) + +type ResyncPeriodFunc func() time.Duration + +// Returns 0 for resyncPeriod in case resyncing is not needed. +func NoResyncPeriodFunc() time.Duration { + return 0 +} + +// StaticResyncPeriodFunc returns the resync period specified +func StaticResyncPeriodFunc(resyncPeriod time.Duration) ResyncPeriodFunc { + return func() time.Duration { + return resyncPeriod + } +} + +// Expectations are a way for controllers to tell the controller manager what they expect. 
eg: +// ControllerExpectations: { +// controller1: expects 2 adds in 2 minutes +// controller2: expects 2 dels in 2 minutes +// controller3: expects -1 adds in 2 minutes => controller3's expectations have already been met +// } +// +// Implementation: +// ControlleeExpectation = pair of atomic counters to track controllee's creation/deletion +// ControllerExpectationsStore = TTLStore + a ControlleeExpectation per controller +// +// * Once set expectations can only be lowered +// * A controller isn't synced till its expectations are either fulfilled, or expire +// * Controllers that don't set expectations will get woken up for every matching controllee + +// ExpKeyFunc to parse out the key from a ControlleeExpectation +var ExpKeyFunc = func(obj interface{}) (string, error) { + if e, ok := obj.(*ControlleeExpectations); ok { + return e.key, nil + } + return "", fmt.Errorf("Could not find key for obj %#v", obj) +} + +// ControllerExpectationsInterface is an interface that allows users to set and wait on expectations. +// Only abstracted out for testing. +// Warning: if using KeyFunc it is not safe to use a single ControllerExpectationsInterface with different +// types of controllers, because the keys might conflict across types. +type ControllerExpectationsInterface interface { + GetExpectations(controllerKey string) (*ControlleeExpectations, bool, error) + SatisfiedExpectations(controllerKey string) bool + DeleteExpectations(controllerKey string) + SetExpectations(controllerKey string, add, del int) error + ExpectCreations(controllerKey string, adds int) error + ExpectDeletions(controllerKey string, dels int) error + CreationObserved(controllerKey string) + DeletionObserved(controllerKey string) + RaiseExpectations(controllerKey string, add, del int) + LowerExpectations(controllerKey string, add, del int) +} + +// ControllerExpectations is a cache mapping controllers to what they expect to see before being woken up for a sync. 
+type ControllerExpectations struct { + cache.Store +} + +// GetExpectations returns the ControlleeExpectations of the given controller. +func (r *ControllerExpectations) GetExpectations(controllerKey string) (*ControlleeExpectations, bool, error) { + if exp, exists, err := r.GetByKey(controllerKey); err == nil && exists { + return exp.(*ControlleeExpectations), true, nil + } else { + return nil, false, err + } +} + +// DeleteExpectations deletes the expectations of the given controller from the TTLStore. +func (r *ControllerExpectations) DeleteExpectations(controllerKey string) { + if exp, exists, err := r.GetByKey(controllerKey); err == nil && exists { + if err := r.Delete(exp); err != nil { + glog.V(2).Infof("Error deleting expectations for controller %v: %v", controllerKey, err) + } + } +} + +// SatisfiedExpectations returns true if the required adds/dels for the given controller have been observed. +// Add/del counts are established by the controller at sync time, and updated as controllees are observed by the controller +// manager. +func (r *ControllerExpectations) SatisfiedExpectations(controllerKey string) bool { + if exp, exists, err := r.GetExpectations(controllerKey); exists { + if exp.Fulfilled() { + glog.V(4).Infof("Controller expectations fulfilled %#v", exp) + return true + } else if exp.isExpired() { + glog.V(4).Infof("Controller expectations expired %#v", exp) + return true + } else { + glog.V(4).Infof("Controller still waiting on expectations %#v", exp) + return false + } + } else if err != nil { + glog.V(2).Infof("Error encountered while checking expectations %#v, forcing sync", err) + } else { + // When a new controller is created, it doesn't have expectations. + // When it doesn't see expected watch events for > TTL, the expectations expire. + // - In this case it wakes up, creates/deletes controllees, and sets expectations again. 
+ // When it has satisfied expectations and no controllees need to be created/destroyed > TTL, the expectations expire. + // - In this case it continues without setting expectations till it needs to create/delete controllees. + glog.V(4).Infof("Controller %v either never recorded expectations, or the ttl expired.", controllerKey) + } + // Trigger a sync if we either encountered and error (which shouldn't happen since we're + // getting from local store) or this controller hasn't established expectations. + return true +} + +// TODO: Extend ExpirationCache to support explicit expiration. +// TODO: Make this possible to disable in tests. +// TODO: Support injection of clock. +func (exp *ControlleeExpectations) isExpired() bool { + return clock.RealClock{}.Since(exp.timestamp) > ExpectationsTimeout +} + +// SetExpectations registers new expectations for the given controller. Forgets existing expectations. +func (r *ControllerExpectations) SetExpectations(controllerKey string, add, del int) error { + exp := &ControlleeExpectations{add: int64(add), del: int64(del), key: controllerKey, timestamp: clock.RealClock{}.Now()} + glog.V(4).Infof("Setting expectations %#v", exp) + return r.Add(exp) +} + +func (r *ControllerExpectations) ExpectCreations(controllerKey string, adds int) error { + return r.SetExpectations(controllerKey, adds, 0) +} + +func (r *ControllerExpectations) ExpectDeletions(controllerKey string, dels int) error { + return r.SetExpectations(controllerKey, 0, dels) +} + +// Decrements the expectation counts of the given controller. +func (r *ControllerExpectations) LowerExpectations(controllerKey string, add, del int) { + if exp, exists, err := r.GetExpectations(controllerKey); err == nil && exists { + exp.Add(int64(-add), int64(-del)) + // The expectations might've been modified since the update on the previous line. + glog.V(4).Infof("Lowered expectations %#v", exp) + } +} + +// Increments the expectation counts of the given controller. 
+func (r *ControllerExpectations) RaiseExpectations(controllerKey string, add, del int) {
+	exp, exists, err := r.GetExpectations(controllerKey)
+	if err != nil || !exists {
+		// Nothing recorded (or the lookup failed): there is nothing to raise.
+		return
+	}
+	exp.Add(int64(add), int64(del))
+	// The expectations might've been modified since the update on the previous line.
+	glog.V(4).Infof("Raised expectations %#v", exp)
+}
+
+// CreationObserved lowers the `add` expectation count of the given controller by one.
+func (r *ControllerExpectations) CreationObserved(controllerKey string) {
+	r.LowerExpectations(controllerKey, 1, 0)
+}
+
+// DeletionObserved lowers the `del` expectation count of the given controller by one.
+func (r *ControllerExpectations) DeletionObserved(controllerKey string) {
+	r.LowerExpectations(controllerKey, 0, 1)
+}
+
+// Expectations are either fulfilled, or expire naturally.
+type Expectations interface {
+	Fulfilled() bool
+}
+
+// ControlleeExpectations track controllee creates/deletes.
+type ControlleeExpectations struct {
+	// Important: Since these two int64 fields are using sync/atomic, they have to be at the top of the struct due to a bug on 32-bit platforms
+	// See: https://golang.org/pkg/sync/atomic/ for more information
+	add int64
+	del int64
+	// key is the controller key this record is stored under.
+	key string
+	// timestamp is when the record was created; isExpired measures age from it.
+	timestamp time.Time
+}
+
+// Add adjusts the add and del counters by the given (possibly negative)
+// deltas. Each counter is updated atomically, but the pair is not updated as
+// one unit.
+func (e *ControlleeExpectations) Add(add, del int64) {
+	atomic.AddInt64(&e.add, add)
+	atomic.AddInt64(&e.del, del)
+}
+
+// Fulfilled returns true if this expectation has been fulfilled, i.e. neither
+// counter is still positive.
+func (e *ControlleeExpectations) Fulfilled() bool {
+	// TODO: think about why this line being atomic doesn't matter
+	if atomic.LoadInt64(&e.add) > 0 {
+		return false
+	}
+	return atomic.LoadInt64(&e.del) <= 0
+}
+
+// GetExpectations returns the current add and del counts, in that order.
+func (e *ControlleeExpectations) GetExpectations() (int64, int64) {
+	adds := atomic.LoadInt64(&e.add)
+	dels := atomic.LoadInt64(&e.del)
+	return adds, dels
+}
+
+// NewControllerExpectations returns a store for ControllerExpectations.
+func NewControllerExpectations() *ControllerExpectations {
+	return &ControllerExpectations{Store: cache.NewStore(ExpKeyFunc)}
+}
+
+// UIDSetKeyFunc is the store key function for UIDSet entries: it returns the
+// set's key, or an error when obj is not a *UIDSet.
+var UIDSetKeyFunc = func(obj interface{}) (string, error) {
+	set, isSet := obj.(*UIDSet)
+	if !isSet {
+		return "", fmt.Errorf("Could not find key for obj %#v", obj)
+	}
+	return set.key, nil
+}
+
+// UIDSet holds a key and a set of UIDs. Used by the
+// UIDTrackingControllerExpectations to remember which UID it has seen/still
+// waiting for.
+type UIDSet struct {
+	sets.String
+	key string
+}
+
+// UIDTrackingControllerExpectations tracks the UID of the pods it deletes.
+// This cache is needed over plain old expectations to safely handle graceful
+// deletion. The desired behavior is to treat an update that sets the
+// DeletionTimestamp on an object as a delete. To do so consistently, one needs
+// to remember the expected deletes so they aren't double counted.
+// TODO: Track creates as well (#22599)
+type UIDTrackingControllerExpectations struct {
+	ControllerExpectationsInterface
+	// TODO: There is a much nicer way to do this that involves a single store,
+	// a lock per entry, and a ControlleeExpectationsInterface type.
+	uidStoreLock sync.Mutex
+	// Store used for the UIDs associated with any expectation tracked via the
+	// ControllerExpectationsInterface.
+	uidStore cache.Store
+}
+
+// GetUIDs is a convenience method to avoid exposing the set of expected uids.
+// It returns nil when nothing is stored for controllerKey or the lookup fails.
+// The returned set is not thread safe, all modifications must be made holding
+// the uidStoreLock.
+func (u *UIDTrackingControllerExpectations) GetUIDs(controllerKey string) sets.String {
+	entry, found, err := u.uidStore.GetByKey(controllerKey)
+	if err != nil || !found {
+		return nil
+	}
+	return entry.(*UIDSet).String
+}
+
+// ExpectDeletions records expectations for the given deleteKeys, against the given controller.
+func (u *UIDTrackingControllerExpectations) ExpectDeletions(rcKey string, deletedKeys []string) error {
+	u.uidStoreLock.Lock()
+	defer u.uidStoreLock.Unlock()
+
+	// A non-empty set left over from an earlier call is about to be replaced.
+	if pending := u.GetUIDs(rcKey); pending.Len() != 0 {
+		glog.Errorf("Clobbering existing delete keys: %+v", pending)
+	}
+	expectedUIDs := sets.NewString(deletedKeys...)
+	glog.V(4).Infof("Controller %v waiting on deletions for: %+v", rcKey, deletedKeys)
+	if err := u.uidStore.Add(&UIDSet{String: expectedUIDs, key: rcKey}); err != nil {
+		return err
+	}
+	// The count comes from the set, so duplicate keys are expected only once.
+	return u.ControllerExpectationsInterface.ExpectDeletions(rcKey, expectedUIDs.Len())
+}
+
+// DeletionObserved records the given deleteKey as a deletion, for the given rc.
+// Keys that are not in the expected set (including ones already observed) are
+// ignored, so a deletion is never counted twice.
+func (u *UIDTrackingControllerExpectations) DeletionObserved(rcKey, deleteKey string) {
+	u.uidStoreLock.Lock()
+	defer u.uidStoreLock.Unlock()
+
+	expected := u.GetUIDs(rcKey)
+	if !expected.Has(deleteKey) {
+		return
+	}
+	glog.V(4).Infof("Controller %v received delete for pod %v", rcKey, deleteKey)
+	u.ControllerExpectationsInterface.DeletionObserved(rcKey)
+	expected.Delete(deleteKey)
+}
+
+// DeleteExpectations deletes the UID set and invokes DeleteExpectations on the
+// underlying ControllerExpectationsInterface.
+func (u *UIDTrackingControllerExpectations) DeleteExpectations(rcKey string) {
+	u.uidStoreLock.Lock()
+	defer u.uidStoreLock.Unlock()
+
+	u.ControllerExpectationsInterface.DeleteExpectations(rcKey)
+	entry, found, err := u.uidStore.GetByKey(rcKey)
+	if err != nil || !found {
+		return
+	}
+	if delErr := u.uidStore.Delete(entry); delErr != nil {
+		glog.V(2).Infof("Error deleting uid expectations for controller %v: %v", rcKey, delErr)
+	}
+}
+
+// NewUIDTrackingControllerExpectations returns a wrapper around
+// ControllerExpectations that is aware of deleteKeys.
+func NewUIDTrackingControllerExpectations(ce ControllerExpectationsInterface) *UIDTrackingControllerExpectations {
+	return &UIDTrackingControllerExpectations{
+		ControllerExpectationsInterface: ce,
+		uidStore:                        cache.NewStore(UIDSetKeyFunc),
+	}
+}
+
+// Reasons for pod and service events
+const (
+	// FailedCreatePodReason is added in an event and in a replica set condition
+	// when a pod for a replica set is failed to be created.
+	FailedCreatePodReason = "FailedCreate"
+	// SuccessfulCreatePodReason is added in an event when a pod for a replica set
+	// is successfully created.
+	SuccessfulCreatePodReason = "SuccessfulCreate"
+	// FailedDeletePodReason is added in an event and in a replica set condition
+	// when a pod for a replica set is failed to be deleted.
+	FailedDeletePodReason = "FailedDelete"
+	// SuccessfulDeletePodReason is added in an event when a pod for a replica set
+	// is successfully deleted.
+	SuccessfulDeletePodReason = "SuccessfulDelete"
+
+	// FailedCreateServiceReason is the event reason used when creating a service fails.
+	FailedCreateServiceReason = "FailedCreateService"
+	// SuccessfulCreateServiceReason is the event reason used when a service is created.
+	SuccessfulCreateServiceReason = "SuccessfulCreateService"
+)
+
+// PodControlInterface is an interface that knows how to add or delete pods
+// created as an interface to allow testing.
+type PodControlInterface interface {
+	// CreatePods creates new pods according to the spec.
+	CreatePods(namespace string, template *v1.PodTemplateSpec, object runtime.Object) error
+	// CreatePodsOnNode creates a new pod according to the spec on the specified node,
+	// and sets the ControllerRef.
+	CreatePodsOnNode(nodeName, namespace string, template *v1.PodTemplateSpec, object runtime.Object, controllerRef *metav1.OwnerReference) error
+	// CreatePodsWithControllerRef creates new pods according to the spec, and sets object as the pod's controller.
+	CreatePodsWithControllerRef(namespace string, template *v1.PodTemplateSpec, object runtime.Object, controllerRef *metav1.OwnerReference) error
+	// DeletePod deletes the pod identified by podID.
+	DeletePod(namespace string, podID string, object runtime.Object) error
+	// PatchPod patches the pod.
+	PatchPod(namespace, name string, data []byte) error
+}
+
+// RealPodControl is the default implementation of PodControlInterface.
+type RealPodControl struct {
+	KubeClient clientset.Interface
+	Recorder   record.EventRecorder
+}
+
+var _ PodControlInterface = &RealPodControl{}
+
+// getPodsLabelSet returns a fresh copy of the template's labels.
+func getPodsLabelSet(template *v1.PodTemplateSpec) labels.Set {
+	copied := make(labels.Set, len(template.Labels))
+	for key, value := range template.Labels {
+		copied[key] = value
+	}
+	return copied
+}
+
+// getPodsFinalizers returns a fresh, non-nil copy of the template's finalizers.
+func getPodsFinalizers(template *v1.PodTemplateSpec) []string {
+	return append([]string{}, template.Finalizers...)
+}
+
+// getPodsAnnotationSet returns a fresh copy of the template's annotations.
+func getPodsAnnotationSet(template *v1.PodTemplateSpec) labels.Set {
+	copied := make(labels.Set, len(template.Annotations))
+	for key, value := range template.Annotations {
+		copied[key] = value
+	}
+	return copied
+}
+
+// validatePodName returns the DNS-subdomain validation messages for name;
+// an empty result means the name is acceptable.
+func validatePodName(name string, prefix bool) []string {
+	return apimachineryvalidation.NameIsDNSSubdomain(name, prefix)
+}
+
+// getPodsPrefix derives the GenerateName prefix for pods of a controller.
+func getPodsPrefix(controllerName string) string {
+	// use the dash (if the name isn't too long) to make the pod name a bit prettier
+	dashed := controllerName + "-"
+	if len(validatePodName(dashed, true)) == 0 {
+		return dashed
+	}
+	return controllerName
+}
+
+// validateControllerRef checks that controllerRef is non-nil, names an
+// APIVersion and Kind, and has both Controller and BlockOwnerDeletion set to
+// true. The first failed check is returned as an error.
+func validateControllerRef(controllerRef *metav1.OwnerReference) error {
+	// Cases are evaluated top to bottom, so the nil check guards the field reads below it.
+	switch {
+	case controllerRef == nil:
+		return fmt.Errorf("controllerRef is nil")
+	case len(controllerRef.APIVersion) == 0:
+		return fmt.Errorf("controllerRef has empty APIVersion")
+	case len(controllerRef.Kind) == 0:
+		return fmt.Errorf("controllerRef has empty Kind")
+	case controllerRef.Controller == nil || !*controllerRef.Controller:
+		return fmt.Errorf("controllerRef.Controller is not set to true")
+	case controllerRef.BlockOwnerDeletion == nil || !*controllerRef.BlockOwnerDeletion:
+		return fmt.Errorf("controllerRef.BlockOwnerDeletion is not set")
+	}
+	return nil
+}
+
+// CreatePods creates a pod from template in namespace, with no node pinning
+// and no controller reference.
+func (r RealPodControl) CreatePods(namespace string, template *v1.PodTemplateSpec, object runtime.Object) error {
+	return r.createPods("", namespace, template, object, nil)
+}
+
+// CreatePodsWithControllerRef creates a pod from template and attaches
+// controllerRef to it; controllerRef must pass validateControllerRef.
+func (r RealPodControl) CreatePodsWithControllerRef(namespace string, template *v1.PodTemplateSpec, controllerObject runtime.Object, controllerRef *metav1.OwnerReference) error {
+	if err := validateControllerRef(controllerRef); err != nil {
+		return err
+	}
+	return r.createPods("", namespace, template, controllerObject, controllerRef)
+}
+
+// CreatePodsOnNode is CreatePodsWithControllerRef with the pod's
+// Spec.NodeName set to nodeName.
+func (r RealPodControl) CreatePodsOnNode(nodeName, namespace string, template *v1.PodTemplateSpec, object runtime.Object, controllerRef *metav1.OwnerReference) error {
+	if err := validateControllerRef(controllerRef); err != nil {
+		return err
+	}
+	return r.createPods(nodeName, namespace, template, object, controllerRef)
+}
+
+// PatchPod applies data to the named pod as a strategic merge patch.
+func (r RealPodControl) PatchPod(namespace, name string, data []byte) error {
+	_, err := r.KubeClient.CoreV1().Pods(namespace).Patch(name, types.StrategicMergePatchType, data)
+	return err
+}
+
+// GetPodFromTemplate builds a Pod from template: labels, annotations and
+// finalizers are copied, GenerateName is derived from parentObject's name, the
+// spec is deep-copied, and controllerRef (when non-nil) becomes an owner
+// reference. It fails when parentObject carries no object metadata.
+func GetPodFromTemplate(template *v1.PodTemplateSpec, parentObject runtime.Object, controllerRef *metav1.OwnerReference) (*v1.Pod, error) {
+	podLabels := getPodsLabelSet(template)
+	podFinalizers := getPodsFinalizers(template)
+	podAnnotations := getPodsAnnotationSet(template)
+	parentMeta, err := meta.Accessor(parentObject)
+	if err != nil {
+		return nil, fmt.Errorf("parentObject does not have ObjectMeta, %v", err)
+	}
+
+	pod := &v1.Pod{
+		ObjectMeta: metav1.ObjectMeta{
+			Labels:       podLabels,
+			Annotations:  podAnnotations,
+			GenerateName: getPodsPrefix(parentMeta.GetName()),
+			Finalizers:   podFinalizers,
+		},
+		Spec: *template.Spec.DeepCopy(),
+	}
+	if controllerRef != nil {
+		pod.OwnerReferences = append(pod.OwnerReferences, *controllerRef)
+	}
+	return pod, nil
+}
+
+// createPods builds the pod, optionally pins it to nodeName, refuses to create
+// a pod without labels, submits it, and records a warning or normal event on
+// object for the outcome.
+func (r RealPodControl) createPods(nodeName, namespace string, template *v1.PodTemplateSpec, object runtime.Object, controllerRef *metav1.OwnerReference) error {
+	pod, err := GetPodFromTemplate(template, object, controllerRef)
+	if err != nil {
+		return err
+	}
+	if nodeName != "" {
+		pod.Spec.NodeName = nodeName
+	}
+	if labels.Set(pod.Labels).AsSelectorPreValidated().Empty() {
+		return fmt.Errorf("unable to create pods, no labels")
+	}
+
+	created, err := r.KubeClient.CoreV1().Pods(namespace).Create(pod)
+	if err != nil {
+		r.Recorder.Eventf(object, v1.EventTypeWarning, FailedCreatePodReason, "Error creating: %v", err)
+		return err
+	}
+
+	parentMeta, err := meta.Accessor(object)
+	if err != nil {
+		// The pod already exists at this point, so this is logged rather than returned.
+		glog.Errorf("parentObject does not have ObjectMeta, %v", err)
+		return nil
+	}
+	glog.V(4).Infof("Controller %v created pod %v", parentMeta.GetName(), created.Name)
+	r.Recorder.Eventf(object, v1.EventTypeNormal, SuccessfulCreatePodReason, "Created pod: %v", created.Name)
+	return nil
+}
+
+// DeletePod deletes pod podID in namespace and records a warning or normal
+// event on object for the outcome.
+func (r RealPodControl) DeletePod(namespace string, podID string, object runtime.Object) error {
+	parentMeta, err := meta.Accessor(object)
+	if err != nil {
+		return fmt.Errorf("object does not have ObjectMeta, %v", err)
+	}
+	glog.V(2).Infof("Controller %v deleting pod %v/%v", parentMeta.GetName(), namespace, podID)
+	if err := r.KubeClient.CoreV1().Pods(namespace).Delete(podID, nil); err != nil {
+		r.Recorder.Eventf(object, v1.EventTypeWarning, FailedDeletePodReason, "Error deleting: %v", err)
+		return fmt.Errorf("unable to delete pods: %v", err)
+	}
+	r.Recorder.Eventf(object, v1.EventTypeNormal, SuccessfulDeletePodReason, "Deleted pod: %v", podID)
+	return nil
+}
+
+// FakePodControl is a PodControlInterface that records the calls made to it
+// instead of talking to an API server. Err, when set, is returned by every
+// call after the call has been recorded. A non-zero CreateLimit makes create
+// calls beyond that count fail without being recorded.
+type FakePodControl struct {
+	sync.Mutex
+	Templates       []v1.PodTemplateSpec
+	ControllerRefs  []metav1.OwnerReference
+	DeletePodName   []string
+	Patches         [][]byte
+	Err             error
+	CreateLimit     int
+	CreateCallCount int
+}
+
+var _ PodControlInterface = &FakePodControl{}
+
+// PatchPod records data and returns f.Err.
+func (f *FakePodControl) PatchPod(namespace, name string, data []byte) error {
+	f.Lock()
+	defer f.Unlock()
+	f.Patches = append(f.Patches, data)
+	return f.Err
+}
+
+// CreatePods counts the call, enforces CreateLimit, records the template and
+// returns f.Err.
+func (f *FakePodControl) CreatePods(namespace string, spec *v1.PodTemplateSpec, object runtime.Object) error {
+	f.Lock()
+	defer f.Unlock()
+	f.CreateCallCount++
+	if f.CreateLimit != 0 && f.CreateCallCount > f.CreateLimit {
+		return fmt.Errorf("Not creating pod, limit %d already reached (create call %d)", f.CreateLimit, f.CreateCallCount)
+	}
+	f.Templates = append(f.Templates, *spec)
+	return f.Err
+}
+
+// CreatePodsWithControllerRef behaves like CreatePods and also records the
+// controller reference.
+func (f *FakePodControl) CreatePodsWithControllerRef(namespace string, spec *v1.PodTemplateSpec, object runtime.Object, controllerRef *metav1.OwnerReference) error {
+	f.Lock()
+	defer f.Unlock()
+	f.CreateCallCount++
+	if f.CreateLimit != 0 && f.CreateCallCount > f.CreateLimit {
+		return fmt.Errorf("Not creating pod, limit %d already reached (create call %d)", f.CreateLimit, f.CreateCallCount)
+	}
+	f.Templates = append(f.Templates, *spec)
+	f.ControllerRefs = append(f.ControllerRefs, *controllerRef)
+	return f.Err
+}
+
+// CreatePodsOnNode behaves like CreatePodsWithControllerRef; nodeName is not
+// recorded.
+func (f *FakePodControl) CreatePodsOnNode(nodeName, namespace string, template *v1.PodTemplateSpec, object runtime.Object, controllerRef *metav1.OwnerReference) error {
+	f.Lock()
+	defer f.Unlock()
+	f.CreateCallCount++
+	if f.CreateLimit != 0 && f.CreateCallCount > f.CreateLimit {
+		return fmt.Errorf("Not creating pod, limit %d already reached (create call %d)", f.CreateLimit, f.CreateCallCount)
+	}
+	f.Templates = append(f.Templates, *template)
+	f.ControllerRefs = append(f.ControllerRefs, *controllerRef)
+	return f.Err
+}
+
+// DeletePod records podID and returns f.Err.
+func (f *FakePodControl) DeletePod(namespace string, podID string, object runtime.Object) error {
+	f.Lock()
+	defer f.Unlock()
+	f.DeletePodName = append(f.DeletePodName, podID)
+	return f.Err
+}
+
+// Clear resets every recorded call and the create counters. Err is left as it
+// was.
+func (f *FakePodControl) Clear() {
+	f.Lock()
+	defer f.Unlock()
+	f.Templates = []v1.PodTemplateSpec{}
+	f.ControllerRefs = []metav1.OwnerReference{}
+	f.DeletePodName = []string{}
+	f.Patches = [][]byte{}
+	f.CreateCallCount = 0
+	f.CreateLimit = 0
+}
+
+// FilterActivePods returns pods that have not terminated.
+func FilterActivePods(pods []*v1.Pod) []*v1.Pod {
+	var active []*v1.Pod
+	for _, pod := range pods {
+		if !IsPodActive(pod) {
+			glog.V(4).Infof("Ignoring inactive pod %v/%v in state %v, deletion time %v",
+				pod.Namespace, pod.Name, pod.Status.Phase, pod.DeletionTimestamp)
+			continue
+		}
+		active = append(active, pod)
+	}
+	return active
+}
+
+// IsPodActive reports whether p is neither Succeeded nor Failed and has no
+// deletion timestamp.
+func IsPodActive(p *v1.Pod) bool {
+	switch p.Status.Phase {
+	case v1.PodSucceeded, v1.PodFailed:
+		return false
+	}
+	return p.DeletionTimestamp == nil
+}
+
+// ServiceControlInterface is an interface that knows how to add or delete Services
+// created as an interface to allow testing.
+type ServiceControlInterface interface {
+	// CreateServices creates new Services according to the spec.
+	CreateServices(namespace string, service *v1.Service, object runtime.Object) error
+	// CreateServicesWithControllerRef creates new services according to the spec, and sets object as the service's controller.
+	CreateServicesWithControllerRef(namespace string, service *v1.Service, object runtime.Object, controllerRef *metav1.OwnerReference) error
+	// PatchService patches the service.
+	PatchService(namespace, name string, data []byte) error
+}
+
+// RealServiceControl is the default implementation of ServiceControlInterface.
+type RealServiceControl struct {
+	KubeClient clientset.Interface
+	Recorder   record.EventRecorder
+}
+
+var _ ServiceControlInterface = &RealServiceControl{}
+
+// PatchService applies data to the named service as a strategic merge patch.
+func (r RealServiceControl) PatchService(namespace, name string, data []byte) error {
+	_, err := r.KubeClient.CoreV1().Services(namespace).Patch(name, types.StrategicMergePatchType, data)
+	return err
+}
+
+// CreateServices creates service in namespace without a controller reference.
+// Events about the outcome are recorded on object.
+func (r RealServiceControl) CreateServices(namespace string, service *v1.Service, object runtime.Object) error {
+	return r.createServices(namespace, service, object, nil)
+}
+
+// CreateServicesWithControllerRef creates service in namespace and sets
+// controllerRef as an owner reference on it. controllerRef must pass
+// validateControllerRef, otherwise nothing is created.
+func (r RealServiceControl) CreateServicesWithControllerRef(namespace string, service *v1.Service, controllerObject runtime.Object, controllerRef *metav1.OwnerReference) error {
+	if err := validateControllerRef(controllerRef); err != nil {
+		return err
+	}
+	return r.createServices(namespace, service, controllerObject, controllerRef)
+}
+
+// createServices refuses to create a service without labels, attaches
+// controllerRef (when non-nil) as an owner reference, submits the service, and
+// records a warning or normal event on object for the outcome.
+func (r RealServiceControl) createServices(namespace string, service *v1.Service, object runtime.Object, controllerRef *metav1.OwnerReference) error {
+	if labels.Set(service.Labels).AsSelectorPreValidated().Empty() {
+		return fmt.Errorf("unable to create Services, no labels")
+	}
+
+	// Previously controllerRef was validated by the caller and then dropped, so
+	// services were created without an owner. Attach it to a copy so the
+	// caller's object is not mutated.
+	toCreate := service.DeepCopy()
+	if controllerRef != nil {
+		alreadyOwned := false
+		for i := range toCreate.OwnerReferences {
+			if toCreate.OwnerReferences[i].UID == controllerRef.UID {
+				// The caller already set this owner; do not add it twice.
+				alreadyOwned = true
+				break
+			}
+		}
+		if !alreadyOwned {
+			toCreate.OwnerReferences = append(toCreate.OwnerReferences, *controllerRef)
+		}
+	}
+
+	newService, err := r.KubeClient.CoreV1().Services(namespace).Create(toCreate)
+	if err != nil {
+		r.Recorder.Eventf(object, v1.EventTypeWarning, FailedCreateServiceReason, "Error creating: %v", err)
+		return fmt.Errorf("unable to create services: %v", err)
+	}
+
+	accessor, err := meta.Accessor(object)
+	if err != nil {
+		// The service already exists at this point, so this is logged rather than returned.
+		glog.Errorf("parentObject does not have ObjectMeta, %v", err)
+		return nil
+	}
+	glog.V(4).Infof("Controller %v created service %v", accessor.GetName(), newService.Name)
+	r.Recorder.Eventf(object, v1.EventTypeNormal, SuccessfulCreateServiceReason, "Created service: %v", newService.Name)
+
+	return nil
+}
diff --git a/pkg/util/signals/signal.go b/pkg/util/signals/signal.go
new file mode 100644
index 0000000000..6bddfddb4f
--- /dev/null
+++ b/pkg/util/signals/signal.go
@@ -0,0 +1,43 @@
+/*
+Copyright 2017 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package signals
+
+import (
+	"os"
+	"os/signal"
+)
+
+var onlyOneSignalHandler = make(chan struct{}) // closed by SetupSignalHandler; closing it a second time panics
+
+// SetupSignalHandler registers for the signals in shutdownSignals. The returned stop
+// channel is closed when the first such signal arrives. If a second signal is caught,
+// the program is terminated with exit code 1. It must be called at most once.
+func SetupSignalHandler() (stopCh <-chan struct{}) {
+	close(onlyOneSignalHandler) // panics when called twice
+
+	stop := make(chan struct{})
+	c := make(chan os.Signal, 2) // buffered: signal.Notify does not block when sending to c
+	signal.Notify(c, shutdownSignals...)
+	go func() {
+		<-c // first signal: ask callers to shut down
+		close(stop)
+		<-c
+		os.Exit(1) // second signal. Exit directly.
+	}()
+
+	return stop
+}
diff --git a/pkg/util/signals/signal_posix.go b/pkg/util/signals/signal_posix.go
new file mode 100644
index 0000000000..9bdb4e7418
--- /dev/null
+++ b/pkg/util/signals/signal_posix.go
@@ -0,0 +1,26 @@
+// +build !windows
+
+/*
+Copyright 2017 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/ + +package signals + +import ( + "os" + "syscall" +) + +var shutdownSignals = []os.Signal{os.Interrupt, syscall.SIGTERM} diff --git a/pkg/util/signals/signal_windows.go b/pkg/util/signals/signal_windows.go new file mode 100644 index 0000000000..4907d573fe --- /dev/null +++ b/pkg/util/signals/signal_windows.go @@ -0,0 +1,23 @@ +/* +Copyright 2017 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package signals + +import ( + "os" +) + +var shutdownSignals = []os.Signal{os.Interrupt} From 8ac85a8d6a874a9597902eb834278ccf67f8b4fc Mon Sep 17 00:00:00 2001 From: Penghao Cen Date: Thu, 15 Mar 2018 16:24:14 +0800 Subject: [PATCH 06/24] Update examples for test --- examples/crd/crd.yaml | 2 +- examples/tf_job.yaml | 32 +++++++++++++------------------- 2 files changed, 14 insertions(+), 20 deletions(-) diff --git a/examples/crd/crd.yaml b/examples/crd/crd.yaml index fc2bc45a2a..93fdf1e426 100644 --- a/examples/crd/crd.yaml +++ b/examples/crd/crd.yaml @@ -4,7 +4,7 @@ metadata: name: tfjobs.kubeflow.org spec: group: kubeflow.org - version: v1alpha1 + version: v1alpha2 names: kind: TFJob singular: tfjob diff --git a/examples/tf_job.yaml b/examples/tf_job.yaml index 4b274d03df..7c4017c185 100644 --- a/examples/tf_job.yaml +++ b/examples/tf_job.yaml @@ -1,30 +1,24 @@ -apiVersion: "kubeflow.org/v1alpha1" +apiVersion: "kubeflow.org/v1alpha2" kind: "TFJob" metadata: - name: "example-job" + name: "example-job-1" spec: - replicaSpecs: - - replicas: 1 - tfReplicaType: MASTER 
+ tfReplicaSpecs: + PS: + replicas: 2 template: spec: containers: - - image: gcr.io/tf-on-k8s-dogfood/tf_sample:dc944ff - name: tensorflow + - name: ps-busybox + image: busybox + command: ["sleep", "30000"] restartPolicy: OnFailure - - replicas: 1 - tfReplicaType: WORKER + Worker: + replicas: 4 template: spec: containers: - - image: gcr.io/tf-on-k8s-dogfood/tf_sample:dc944ff - name: tensorflow - restartPolicy: OnFailure - - replicas: 2 - tfReplicaType: PS - template: - spec: - containers: - - image: gcr.io/tf-on-k8s-dogfood/tf_sample:dc944ff - name: tensorflow + - name: worker-busybox + image: busybox + command: ["sleep", "30000"] restartPolicy: OnFailure From 79d55c441110e52841e0392df0c04691428005b3 Mon Sep 17 00:00:00 2001 From: Penghao Cen Date: Thu, 15 Mar 2018 18:09:59 +0800 Subject: [PATCH 07/24] Fix travis-ci error --- pkg/apis/tensorflow/v1alpha1/defaults.go | 58 --- pkg/apis/tensorflow/v1alpha1/defaults_test.go | 118 ----- pkg/apis/tensorflow/v1alpha1/doc.go | 20 - pkg/apis/tensorflow/v1alpha1/register.go | 60 --- pkg/apis/tensorflow/v1alpha1/types.go | 193 --------- .../v1alpha1/zz_generated.deepcopy.go | 405 ------------------ .../v1alpha1/zz_generated.defaults.go | 45 -- .../versioned/typed/kubeflow/v1alpha1/doc.go | 18 - .../typed/kubeflow/v1alpha1/fake/doc.go | 18 - .../v1alpha1/fake/fake_kubeflow_client.go | 35 -- .../kubeflow/v1alpha1/fake/fake_tfjob.go | 123 ------ .../kubeflow/v1alpha1/generated_expansion.go | 16 - .../kubeflow/v1alpha1/kubeflow_client.go | 85 ---- .../typed/kubeflow/v1alpha1/tfjob.go | 152 ------- .../kubeflow/v1alpha1/interface.go | 41 -- .../kubeflow/v1alpha1/tfjob.go | 71 --- .../kubeflow/v1alpha1/expansion_generated.go | 25 -- pkg/client/listers/kubeflow/v1alpha1/tfjob.go | 92 ---- 18 files changed, 1575 deletions(-) delete mode 100644 pkg/apis/tensorflow/v1alpha1/defaults.go delete mode 100644 pkg/apis/tensorflow/v1alpha1/defaults_test.go delete mode 100644 pkg/apis/tensorflow/v1alpha1/doc.go delete mode 100644 
pkg/apis/tensorflow/v1alpha1/register.go delete mode 100644 pkg/apis/tensorflow/v1alpha1/types.go delete mode 100644 pkg/apis/tensorflow/v1alpha1/zz_generated.deepcopy.go delete mode 100644 pkg/apis/tensorflow/v1alpha1/zz_generated.defaults.go delete mode 100644 pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/doc.go delete mode 100644 pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/fake/doc.go delete mode 100644 pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/fake/fake_kubeflow_client.go delete mode 100644 pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/fake/fake_tfjob.go delete mode 100644 pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/generated_expansion.go delete mode 100644 pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/kubeflow_client.go delete mode 100644 pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/tfjob.go delete mode 100644 pkg/client/informers/externalversions/kubeflow/v1alpha1/interface.go delete mode 100644 pkg/client/informers/externalversions/kubeflow/v1alpha1/tfjob.go delete mode 100644 pkg/client/listers/kubeflow/v1alpha1/expansion_generated.go delete mode 100644 pkg/client/listers/kubeflow/v1alpha1/tfjob.go diff --git a/pkg/apis/tensorflow/v1alpha1/defaults.go b/pkg/apis/tensorflow/v1alpha1/defaults.go deleted file mode 100644 index d87976993f..0000000000 --- a/pkg/apis/tensorflow/v1alpha1/defaults.go +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright 2018 The Kubeflow Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -package v1alpha1 - -import ( - "github.com/golang/protobuf/proto" - "k8s.io/apimachinery/pkg/runtime" -) - -func addDefaultingFuncs(scheme *runtime.Scheme) error { - return RegisterDefaults(scheme) -} - -// SetDefaults_TFJob sets any unspecified values to defaults -func SetDefaults_TFJob(obj *TFJob) { - c := &obj.Spec - - if c.TFImage == "" { - c.TFImage = DefaultTFImage - } - - // Check that each replica has a TensorFlow container. - for _, r := range c.ReplicaSpecs { - - if r.TFPort == nil { - r.TFPort = proto.Int32(TFPort) - } - - if string(r.TFReplicaType) == "" { - r.TFReplicaType = MASTER - } - - if r.Replicas == nil { - r.Replicas = proto.Int32(Replicas) - } - } - if c.TerminationPolicy == nil { - c.TerminationPolicy = &TerminationPolicySpec{ - Chief: &ChiefSpec{ - ReplicaName: "MASTER", - ReplicaIndex: 0, - }, - } - } - -} diff --git a/pkg/apis/tensorflow/v1alpha1/defaults_test.go b/pkg/apis/tensorflow/v1alpha1/defaults_test.go deleted file mode 100644 index 419a647ea0..0000000000 --- a/pkg/apis/tensorflow/v1alpha1/defaults_test.go +++ /dev/null @@ -1,118 +0,0 @@ -// Copyright 2018 The Kubeflow Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package v1alpha1 - -import ( - "reflect" - "testing" - - "github.com/gogo/protobuf/proto" - "github.com/kubeflow/tf-operator/pkg/util" - "k8s.io/api/core/v1" -) - -func TestSetDefaults_TFJob(t *testing.T) { - type testCase struct { - in *TFJob - expected *TFJob - } - - testCases := []testCase{ - { - in: &TFJob{ - Spec: TFJobSpec{ - ReplicaSpecs: []*TFReplicaSpec{ - { - Template: &v1.PodTemplateSpec{ - Spec: v1.PodSpec{ - Containers: []v1.Container{ - { - Name: "tensorflow", - }, - }, - }, - }, - }, - }, - TFImage: "tensorflow/tensorflow:1.3.0", - }, - }, - expected: &TFJob{ - Spec: TFJobSpec{ - ReplicaSpecs: []*TFReplicaSpec{ - { - Replicas: proto.Int32(1), - TFPort: proto.Int32(2222), - Template: &v1.PodTemplateSpec{ - Spec: v1.PodSpec{ - Containers: []v1.Container{ - { - Name: "tensorflow", - }, - }, - }, - }, - TFReplicaType: MASTER, - }, - }, - TFImage: "tensorflow/tensorflow:1.3.0", - TerminationPolicy: &TerminationPolicySpec{ - Chief: &ChiefSpec{ - ReplicaName: "MASTER", - ReplicaIndex: 0, - }, - }, - }, - }, - }, - { - in: &TFJob{ - Spec: TFJobSpec{ - ReplicaSpecs: []*TFReplicaSpec{ - { - TFReplicaType: PS, - }, - }, - TFImage: "tensorflow/tensorflow:1.3.0", - }, - }, - expected: &TFJob{ - Spec: TFJobSpec{ - ReplicaSpecs: []*TFReplicaSpec{ - { - Replicas: proto.Int32(1), - TFPort: proto.Int32(2222), - TFReplicaType: PS, - }, - }, - TFImage: "tensorflow/tensorflow:1.3.0", - TerminationPolicy: &TerminationPolicySpec{ - Chief: &ChiefSpec{ - ReplicaName: "MASTER", - ReplicaIndex: 0, - }, - }, - }, - }, - }, - } - - for _, c := range testCases { - SetDefaults_TFJob(c.in) - if !reflect.DeepEqual(c.in, c.expected) { - t.Errorf("Want\n%v; Got\n %v", util.Pformat(c.expected), util.Pformat(c.in)) - } - } -} diff --git a/pkg/apis/tensorflow/v1alpha1/doc.go b/pkg/apis/tensorflow/v1alpha1/doc.go deleted file mode 100644 index 92db83ef13..0000000000 --- a/pkg/apis/tensorflow/v1alpha1/doc.go +++ /dev/null @@ -1,20 +0,0 @@ -// Copyright 2018 The Kubeflow Authors -// -// 
Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// +k8s:deepcopy-gen=package,register -// +k8s:defaulter-gen=TypeMeta - -// Package v1alpha1 is the v1alpha1 version of the API. -// +groupName=kubeflow.org -package v1alpha1 diff --git a/pkg/apis/tensorflow/v1alpha1/register.go b/pkg/apis/tensorflow/v1alpha1/register.go deleted file mode 100644 index 1fe6fad84c..0000000000 --- a/pkg/apis/tensorflow/v1alpha1/register.go +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright 2018 The Kubeflow Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package v1alpha1 - -import ( - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/runtime/schema" -) - -var ( - SchemeBuilder = runtime.NewSchemeBuilder(addKnownTypes) - AddToScheme = SchemeBuilder.AddToScheme -) - -const ( - // GroupName is the group name use in this package. - GroupName = "kubeflow.org" - // TFJobResourceKind is the kind name. 
- TFJobResourceKind = "TFJob" - // GroupVersion is the version. - GroupVersion = "v1alpha1" -) - -// SchemeGroupVersion is the group version used to register these objects. -var SchemeGroupVersion = schema.GroupVersion{Group: GroupName, Version: CRDVersion} - -func init() { - // We only register manually written functions here. The registration of the - // generated functions takes place in the generated files. The separation - // makes the code compile even when the generated files are missing. - SchemeBuilder.Register(addDefaultingFuncs) -} - -// Resource takes an unqualified resource and returns a Group-qualified GroupResource. -func Resource(resource string) schema.GroupResource { - return SchemeGroupVersion.WithResource(resource).GroupResource() -} - -// addKnownTypes adds the set of types defined in this package to the supplied scheme. -func addKnownTypes(scheme *runtime.Scheme) error { - scheme.AddKnownTypes(SchemeGroupVersion, - &TFJob{}, - &TFJobList{}, - ) - metav1.AddToGroupVersion(scheme, SchemeGroupVersion) - return nil -} diff --git a/pkg/apis/tensorflow/v1alpha1/types.go b/pkg/apis/tensorflow/v1alpha1/types.go deleted file mode 100644 index 26e26d3c45..0000000000 --- a/pkg/apis/tensorflow/v1alpha1/types.go +++ /dev/null @@ -1,193 +0,0 @@ -// Copyright 2018 The Kubeflow Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package v1alpha1 - -import ( - "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" -) - -const ( - CRDKind = "tfjob" - CRDKindPlural = "tfjobs" - CRDGroup = "kubeflow.org" - CRDVersion = "v1alpha1" - // Value of the APP label that gets applied to a lot of entities. - AppLabel = "tensorflow-job" - // Defaults for the Spec - TFPort = 2222 - Replicas = 1 -) - -// +genclient -// +genclient:noStatus -// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object -// +resource:path=tfjob - -// TFJob describes tfjob info -type TFJob struct { - metav1.TypeMeta `json:",inline"` - metav1.ObjectMeta `json:"metadata,omitempty"` - Spec TFJobSpec `json:"spec"` - Status TFJobStatus `json:"status"` -} - -type TFJobSpec struct { - // TODO(jlewi): Can we we get rid of this and use some value from Kubernetes or a random ide. - RuntimeId string - - // ReplicaSpecs specifies the TF replicas to run. - ReplicaSpecs []*TFReplicaSpec `json:"replicaSpecs"` - - // TFImage defines the tensorflow docker image that should be used for default parameter server - TFImage string `json:"tfImage,omitempty"` - - // TerminationPolicy specifies the condition that the tfjob should be considered finished. - TerminationPolicy *TerminationPolicySpec `json:"terminationPolicy,omitempty"` - - // SchedulerName specifies the name of scheduler which should handle the TFJob - SchedulerName string `json:"schedulerName,omitempty"` -} - -type TerminationPolicySpec struct { - // Chief policy waits for a particular process (which is the chief) to exit. - Chief *ChiefSpec `json:"chief,omitempty"` -} - -type ChiefSpec struct { - ReplicaName string `json:"replicaName"` - ReplicaIndex int `json:"replicaIndex"` -} - -// TFReplicaType determines how a set of TF processes are handled. 
-type TFReplicaType string - -const ( - MASTER TFReplicaType = "MASTER" - PS TFReplicaType = "PS" - WORKER TFReplicaType = "WORKER" -) - -const ( - DefaultTFContainer string = "tensorflow" - DefaultTFImage string = "tensorflow/tensorflow:1.3.0" -) - -// TODO(jlewi): We probably want to add a name field. This would allow us to have more than 1 type of each worker. -// This might be useful if you wanted to have a separate set of workers to do eval. -type TFReplicaSpec struct { - // Replicas is the number of desired replicas. - // This is a pointer to distinguish between explicit zero and unspecified. - // Defaults to 1. - // More info: http://kubernetes.io/docs/user-guide/replication-controller#what-is-a-replication-controller - // +optional - Replicas *int32 `json:"replicas,omitempty" protobuf:"varint,1,opt,name=replicas"` - Template *v1.PodTemplateSpec `json:"template,omitempty" protobuf:"bytes,3,opt,name=template"` - // TFPort is the port to use for TF services. - TFPort *int32 `json:"tfPort,omitempty" protobuf:"varint,1,opt,name=tfPort"` - TFReplicaType `json:"tfReplicaType"` -} - -type TFJobPhase string - -const ( - TFJobPhaseNone TFJobPhase = "" - TFJobPhaseCreating TFJobPhase = "Creating" - TFJobPhaseRunning TFJobPhase = "Running" - TFJobPhaseCleanUp TFJobPhase = "CleanUp" - TFJobPhaseFailed TFJobPhase = "Failed" - TFJobPhaseDone TFJobPhase = "Done" -) - -type State string - -const ( - StateUnknown State = "Unknown" - StateRunning State = "Running" - StateSucceeded State = "Succeeded" - StateFailed State = "Failed" -) - -type TFJobStatus struct { - // Phase is the TFJob running phase - Phase TFJobPhase `json:"phase"` - Reason string `json:"reason"` - - // State indicates the state of the job. - State State `json:"state"` - - // ReplicaStatuses specifies the status of each TF replica. 
- ReplicaStatuses []*TFReplicaStatus `json:"replicaStatuses"` -} - -type ReplicaState string - -const ( - ReplicaStateUnknown ReplicaState = "Unknown" - ReplicaStateRunning ReplicaState = "Running" - ReplicaStateFailed ReplicaState = "Failed" - ReplicaStateSucceeded ReplicaState = "Succeeded" -) - -type TFReplicaStatus struct { - TFReplicaType `json:"tf_replica_type"` - - // State is the overall state of the replica - State ReplicaState `json:"state"` - - // ReplicasStates provides the number of replicas in each status. - ReplicasStates map[ReplicaState]int -} - -// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object -// +resource:path=tfjobs - -// TFJobList is a list of TFJobs clusters. -type TFJobList struct { - metav1.TypeMeta `json:",inline"` - // Standard list metadata - // More info: http://releases.k8s.io/HEAD/docs/devel/api-conventions.md#metadata - metav1.ListMeta `json:"metadata,omitempty"` - // Items is a list of TFJobs - Items []TFJob `json:"items"` -} - -type ControllerConfig struct { - // Accelerators is a map from the name of the accelerator to the config for that accelerator. - // This should match the value specified as a container limit. - // e.g. alpha.kubernetes.io/nvidia-gpu - Accelerators map[string]AcceleratorConfig - - // Path to the file containing the grpc server source - GrpcServerFilePath string -} - -// AcceleratorVolume represents a host path that must be mounted into -// each container that needs to use GPUs. 
-type AcceleratorVolume struct { - Name string - HostPath string - MountPath string -} - -type AcceleratorConfig struct { - Volumes []AcceleratorVolume - EnvVars []EnvironmentVariableConfig -} - -type EnvironmentVariableConfig struct { - Name string - Value string -} diff --git a/pkg/apis/tensorflow/v1alpha1/zz_generated.deepcopy.go b/pkg/apis/tensorflow/v1alpha1/zz_generated.deepcopy.go deleted file mode 100644 index 739628b8ef..0000000000 --- a/pkg/apis/tensorflow/v1alpha1/zz_generated.deepcopy.go +++ /dev/null @@ -1,405 +0,0 @@ -// +build !ignore_autogenerated - -// Copyright 2018 The Kubeflow Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// This file was autogenerated by deepcopy-gen. Do not edit it manually! - -package v1alpha1 - -import ( - v1 "k8s.io/api/core/v1" - conversion "k8s.io/apimachinery/pkg/conversion" - runtime "k8s.io/apimachinery/pkg/runtime" - reflect "reflect" -) - -func init() { - SchemeBuilder.Register(RegisterDeepCopies) -} - -// RegisterDeepCopies adds deep-copy functions to the given scheme. Public -// to allow building arbitrary schemes. -// -// Deprecated: deepcopy registration will go away when static deepcopy is fully implemented. 
-func RegisterDeepCopies(scheme *runtime.Scheme) error { - return scheme.AddGeneratedDeepCopyFuncs( - conversion.GeneratedDeepCopyFunc{Fn: func(in interface{}, out interface{}, c *conversion.Cloner) error { - in.(*AcceleratorConfig).DeepCopyInto(out.(*AcceleratorConfig)) - return nil - }, InType: reflect.TypeOf(&AcceleratorConfig{})}, - conversion.GeneratedDeepCopyFunc{Fn: func(in interface{}, out interface{}, c *conversion.Cloner) error { - in.(*AcceleratorVolume).DeepCopyInto(out.(*AcceleratorVolume)) - return nil - }, InType: reflect.TypeOf(&AcceleratorVolume{})}, - conversion.GeneratedDeepCopyFunc{Fn: func(in interface{}, out interface{}, c *conversion.Cloner) error { - in.(*ChiefSpec).DeepCopyInto(out.(*ChiefSpec)) - return nil - }, InType: reflect.TypeOf(&ChiefSpec{})}, - conversion.GeneratedDeepCopyFunc{Fn: func(in interface{}, out interface{}, c *conversion.Cloner) error { - in.(*ControllerConfig).DeepCopyInto(out.(*ControllerConfig)) - return nil - }, InType: reflect.TypeOf(&ControllerConfig{})}, - conversion.GeneratedDeepCopyFunc{Fn: func(in interface{}, out interface{}, c *conversion.Cloner) error { - in.(*EnvironmentVariableConfig).DeepCopyInto(out.(*EnvironmentVariableConfig)) - return nil - }, InType: reflect.TypeOf(&EnvironmentVariableConfig{})}, - conversion.GeneratedDeepCopyFunc{Fn: func(in interface{}, out interface{}, c *conversion.Cloner) error { - in.(*TFJob).DeepCopyInto(out.(*TFJob)) - return nil - }, InType: reflect.TypeOf(&TFJob{})}, - conversion.GeneratedDeepCopyFunc{Fn: func(in interface{}, out interface{}, c *conversion.Cloner) error { - in.(*TFJobList).DeepCopyInto(out.(*TFJobList)) - return nil - }, InType: reflect.TypeOf(&TFJobList{})}, - conversion.GeneratedDeepCopyFunc{Fn: func(in interface{}, out interface{}, c *conversion.Cloner) error { - in.(*TFJobSpec).DeepCopyInto(out.(*TFJobSpec)) - return nil - }, InType: reflect.TypeOf(&TFJobSpec{})}, - conversion.GeneratedDeepCopyFunc{Fn: func(in interface{}, out interface{}, c 
*conversion.Cloner) error { - in.(*TFJobStatus).DeepCopyInto(out.(*TFJobStatus)) - return nil - }, InType: reflect.TypeOf(&TFJobStatus{})}, - conversion.GeneratedDeepCopyFunc{Fn: func(in interface{}, out interface{}, c *conversion.Cloner) error { - in.(*TFReplicaSpec).DeepCopyInto(out.(*TFReplicaSpec)) - return nil - }, InType: reflect.TypeOf(&TFReplicaSpec{})}, - conversion.GeneratedDeepCopyFunc{Fn: func(in interface{}, out interface{}, c *conversion.Cloner) error { - in.(*TFReplicaStatus).DeepCopyInto(out.(*TFReplicaStatus)) - return nil - }, InType: reflect.TypeOf(&TFReplicaStatus{})}, - conversion.GeneratedDeepCopyFunc{Fn: func(in interface{}, out interface{}, c *conversion.Cloner) error { - in.(*TerminationPolicySpec).DeepCopyInto(out.(*TerminationPolicySpec)) - return nil - }, InType: reflect.TypeOf(&TerminationPolicySpec{})}, - ) -} - -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *AcceleratorConfig) DeepCopyInto(out *AcceleratorConfig) { - *out = *in - if in.Volumes != nil { - in, out := &in.Volumes, &out.Volumes - *out = make([]AcceleratorVolume, len(*in)) - copy(*out, *in) - } - if in.EnvVars != nil { - in, out := &in.EnvVars, &out.EnvVars - *out = make([]EnvironmentVariableConfig, len(*in)) - copy(*out, *in) - } - return -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AcceleratorConfig. -func (in *AcceleratorConfig) DeepCopy() *AcceleratorConfig { - if in == nil { - return nil - } - out := new(AcceleratorConfig) - in.DeepCopyInto(out) - return out -} - -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *AcceleratorVolume) DeepCopyInto(out *AcceleratorVolume) { - *out = *in - return -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AcceleratorVolume. 
-func (in *AcceleratorVolume) DeepCopy() *AcceleratorVolume { - if in == nil { - return nil - } - out := new(AcceleratorVolume) - in.DeepCopyInto(out) - return out -} - -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *ChiefSpec) DeepCopyInto(out *ChiefSpec) { - *out = *in - return -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ChiefSpec. -func (in *ChiefSpec) DeepCopy() *ChiefSpec { - if in == nil { - return nil - } - out := new(ChiefSpec) - in.DeepCopyInto(out) - return out -} - -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *ControllerConfig) DeepCopyInto(out *ControllerConfig) { - *out = *in - if in.Accelerators != nil { - in, out := &in.Accelerators, &out.Accelerators - *out = make(map[string]AcceleratorConfig, len(*in)) - for key, val := range *in { - newVal := new(AcceleratorConfig) - val.DeepCopyInto(newVal) - (*out)[key] = *newVal - } - } - return -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ControllerConfig. -func (in *ControllerConfig) DeepCopy() *ControllerConfig { - if in == nil { - return nil - } - out := new(ControllerConfig) - in.DeepCopyInto(out) - return out -} - -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *EnvironmentVariableConfig) DeepCopyInto(out *EnvironmentVariableConfig) { - *out = *in - return -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new EnvironmentVariableConfig. -func (in *EnvironmentVariableConfig) DeepCopy() *EnvironmentVariableConfig { - if in == nil { - return nil - } - out := new(EnvironmentVariableConfig) - in.DeepCopyInto(out) - return out -} - -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. 
in must be non-nil. -func (in *TFJob) DeepCopyInto(out *TFJob) { - *out = *in - out.TypeMeta = in.TypeMeta - in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) - in.Spec.DeepCopyInto(&out.Spec) - in.Status.DeepCopyInto(&out.Status) - return -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TFJob. -func (in *TFJob) DeepCopy() *TFJob { - if in == nil { - return nil - } - out := new(TFJob) - in.DeepCopyInto(out) - return out -} - -// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. -func (in *TFJob) DeepCopyObject() runtime.Object { - if c := in.DeepCopy(); c != nil { - return c - } else { - return nil - } -} - -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *TFJobList) DeepCopyInto(out *TFJobList) { - *out = *in - out.TypeMeta = in.TypeMeta - out.ListMeta = in.ListMeta - if in.Items != nil { - in, out := &in.Items, &out.Items - *out = make([]TFJob, len(*in)) - for i := range *in { - (*in)[i].DeepCopyInto(&(*out)[i]) - } - } - return -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TFJobList. -func (in *TFJobList) DeepCopy() *TFJobList { - if in == nil { - return nil - } - out := new(TFJobList) - in.DeepCopyInto(out) - return out -} - -// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. -func (in *TFJobList) DeepCopyObject() runtime.Object { - if c := in.DeepCopy(); c != nil { - return c - } else { - return nil - } -} - -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
-func (in *TFJobSpec) DeepCopyInto(out *TFJobSpec) { - *out = *in - if in.ReplicaSpecs != nil { - in, out := &in.ReplicaSpecs, &out.ReplicaSpecs - *out = make([]*TFReplicaSpec, len(*in)) - for i := range *in { - if (*in)[i] == nil { - (*out)[i] = nil - } else { - (*out)[i] = new(TFReplicaSpec) - (*in)[i].DeepCopyInto((*out)[i]) - } - } - } - if in.TerminationPolicy != nil { - in, out := &in.TerminationPolicy, &out.TerminationPolicy - if *in == nil { - *out = nil - } else { - *out = new(TerminationPolicySpec) - (*in).DeepCopyInto(*out) - } - } - return -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TFJobSpec. -func (in *TFJobSpec) DeepCopy() *TFJobSpec { - if in == nil { - return nil - } - out := new(TFJobSpec) - in.DeepCopyInto(out) - return out -} - -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *TFJobStatus) DeepCopyInto(out *TFJobStatus) { - *out = *in - if in.ReplicaStatuses != nil { - in, out := &in.ReplicaStatuses, &out.ReplicaStatuses - *out = make([]*TFReplicaStatus, len(*in)) - for i := range *in { - if (*in)[i] == nil { - (*out)[i] = nil - } else { - (*out)[i] = new(TFReplicaStatus) - (*in)[i].DeepCopyInto((*out)[i]) - } - } - } - return -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TFJobStatus. -func (in *TFJobStatus) DeepCopy() *TFJobStatus { - if in == nil { - return nil - } - out := new(TFJobStatus) - in.DeepCopyInto(out) - return out -} - -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
-func (in *TFReplicaSpec) DeepCopyInto(out *TFReplicaSpec) { - *out = *in - if in.Replicas != nil { - in, out := &in.Replicas, &out.Replicas - if *in == nil { - *out = nil - } else { - *out = new(int32) - **out = **in - } - } - if in.Template != nil { - in, out := &in.Template, &out.Template - if *in == nil { - *out = nil - } else { - *out = new(v1.PodTemplateSpec) - (*in).DeepCopyInto(*out) - } - } - if in.TFPort != nil { - in, out := &in.TFPort, &out.TFPort - if *in == nil { - *out = nil - } else { - *out = new(int32) - **out = **in - } - } - return -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TFReplicaSpec. -func (in *TFReplicaSpec) DeepCopy() *TFReplicaSpec { - if in == nil { - return nil - } - out := new(TFReplicaSpec) - in.DeepCopyInto(out) - return out -} - -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *TFReplicaStatus) DeepCopyInto(out *TFReplicaStatus) { - *out = *in - if in.ReplicasStates != nil { - in, out := &in.ReplicasStates, &out.ReplicasStates - *out = make(map[ReplicaState]int, len(*in)) - for key, val := range *in { - (*out)[key] = val - } - } - return -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TFReplicaStatus. -func (in *TFReplicaStatus) DeepCopy() *TFReplicaStatus { - if in == nil { - return nil - } - out := new(TFReplicaStatus) - in.DeepCopyInto(out) - return out -} - -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *TerminationPolicySpec) DeepCopyInto(out *TerminationPolicySpec) { - *out = *in - if in.Chief != nil { - in, out := &in.Chief, &out.Chief - if *in == nil { - *out = nil - } else { - *out = new(ChiefSpec) - **out = **in - } - } - return -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TerminationPolicySpec. 
-func (in *TerminationPolicySpec) DeepCopy() *TerminationPolicySpec { - if in == nil { - return nil - } - out := new(TerminationPolicySpec) - in.DeepCopyInto(out) - return out -} diff --git a/pkg/apis/tensorflow/v1alpha1/zz_generated.defaults.go b/pkg/apis/tensorflow/v1alpha1/zz_generated.defaults.go deleted file mode 100644 index 2d3a981187..0000000000 --- a/pkg/apis/tensorflow/v1alpha1/zz_generated.defaults.go +++ /dev/null @@ -1,45 +0,0 @@ -// +build !ignore_autogenerated - -/* -Copyright 2018 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -// This file was autogenerated by defaulter-gen. Do not edit it manually! - -package v1alpha1 - -import ( - runtime "k8s.io/apimachinery/pkg/runtime" -) - -// RegisterDefaults adds defaulters functions to the given scheme. -// Public to allow building arbitrary schemes. -// All generated defaulters are covering - they call all nested defaulters. 
-func RegisterDefaults(scheme *runtime.Scheme) error { - scheme.AddTypeDefaultingFunc(&TFJob{}, func(obj interface{}) { SetObjectDefaults_TFJob(obj.(*TFJob)) }) - scheme.AddTypeDefaultingFunc(&TFJobList{}, func(obj interface{}) { SetObjectDefaults_TFJobList(obj.(*TFJobList)) }) - return nil -} - -func SetObjectDefaults_TFJob(in *TFJob) { - SetDefaults_TFJob(in) -} - -func SetObjectDefaults_TFJobList(in *TFJobList) { - for i := range in.Items { - a := &in.Items[i] - SetObjectDefaults_TFJob(a) - } -} diff --git a/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/doc.go b/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/doc.go deleted file mode 100644 index 8d24212e9c..0000000000 --- a/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/doc.go +++ /dev/null @@ -1,18 +0,0 @@ -// Copyright 2018 The Kubeflow Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// This package is generated by client-gen with custom arguments. - -// This package has the automatically generated typed clients. 
-package v1alpha1 diff --git a/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/fake/doc.go b/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/fake/doc.go deleted file mode 100644 index 41d860c548..0000000000 --- a/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/fake/doc.go +++ /dev/null @@ -1,18 +0,0 @@ -// Copyright 2018 The Kubeflow Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// This package is generated by client-gen with custom arguments. - -// Package fake has the automatically generated clients. -package fake diff --git a/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/fake/fake_kubeflow_client.go b/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/fake/fake_kubeflow_client.go deleted file mode 100644 index 26f189df83..0000000000 --- a/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/fake/fake_kubeflow_client.go +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright 2018 The Kubeflow Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -package fake - -import ( - v1alpha1 "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1" - rest "k8s.io/client-go/rest" - testing "k8s.io/client-go/testing" -) - -type FakeKubeflowV1alpha1 struct { - *testing.Fake -} - -func (c *FakeKubeflowV1alpha1) TFJobs(namespace string) v1alpha1.TFJobInterface { - return &FakeTFJobs{c, namespace} -} - -// RESTClient returns a RESTClient that is used to communicate -// with API server by this client implementation. -func (c *FakeKubeflowV1alpha1) RESTClient() rest.Interface { - var ret *rest.RESTClient - return ret -} diff --git a/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/fake/fake_tfjob.go b/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/fake/fake_tfjob.go deleted file mode 100644 index f42aab7738..0000000000 --- a/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/fake/fake_tfjob.go +++ /dev/null @@ -1,123 +0,0 @@ -// Copyright 2018 The Kubeflow Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-package fake - -import ( - v1alpha1 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha1" - v1 "k8s.io/apimachinery/pkg/apis/meta/v1" - labels "k8s.io/apimachinery/pkg/labels" - schema "k8s.io/apimachinery/pkg/runtime/schema" - types "k8s.io/apimachinery/pkg/types" - watch "k8s.io/apimachinery/pkg/watch" - testing "k8s.io/client-go/testing" -) - -// FakeTFJobs implements TFJobInterface -type FakeTFJobs struct { - Fake *FakeKubeflowV1alpha1 - ns string -} - -var tfjobsResource = schema.GroupVersionResource{Group: "kubeflow.org", Version: "v1alpha1", Resource: "tfjobs"} - -var tfjobsKind = schema.GroupVersionKind{Group: "kubeflow.org", Version: "v1alpha1", Kind: "TFJob"} - -// Get takes name of the tFJob, and returns the corresponding tFJob object, and an error if there is any. -func (c *FakeTFJobs) Get(name string, options v1.GetOptions) (result *v1alpha1.TFJob, err error) { - obj, err := c.Fake. - Invokes(testing.NewGetAction(tfjobsResource, c.ns, name), &v1alpha1.TFJob{}) - - if obj == nil { - return nil, err - } - return obj.(*v1alpha1.TFJob), err -} - -// List takes label and field selectors, and returns the list of TFJobs that match those selectors. -func (c *FakeTFJobs) List(opts v1.ListOptions) (result *v1alpha1.TFJobList, err error) { - obj, err := c.Fake. - Invokes(testing.NewListAction(tfjobsResource, tfjobsKind, c.ns, opts), &v1alpha1.TFJobList{}) - - if obj == nil { - return nil, err - } - - label, _, _ := testing.ExtractFromListOptions(opts) - if label == nil { - label = labels.Everything() - } - list := &v1alpha1.TFJobList{} - for _, item := range obj.(*v1alpha1.TFJobList).Items { - if label.Matches(labels.Set(item.Labels)) { - list.Items = append(list.Items, item) - } - } - return list, err -} - -// Watch returns a watch.Interface that watches the requested tFJobs. -func (c *FakeTFJobs) Watch(opts v1.ListOptions) (watch.Interface, error) { - return c.Fake. 
- InvokesWatch(testing.NewWatchAction(tfjobsResource, c.ns, opts)) - -} - -// Create takes the representation of a tFJob and creates it. Returns the server's representation of the tFJob, and an error, if there is any. -func (c *FakeTFJobs) Create(tFJob *v1alpha1.TFJob) (result *v1alpha1.TFJob, err error) { - obj, err := c.Fake. - Invokes(testing.NewCreateAction(tfjobsResource, c.ns, tFJob), &v1alpha1.TFJob{}) - - if obj == nil { - return nil, err - } - return obj.(*v1alpha1.TFJob), err -} - -// Update takes the representation of a tFJob and updates it. Returns the server's representation of the tFJob, and an error, if there is any. -func (c *FakeTFJobs) Update(tFJob *v1alpha1.TFJob) (result *v1alpha1.TFJob, err error) { - obj, err := c.Fake. - Invokes(testing.NewUpdateAction(tfjobsResource, c.ns, tFJob), &v1alpha1.TFJob{}) - - if obj == nil { - return nil, err - } - return obj.(*v1alpha1.TFJob), err -} - -// Delete takes name of the tFJob and deletes it. Returns an error if one occurs. -func (c *FakeTFJobs) Delete(name string, options *v1.DeleteOptions) error { - _, err := c.Fake. - Invokes(testing.NewDeleteAction(tfjobsResource, c.ns, name), &v1alpha1.TFJob{}) - - return err -} - -// DeleteCollection deletes a collection of objects. -func (c *FakeTFJobs) DeleteCollection(options *v1.DeleteOptions, listOptions v1.ListOptions) error { - action := testing.NewDeleteCollectionAction(tfjobsResource, c.ns, listOptions) - - _, err := c.Fake.Invokes(action, &v1alpha1.TFJobList{}) - return err -} - -// Patch applies the patch and returns the patched tFJob. -func (c *FakeTFJobs) Patch(name string, pt types.PatchType, data []byte, subresources ...string) (result *v1alpha1.TFJob, err error) { - obj, err := c.Fake. 
- Invokes(testing.NewPatchSubresourceAction(tfjobsResource, c.ns, name, data, subresources...), &v1alpha1.TFJob{}) - - if obj == nil { - return nil, err - } - return obj.(*v1alpha1.TFJob), err -} diff --git a/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/generated_expansion.go b/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/generated_expansion.go deleted file mode 100644 index 609abde95d..0000000000 --- a/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/generated_expansion.go +++ /dev/null @@ -1,16 +0,0 @@ -// Copyright 2018 The Kubeflow Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -package v1alpha1 - -type TFJobExpansion interface{} diff --git a/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/kubeflow_client.go b/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/kubeflow_client.go deleted file mode 100644 index dfb97df405..0000000000 --- a/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/kubeflow_client.go +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright 2018 The Kubeflow Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -package v1alpha1 - -import ( - v1alpha1 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha1" - "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned/scheme" - serializer "k8s.io/apimachinery/pkg/runtime/serializer" - rest "k8s.io/client-go/rest" -) - -type KubeflowV1alpha1Interface interface { - RESTClient() rest.Interface - TFJobsGetter -} - -// KubeflowV1alpha1Client is used to interact with features provided by the kubeflow.org group. -type KubeflowV1alpha1Client struct { - restClient rest.Interface -} - -func (c *KubeflowV1alpha1Client) TFJobs(namespace string) TFJobInterface { - return newTFJobs(c, namespace) -} - -// NewForConfig creates a new KubeflowV1alpha1Client for the given config. -func NewForConfig(c *rest.Config) (*KubeflowV1alpha1Client, error) { - config := *c - if err := setConfigDefaults(&config); err != nil { - return nil, err - } - client, err := rest.RESTClientFor(&config) - if err != nil { - return nil, err - } - return &KubeflowV1alpha1Client{client}, nil -} - -// NewForConfigOrDie creates a new KubeflowV1alpha1Client for the given config and -// panics if there is an error in the config. -func NewForConfigOrDie(c *rest.Config) *KubeflowV1alpha1Client { - client, err := NewForConfig(c) - if err != nil { - panic(err) - } - return client -} - -// New creates a new KubeflowV1alpha1Client for the given RESTClient. 
-func New(c rest.Interface) *KubeflowV1alpha1Client { - return &KubeflowV1alpha1Client{c} -} - -func setConfigDefaults(config *rest.Config) error { - gv := v1alpha1.SchemeGroupVersion - config.GroupVersion = &gv - config.APIPath = "/apis" - config.NegotiatedSerializer = serializer.DirectCodecFactory{CodecFactory: scheme.Codecs} - - if config.UserAgent == "" { - config.UserAgent = rest.DefaultKubernetesUserAgent() - } - - return nil -} - -// RESTClient returns a RESTClient that is used to communicate -// with API server by this client implementation. -func (c *KubeflowV1alpha1Client) RESTClient() rest.Interface { - if c == nil { - return nil - } - return c.restClient -} diff --git a/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/tfjob.go b/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/tfjob.go deleted file mode 100644 index 87d02e3869..0000000000 --- a/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/tfjob.go +++ /dev/null @@ -1,152 +0,0 @@ -// Copyright 2018 The Kubeflow Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -package v1alpha1 - -import ( - v1alpha1 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha1" - scheme "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned/scheme" - v1 "k8s.io/apimachinery/pkg/apis/meta/v1" - types "k8s.io/apimachinery/pkg/types" - watch "k8s.io/apimachinery/pkg/watch" - rest "k8s.io/client-go/rest" -) - -// TFJobsGetter has a method to return a TFJobInterface. 
-// A group's client should implement this interface. -type TFJobsGetter interface { - TFJobs(namespace string) TFJobInterface -} - -// TFJobInterface has methods to work with TFJob resources. -type TFJobInterface interface { - Create(*v1alpha1.TFJob) (*v1alpha1.TFJob, error) - Update(*v1alpha1.TFJob) (*v1alpha1.TFJob, error) - Delete(name string, options *v1.DeleteOptions) error - DeleteCollection(options *v1.DeleteOptions, listOptions v1.ListOptions) error - Get(name string, options v1.GetOptions) (*v1alpha1.TFJob, error) - List(opts v1.ListOptions) (*v1alpha1.TFJobList, error) - Watch(opts v1.ListOptions) (watch.Interface, error) - Patch(name string, pt types.PatchType, data []byte, subresources ...string) (result *v1alpha1.TFJob, err error) - TFJobExpansion -} - -// tFJobs implements TFJobInterface -type tFJobs struct { - client rest.Interface - ns string -} - -// newTFJobs returns a TFJobs -func newTFJobs(c *KubeflowV1alpha1Client, namespace string) *tFJobs { - return &tFJobs{ - client: c.RESTClient(), - ns: namespace, - } -} - -// Get takes name of the tFJob, and returns the corresponding tFJob object, and an error if there is any. -func (c *tFJobs) Get(name string, options v1.GetOptions) (result *v1alpha1.TFJob, err error) { - result = &v1alpha1.TFJob{} - err = c.client.Get(). - Namespace(c.ns). - Resource("tfjobs"). - Name(name). - VersionedParams(&options, scheme.ParameterCodec). - Do(). - Into(result) - return -} - -// List takes label and field selectors, and returns the list of TFJobs that match those selectors. -func (c *tFJobs) List(opts v1.ListOptions) (result *v1alpha1.TFJobList, err error) { - result = &v1alpha1.TFJobList{} - err = c.client.Get(). - Namespace(c.ns). - Resource("tfjobs"). - VersionedParams(&opts, scheme.ParameterCodec). - Do(). - Into(result) - return -} - -// Watch returns a watch.Interface that watches the requested tFJobs. 
-func (c *tFJobs) Watch(opts v1.ListOptions) (watch.Interface, error) { - opts.Watch = true - return c.client.Get(). - Namespace(c.ns). - Resource("tfjobs"). - VersionedParams(&opts, scheme.ParameterCodec). - Watch() -} - -// Create takes the representation of a tFJob and creates it. Returns the server's representation of the tFJob, and an error, if there is any. -func (c *tFJobs) Create(tFJob *v1alpha1.TFJob) (result *v1alpha1.TFJob, err error) { - result = &v1alpha1.TFJob{} - err = c.client.Post(). - Namespace(c.ns). - Resource("tfjobs"). - Body(tFJob). - Do(). - Into(result) - return -} - -// Update takes the representation of a tFJob and updates it. Returns the server's representation of the tFJob, and an error, if there is any. -func (c *tFJobs) Update(tFJob *v1alpha1.TFJob) (result *v1alpha1.TFJob, err error) { - result = &v1alpha1.TFJob{} - err = c.client.Put(). - Namespace(c.ns). - Resource("tfjobs"). - Name(tFJob.Name). - Body(tFJob). - Do(). - Into(result) - return -} - -// Delete takes name of the tFJob and deletes it. Returns an error if one occurs. -func (c *tFJobs) Delete(name string, options *v1.DeleteOptions) error { - return c.client.Delete(). - Namespace(c.ns). - Resource("tfjobs"). - Name(name). - Body(options). - Do(). - Error() -} - -// DeleteCollection deletes a collection of objects. -func (c *tFJobs) DeleteCollection(options *v1.DeleteOptions, listOptions v1.ListOptions) error { - return c.client.Delete(). - Namespace(c.ns). - Resource("tfjobs"). - VersionedParams(&listOptions, scheme.ParameterCodec). - Body(options). - Do(). - Error() -} - -// Patch applies the patch and returns the patched tFJob. -func (c *tFJobs) Patch(name string, pt types.PatchType, data []byte, subresources ...string) (result *v1alpha1.TFJob, err error) { - result = &v1alpha1.TFJob{} - err = c.client.Patch(pt). - Namespace(c.ns). - Resource("tfjobs"). - SubResource(subresources...). - Name(name). - Body(data). - Do(). 
- Into(result) - return -} diff --git a/pkg/client/informers/externalversions/kubeflow/v1alpha1/interface.go b/pkg/client/informers/externalversions/kubeflow/v1alpha1/interface.go deleted file mode 100644 index d6d535ee33..0000000000 --- a/pkg/client/informers/externalversions/kubeflow/v1alpha1/interface.go +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright 2018 The Kubeflow Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// This file was automatically generated by informer-gen - -package v1alpha1 - -import ( - internalinterfaces "github.com/kubeflow/tf-operator/pkg/client/informers/externalversions/internalinterfaces" -) - -// Interface provides access to all the informers in this group version. -type Interface interface { - // TFJobs returns a TFJobInformer. - TFJobs() TFJobInformer -} - -type version struct { - internalinterfaces.SharedInformerFactory -} - -// New returns a new Interface. -func New(f internalinterfaces.SharedInformerFactory) Interface { - return &version{f} -} - -// TFJobs returns a TFJobInformer. 
-func (v *version) TFJobs() TFJobInformer { - return &tFJobInformer{factory: v.SharedInformerFactory} -} diff --git a/pkg/client/informers/externalversions/kubeflow/v1alpha1/tfjob.go b/pkg/client/informers/externalversions/kubeflow/v1alpha1/tfjob.go deleted file mode 100644 index 719d577179..0000000000 --- a/pkg/client/informers/externalversions/kubeflow/v1alpha1/tfjob.go +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright 2018 The Kubeflow Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// This file was automatically generated by informer-gen - -package v1alpha1 - -import ( - tensorflow_v1alpha1 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha1" - versioned "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned" - internalinterfaces "github.com/kubeflow/tf-operator/pkg/client/informers/externalversions/internalinterfaces" - v1alpha1 "github.com/kubeflow/tf-operator/pkg/client/listers/kubeflow/v1alpha1" - v1 "k8s.io/apimachinery/pkg/apis/meta/v1" - runtime "k8s.io/apimachinery/pkg/runtime" - watch "k8s.io/apimachinery/pkg/watch" - cache "k8s.io/client-go/tools/cache" - time "time" -) - -// TFJobInformer provides access to a shared informer and lister for -// TFJobs. -type TFJobInformer interface { - Informer() cache.SharedIndexInformer - Lister() v1alpha1.TFJobLister -} - -type tFJobInformer struct { - factory internalinterfaces.SharedInformerFactory -} - -// NewTFJobInformer constructs a new informer for TFJob type. 
-// Always prefer using an informer factory to get a shared informer instead of getting an independent -// one. This reduces memory footprint and number of connections to the server. -func NewTFJobInformer(client versioned.Interface, namespace string, resyncPeriod time.Duration, indexers cache.Indexers) cache.SharedIndexInformer { - return cache.NewSharedIndexInformer( - &cache.ListWatch{ - ListFunc: func(options v1.ListOptions) (runtime.Object, error) { - return client.KubeflowV1alpha1().TFJobs(namespace).List(options) - }, - WatchFunc: func(options v1.ListOptions) (watch.Interface, error) { - return client.KubeflowV1alpha1().TFJobs(namespace).Watch(options) - }, - }, - &tensorflow_v1alpha1.TFJob{}, - resyncPeriod, - indexers, - ) -} - -func defaultTFJobInformer(client versioned.Interface, resyncPeriod time.Duration) cache.SharedIndexInformer { - return NewTFJobInformer(client, v1.NamespaceAll, resyncPeriod, cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc}) -} - -func (f *tFJobInformer) Informer() cache.SharedIndexInformer { - return f.factory.InformerFor(&tensorflow_v1alpha1.TFJob{}, defaultTFJobInformer) -} - -func (f *tFJobInformer) Lister() v1alpha1.TFJobLister { - return v1alpha1.NewTFJobLister(f.Informer().GetIndexer()) -} diff --git a/pkg/client/listers/kubeflow/v1alpha1/expansion_generated.go b/pkg/client/listers/kubeflow/v1alpha1/expansion_generated.go deleted file mode 100644 index 13eb4845c0..0000000000 --- a/pkg/client/listers/kubeflow/v1alpha1/expansion_generated.go +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright 2018 The Kubeflow Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// This file was automatically generated by lister-gen - -package v1alpha1 - -// TFJobListerExpansion allows custom methods to be added to -// TFJobLister. -type TFJobListerExpansion interface{} - -// TFJobNamespaceListerExpansion allows custom methods to be added to -// TFJobNamespaceLister. -type TFJobNamespaceListerExpansion interface{} diff --git a/pkg/client/listers/kubeflow/v1alpha1/tfjob.go b/pkg/client/listers/kubeflow/v1alpha1/tfjob.go deleted file mode 100644 index 82c329c3d9..0000000000 --- a/pkg/client/listers/kubeflow/v1alpha1/tfjob.go +++ /dev/null @@ -1,92 +0,0 @@ -// Copyright 2018 The Kubeflow Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// This file was automatically generated by lister-gen - -package v1alpha1 - -import ( - v1alpha1 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha1" - "k8s.io/apimachinery/pkg/api/errors" - "k8s.io/apimachinery/pkg/labels" - "k8s.io/client-go/tools/cache" -) - -// TFJobLister helps list TFJobs. 
-type TFJobLister interface { - // List lists all TFJobs in the indexer. - List(selector labels.Selector) (ret []*v1alpha1.TFJob, err error) - // TFJobs returns an object that can list and get TFJobs. - TFJobs(namespace string) TFJobNamespaceLister - TFJobListerExpansion -} - -// tFJobLister implements the TFJobLister interface. -type tFJobLister struct { - indexer cache.Indexer -} - -// NewTFJobLister returns a new TFJobLister. -func NewTFJobLister(indexer cache.Indexer) TFJobLister { - return &tFJobLister{indexer: indexer} -} - -// List lists all TFJobs in the indexer. -func (s *tFJobLister) List(selector labels.Selector) (ret []*v1alpha1.TFJob, err error) { - err = cache.ListAll(s.indexer, selector, func(m interface{}) { - ret = append(ret, m.(*v1alpha1.TFJob)) - }) - return ret, err -} - -// TFJobs returns an object that can list and get TFJobs. -func (s *tFJobLister) TFJobs(namespace string) TFJobNamespaceLister { - return tFJobNamespaceLister{indexer: s.indexer, namespace: namespace} -} - -// TFJobNamespaceLister helps list and get TFJobs. -type TFJobNamespaceLister interface { - // List lists all TFJobs in the indexer for a given namespace. - List(selector labels.Selector) (ret []*v1alpha1.TFJob, err error) - // Get retrieves the TFJob from the indexer for a given namespace and name. - Get(name string) (*v1alpha1.TFJob, error) - TFJobNamespaceListerExpansion -} - -// tFJobNamespaceLister implements the TFJobNamespaceLister -// interface. -type tFJobNamespaceLister struct { - indexer cache.Indexer - namespace string -} - -// List lists all TFJobs in the indexer for a given namespace. -func (s tFJobNamespaceLister) List(selector labels.Selector) (ret []*v1alpha1.TFJob, err error) { - err = cache.ListAllByNamespace(s.indexer, s.namespace, selector, func(m interface{}) { - ret = append(ret, m.(*v1alpha1.TFJob)) - }) - return ret, err -} - -// Get retrieves the TFJob from the indexer for a given namespace and name. 
-func (s tFJobNamespaceLister) Get(name string) (*v1alpha1.TFJob, error) { - obj, exists, err := s.indexer.GetByKey(s.namespace + "/" + name) - if err != nil { - return nil, err - } - if !exists { - return nil, errors.NewNotFound(v1alpha1.Resource("tfjob"), name) - } - return obj.(*v1alpha1.TFJob), nil -} From 910225d3f65e9a4f676ca9fbe7b4440dd51c98f6 Mon Sep 17 00:00:00 2001 From: Penghao Cen Date: Fri, 16 Mar 2018 11:34:10 +0800 Subject: [PATCH 08/24] Append labels instead of rewriting (#468) --- pkg/controller/controller_pod.go | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pkg/controller/controller_pod.go b/pkg/controller/controller_pod.go index 94486578a9..7c576acd96 100644 --- a/pkg/controller/controller_pod.go +++ b/pkg/controller/controller_pod.go @@ -76,7 +76,13 @@ func (tc *TFJobController) reconcilePods( labels[tfReplicaTypeLabel] = rt labels[tfReplicaIndexLabel] = index - pTemplate.Labels = labels + if pTemplate.Labels == nil { + pTemplate.Labels = make(map[string]string) + } + + for key, value := range labels { + pTemplate.Labels[key] = value + } // Generate TF_CONFIG JSON string. 
tfConfigStr := genTFConfigJSONStr(tfjob, rt, index) From 68f771eeec4615d3ac0e634e815475ee901af3b9 Mon Sep 17 00:00:00 2001 From: Ce Gao Date: Mon, 19 Mar 2018 10:39:13 +0800 Subject: [PATCH 09/24] linter: Fix linter ignore file (#466) * linter: Fix Signed-off-by: Ce Gao * linter: Keep v1alpha2 Signed-off-by: Ce Gao * client: Update Signed-off-by: Ce Gao --- linter_config.json | 4 +- .../typed/tensorflow/v1alpha2/doc.go | 20 --- .../typed/tensorflow/v1alpha2/fake/doc.go | 20 --- .../v1alpha2/fake/fake_tensorflow_client.go | 38 ----- .../tensorflow/v1alpha2/fake/fake_tfjob.go | 126 -------------- .../v1alpha2/generated_expansion.go | 19 --- .../tensorflow/v1alpha2/tensorflow_client.go | 88 ---------- .../typed/tensorflow/v1alpha2/tfjob.go | 155 ------------------ .../externalversions/tensorflow/interface.go | 44 ----- .../tensorflow/v1alpha2/interface.go | 43 ----- .../tensorflow/v1alpha2/tfjob.go | 73 --------- .../v1alpha2/expansion_generated.go | 27 --- .../listers/tensorflow/v1alpha2/tfjob.go | 94 ----------- 13 files changed, 3 insertions(+), 748 deletions(-) delete mode 100644 pkg/client/clientset/versioned/typed/tensorflow/v1alpha2/doc.go delete mode 100644 pkg/client/clientset/versioned/typed/tensorflow/v1alpha2/fake/doc.go delete mode 100644 pkg/client/clientset/versioned/typed/tensorflow/v1alpha2/fake/fake_tensorflow_client.go delete mode 100644 pkg/client/clientset/versioned/typed/tensorflow/v1alpha2/fake/fake_tfjob.go delete mode 100644 pkg/client/clientset/versioned/typed/tensorflow/v1alpha2/generated_expansion.go delete mode 100644 pkg/client/clientset/versioned/typed/tensorflow/v1alpha2/tensorflow_client.go delete mode 100644 pkg/client/clientset/versioned/typed/tensorflow/v1alpha2/tfjob.go delete mode 100644 pkg/client/informers/externalversions/tensorflow/interface.go delete mode 100644 pkg/client/informers/externalversions/tensorflow/v1alpha2/interface.go delete mode 100644 pkg/client/informers/externalversions/tensorflow/v1alpha2/tfjob.go delete 
mode 100644 pkg/client/listers/tensorflow/v1alpha2/expansion_generated.go delete mode 100644 pkg/client/listers/tensorflow/v1alpha2/tfjob.go diff --git a/linter_config.json b/linter_config.json index 00cabcb2b4..c648faefd5 100644 --- a/linter_config.json +++ b/linter_config.json @@ -24,7 +24,9 @@ "comment or be unexported", "comment on exported", "pkg/apis/tensorflow/v1alpha1/zz_generated.deepcopy.go", - "pkg/apis/tensorflow/v1alpha1/zz_generated.defaults.go" + "pkg/apis/tensorflow/v1alpha1/zz_generated.defaults.go", + "pkg/apis/tensorflow/v1alpha2/zz_generated.deepcopy.go", + "pkg/apis/tensorflow/v1alpha2/zz_generated.defaults.go" ], "Deadline": "300s", "Skip": ["pkg/client"] diff --git a/pkg/client/clientset/versioned/typed/tensorflow/v1alpha2/doc.go b/pkg/client/clientset/versioned/typed/tensorflow/v1alpha2/doc.go deleted file mode 100644 index ef161aeae6..0000000000 --- a/pkg/client/clientset/versioned/typed/tensorflow/v1alpha2/doc.go +++ /dev/null @@ -1,20 +0,0 @@ -/* -Copyright 2018 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -// This package is generated by client-gen with custom arguments. - -// This package has the automatically generated typed clients. 
-package v1alpha2 diff --git a/pkg/client/clientset/versioned/typed/tensorflow/v1alpha2/fake/doc.go b/pkg/client/clientset/versioned/typed/tensorflow/v1alpha2/fake/doc.go deleted file mode 100644 index d4003d501b..0000000000 --- a/pkg/client/clientset/versioned/typed/tensorflow/v1alpha2/fake/doc.go +++ /dev/null @@ -1,20 +0,0 @@ -/* -Copyright 2018 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -// This package is generated by client-gen with custom arguments. - -// Package fake has the automatically generated clients. -package fake diff --git a/pkg/client/clientset/versioned/typed/tensorflow/v1alpha2/fake/fake_tensorflow_client.go b/pkg/client/clientset/versioned/typed/tensorflow/v1alpha2/fake/fake_tensorflow_client.go deleted file mode 100644 index b214b508d4..0000000000 --- a/pkg/client/clientset/versioned/typed/tensorflow/v1alpha2/fake/fake_tensorflow_client.go +++ /dev/null @@ -1,38 +0,0 @@ -/* -Copyright 2018 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. -*/ - -package fake - -import ( - v1alpha2 "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned/typed/tensorflow/v1alpha2" - rest "k8s.io/client-go/rest" - testing "k8s.io/client-go/testing" -) - -type FakeTensorflowV1alpha2 struct { - *testing.Fake -} - -func (c *FakeTensorflowV1alpha2) TFJobs(namespace string) v1alpha2.TFJobInterface { - return &FakeTFJobs{c, namespace} -} - -// RESTClient returns a RESTClient that is used to communicate -// with API server by this client implementation. -func (c *FakeTensorflowV1alpha2) RESTClient() rest.Interface { - var ret *rest.RESTClient - return ret -} diff --git a/pkg/client/clientset/versioned/typed/tensorflow/v1alpha2/fake/fake_tfjob.go b/pkg/client/clientset/versioned/typed/tensorflow/v1alpha2/fake/fake_tfjob.go deleted file mode 100644 index 5b666a9d66..0000000000 --- a/pkg/client/clientset/versioned/typed/tensorflow/v1alpha2/fake/fake_tfjob.go +++ /dev/null @@ -1,126 +0,0 @@ -/* -Copyright 2018 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -package fake - -import ( - v1alpha2 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha2" - v1 "k8s.io/apimachinery/pkg/apis/meta/v1" - labels "k8s.io/apimachinery/pkg/labels" - schema "k8s.io/apimachinery/pkg/runtime/schema" - types "k8s.io/apimachinery/pkg/types" - watch "k8s.io/apimachinery/pkg/watch" - testing "k8s.io/client-go/testing" -) - -// FakeTFJobs implements TFJobInterface -type FakeTFJobs struct { - Fake *FakeTensorflowV1alpha2 - ns string -} - -var tfjobsResource = schema.GroupVersionResource{Group: "tensorflow", Version: "v1alpha2", Resource: "tfjobs"} - -var tfjobsKind = schema.GroupVersionKind{Group: "tensorflow", Version: "v1alpha2", Kind: "TFJob"} - -// Get takes name of the tFJob, and returns the corresponding tFJob object, and an error if there is any. -func (c *FakeTFJobs) Get(name string, options v1.GetOptions) (result *v1alpha2.TFJob, err error) { - obj, err := c.Fake. - Invokes(testing.NewGetAction(tfjobsResource, c.ns, name), &v1alpha2.TFJob{}) - - if obj == nil { - return nil, err - } - return obj.(*v1alpha2.TFJob), err -} - -// List takes label and field selectors, and returns the list of TFJobs that match those selectors. -func (c *FakeTFJobs) List(opts v1.ListOptions) (result *v1alpha2.TFJobList, err error) { - obj, err := c.Fake. - Invokes(testing.NewListAction(tfjobsResource, tfjobsKind, c.ns, opts), &v1alpha2.TFJobList{}) - - if obj == nil { - return nil, err - } - - label, _, _ := testing.ExtractFromListOptions(opts) - if label == nil { - label = labels.Everything() - } - list := &v1alpha2.TFJobList{} - for _, item := range obj.(*v1alpha2.TFJobList).Items { - if label.Matches(labels.Set(item.Labels)) { - list.Items = append(list.Items, item) - } - } - return list, err -} - -// Watch returns a watch.Interface that watches the requested tFJobs. -func (c *FakeTFJobs) Watch(opts v1.ListOptions) (watch.Interface, error) { - return c.Fake. 
- InvokesWatch(testing.NewWatchAction(tfjobsResource, c.ns, opts)) - -} - -// Create takes the representation of a tFJob and creates it. Returns the server's representation of the tFJob, and an error, if there is any. -func (c *FakeTFJobs) Create(tFJob *v1alpha2.TFJob) (result *v1alpha2.TFJob, err error) { - obj, err := c.Fake. - Invokes(testing.NewCreateAction(tfjobsResource, c.ns, tFJob), &v1alpha2.TFJob{}) - - if obj == nil { - return nil, err - } - return obj.(*v1alpha2.TFJob), err -} - -// Update takes the representation of a tFJob and updates it. Returns the server's representation of the tFJob, and an error, if there is any. -func (c *FakeTFJobs) Update(tFJob *v1alpha2.TFJob) (result *v1alpha2.TFJob, err error) { - obj, err := c.Fake. - Invokes(testing.NewUpdateAction(tfjobsResource, c.ns, tFJob), &v1alpha2.TFJob{}) - - if obj == nil { - return nil, err - } - return obj.(*v1alpha2.TFJob), err -} - -// Delete takes name of the tFJob and deletes it. Returns an error if one occurs. -func (c *FakeTFJobs) Delete(name string, options *v1.DeleteOptions) error { - _, err := c.Fake. - Invokes(testing.NewDeleteAction(tfjobsResource, c.ns, name), &v1alpha2.TFJob{}) - - return err -} - -// DeleteCollection deletes a collection of objects. -func (c *FakeTFJobs) DeleteCollection(options *v1.DeleteOptions, listOptions v1.ListOptions) error { - action := testing.NewDeleteCollectionAction(tfjobsResource, c.ns, listOptions) - - _, err := c.Fake.Invokes(action, &v1alpha2.TFJobList{}) - return err -} - -// Patch applies the patch and returns the patched tFJob. -func (c *FakeTFJobs) Patch(name string, pt types.PatchType, data []byte, subresources ...string) (result *v1alpha2.TFJob, err error) { - obj, err := c.Fake. 
- Invokes(testing.NewPatchSubresourceAction(tfjobsResource, c.ns, name, data, subresources...), &v1alpha2.TFJob{}) - - if obj == nil { - return nil, err - } - return obj.(*v1alpha2.TFJob), err -} diff --git a/pkg/client/clientset/versioned/typed/tensorflow/v1alpha2/generated_expansion.go b/pkg/client/clientset/versioned/typed/tensorflow/v1alpha2/generated_expansion.go deleted file mode 100644 index 7e99eae6c8..0000000000 --- a/pkg/client/clientset/versioned/typed/tensorflow/v1alpha2/generated_expansion.go +++ /dev/null @@ -1,19 +0,0 @@ -/* -Copyright 2018 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package v1alpha2 - -type TFJobExpansion interface{} diff --git a/pkg/client/clientset/versioned/typed/tensorflow/v1alpha2/tensorflow_client.go b/pkg/client/clientset/versioned/typed/tensorflow/v1alpha2/tensorflow_client.go deleted file mode 100644 index 0535a3b66e..0000000000 --- a/pkg/client/clientset/versioned/typed/tensorflow/v1alpha2/tensorflow_client.go +++ /dev/null @@ -1,88 +0,0 @@ -/* -Copyright 2018 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. -*/ - -package v1alpha2 - -import ( - v1alpha2 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha2" - "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned/scheme" - serializer "k8s.io/apimachinery/pkg/runtime/serializer" - rest "k8s.io/client-go/rest" -) - -type TensorflowV1alpha2Interface interface { - RESTClient() rest.Interface - TFJobsGetter -} - -// TensorflowV1alpha2Client is used to interact with features provided by the tensorflow group. -type TensorflowV1alpha2Client struct { - restClient rest.Interface -} - -func (c *TensorflowV1alpha2Client) TFJobs(namespace string) TFJobInterface { - return newTFJobs(c, namespace) -} - -// NewForConfig creates a new TensorflowV1alpha2Client for the given config. -func NewForConfig(c *rest.Config) (*TensorflowV1alpha2Client, error) { - config := *c - if err := setConfigDefaults(&config); err != nil { - return nil, err - } - client, err := rest.RESTClientFor(&config) - if err != nil { - return nil, err - } - return &TensorflowV1alpha2Client{client}, nil -} - -// NewForConfigOrDie creates a new TensorflowV1alpha2Client for the given config and -// panics if there is an error in the config. -func NewForConfigOrDie(c *rest.Config) *TensorflowV1alpha2Client { - client, err := NewForConfig(c) - if err != nil { - panic(err) - } - return client -} - -// New creates a new TensorflowV1alpha2Client for the given RESTClient. 
-func New(c rest.Interface) *TensorflowV1alpha2Client { - return &TensorflowV1alpha2Client{c} -} - -func setConfigDefaults(config *rest.Config) error { - gv := v1alpha2.SchemeGroupVersion - config.GroupVersion = &gv - config.APIPath = "/apis" - config.NegotiatedSerializer = serializer.DirectCodecFactory{CodecFactory: scheme.Codecs} - - if config.UserAgent == "" { - config.UserAgent = rest.DefaultKubernetesUserAgent() - } - - return nil -} - -// RESTClient returns a RESTClient that is used to communicate -// with API server by this client implementation. -func (c *TensorflowV1alpha2Client) RESTClient() rest.Interface { - if c == nil { - return nil - } - return c.restClient -} diff --git a/pkg/client/clientset/versioned/typed/tensorflow/v1alpha2/tfjob.go b/pkg/client/clientset/versioned/typed/tensorflow/v1alpha2/tfjob.go deleted file mode 100644 index 9f66609712..0000000000 --- a/pkg/client/clientset/versioned/typed/tensorflow/v1alpha2/tfjob.go +++ /dev/null @@ -1,155 +0,0 @@ -/* -Copyright 2018 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package v1alpha2 - -import ( - v1alpha2 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha2" - scheme "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned/scheme" - v1 "k8s.io/apimachinery/pkg/apis/meta/v1" - types "k8s.io/apimachinery/pkg/types" - watch "k8s.io/apimachinery/pkg/watch" - rest "k8s.io/client-go/rest" -) - -// TFJobsGetter has a method to return a TFJobInterface. 
-// A group's client should implement this interface. -type TFJobsGetter interface { - TFJobs(namespace string) TFJobInterface -} - -// TFJobInterface has methods to work with TFJob resources. -type TFJobInterface interface { - Create(*v1alpha2.TFJob) (*v1alpha2.TFJob, error) - Update(*v1alpha2.TFJob) (*v1alpha2.TFJob, error) - Delete(name string, options *v1.DeleteOptions) error - DeleteCollection(options *v1.DeleteOptions, listOptions v1.ListOptions) error - Get(name string, options v1.GetOptions) (*v1alpha2.TFJob, error) - List(opts v1.ListOptions) (*v1alpha2.TFJobList, error) - Watch(opts v1.ListOptions) (watch.Interface, error) - Patch(name string, pt types.PatchType, data []byte, subresources ...string) (result *v1alpha2.TFJob, err error) - TFJobExpansion -} - -// tFJobs implements TFJobInterface -type tFJobs struct { - client rest.Interface - ns string -} - -// newTFJobs returns a TFJobs -func newTFJobs(c *TensorflowV1alpha2Client, namespace string) *tFJobs { - return &tFJobs{ - client: c.RESTClient(), - ns: namespace, - } -} - -// Get takes name of the tFJob, and returns the corresponding tFJob object, and an error if there is any. -func (c *tFJobs) Get(name string, options v1.GetOptions) (result *v1alpha2.TFJob, err error) { - result = &v1alpha2.TFJob{} - err = c.client.Get(). - Namespace(c.ns). - Resource("tfjobs"). - Name(name). - VersionedParams(&options, scheme.ParameterCodec). - Do(). - Into(result) - return -} - -// List takes label and field selectors, and returns the list of TFJobs that match those selectors. -func (c *tFJobs) List(opts v1.ListOptions) (result *v1alpha2.TFJobList, err error) { - result = &v1alpha2.TFJobList{} - err = c.client.Get(). - Namespace(c.ns). - Resource("tfjobs"). - VersionedParams(&opts, scheme.ParameterCodec). - Do(). - Into(result) - return -} - -// Watch returns a watch.Interface that watches the requested tFJobs. 
-func (c *tFJobs) Watch(opts v1.ListOptions) (watch.Interface, error) { - opts.Watch = true - return c.client.Get(). - Namespace(c.ns). - Resource("tfjobs"). - VersionedParams(&opts, scheme.ParameterCodec). - Watch() -} - -// Create takes the representation of a tFJob and creates it. Returns the server's representation of the tFJob, and an error, if there is any. -func (c *tFJobs) Create(tFJob *v1alpha2.TFJob) (result *v1alpha2.TFJob, err error) { - result = &v1alpha2.TFJob{} - err = c.client.Post(). - Namespace(c.ns). - Resource("tfjobs"). - Body(tFJob). - Do(). - Into(result) - return -} - -// Update takes the representation of a tFJob and updates it. Returns the server's representation of the tFJob, and an error, if there is any. -func (c *tFJobs) Update(tFJob *v1alpha2.TFJob) (result *v1alpha2.TFJob, err error) { - result = &v1alpha2.TFJob{} - err = c.client.Put(). - Namespace(c.ns). - Resource("tfjobs"). - Name(tFJob.Name). - Body(tFJob). - Do(). - Into(result) - return -} - -// Delete takes name of the tFJob and deletes it. Returns an error if one occurs. -func (c *tFJobs) Delete(name string, options *v1.DeleteOptions) error { - return c.client.Delete(). - Namespace(c.ns). - Resource("tfjobs"). - Name(name). - Body(options). - Do(). - Error() -} - -// DeleteCollection deletes a collection of objects. -func (c *tFJobs) DeleteCollection(options *v1.DeleteOptions, listOptions v1.ListOptions) error { - return c.client.Delete(). - Namespace(c.ns). - Resource("tfjobs"). - VersionedParams(&listOptions, scheme.ParameterCodec). - Body(options). - Do(). - Error() -} - -// Patch applies the patch and returns the patched tFJob. -func (c *tFJobs) Patch(name string, pt types.PatchType, data []byte, subresources ...string) (result *v1alpha2.TFJob, err error) { - result = &v1alpha2.TFJob{} - err = c.client.Patch(pt). - Namespace(c.ns). - Resource("tfjobs"). - SubResource(subresources...). - Name(name). - Body(data). - Do(). 
- Into(result) - return -} diff --git a/pkg/client/informers/externalversions/tensorflow/interface.go b/pkg/client/informers/externalversions/tensorflow/interface.go deleted file mode 100644 index c6dd1e50d0..0000000000 --- a/pkg/client/informers/externalversions/tensorflow/interface.go +++ /dev/null @@ -1,44 +0,0 @@ -/* -Copyright 2018 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -// This file was automatically generated by informer-gen - -package tensorflow - -import ( - internalinterfaces "github.com/kubeflow/tf-operator/pkg/client/informers/externalversions/internalinterfaces" - v1alpha2 "github.com/kubeflow/tf-operator/pkg/client/informers/externalversions/tensorflow/v1alpha2" -) - -// Interface provides access to each of this group's versions. -type Interface interface { - // V1alpha2 provides access to shared informers for resources in V1alpha2. - V1alpha2() v1alpha2.Interface -} - -type group struct { - internalinterfaces.SharedInformerFactory -} - -// New returns a new Interface. -func New(f internalinterfaces.SharedInformerFactory) Interface { - return &group{f} -} - -// V1alpha2 returns a new v1alpha2.Interface. 
-func (g *group) V1alpha2() v1alpha2.Interface { - return v1alpha2.New(g.SharedInformerFactory) -} diff --git a/pkg/client/informers/externalversions/tensorflow/v1alpha2/interface.go b/pkg/client/informers/externalversions/tensorflow/v1alpha2/interface.go deleted file mode 100644 index f137397993..0000000000 --- a/pkg/client/informers/externalversions/tensorflow/v1alpha2/interface.go +++ /dev/null @@ -1,43 +0,0 @@ -/* -Copyright 2018 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -// This file was automatically generated by informer-gen - -package v1alpha2 - -import ( - internalinterfaces "github.com/kubeflow/tf-operator/pkg/client/informers/externalversions/internalinterfaces" -) - -// Interface provides access to all the informers in this group version. -type Interface interface { - // TFJobs returns a TFJobInformer. - TFJobs() TFJobInformer -} - -type version struct { - internalinterfaces.SharedInformerFactory -} - -// New returns a new Interface. -func New(f internalinterfaces.SharedInformerFactory) Interface { - return &version{f} -} - -// TFJobs returns a TFJobInformer. 
-func (v *version) TFJobs() TFJobInformer { - return &tFJobInformer{factory: v.SharedInformerFactory} -} diff --git a/pkg/client/informers/externalversions/tensorflow/v1alpha2/tfjob.go b/pkg/client/informers/externalversions/tensorflow/v1alpha2/tfjob.go deleted file mode 100644 index d0bdd3f9ed..0000000000 --- a/pkg/client/informers/externalversions/tensorflow/v1alpha2/tfjob.go +++ /dev/null @@ -1,73 +0,0 @@ -/* -Copyright 2018 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -// This file was automatically generated by informer-gen - -package v1alpha2 - -import ( - tensorflow_v1alpha2 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha2" - versioned "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned" - internalinterfaces "github.com/kubeflow/tf-operator/pkg/client/informers/externalversions/internalinterfaces" - v1alpha2 "github.com/kubeflow/tf-operator/pkg/client/listers/tensorflow/v1alpha2" - v1 "k8s.io/apimachinery/pkg/apis/meta/v1" - runtime "k8s.io/apimachinery/pkg/runtime" - watch "k8s.io/apimachinery/pkg/watch" - cache "k8s.io/client-go/tools/cache" - time "time" -) - -// TFJobInformer provides access to a shared informer and lister for -// TFJobs. -type TFJobInformer interface { - Informer() cache.SharedIndexInformer - Lister() v1alpha2.TFJobLister -} - -type tFJobInformer struct { - factory internalinterfaces.SharedInformerFactory -} - -// NewTFJobInformer constructs a new informer for TFJob type. 
-// Always prefer using an informer factory to get a shared informer instead of getting an independent -// one. This reduces memory footprint and number of connections to the server. -func NewTFJobInformer(client versioned.Interface, namespace string, resyncPeriod time.Duration, indexers cache.Indexers) cache.SharedIndexInformer { - return cache.NewSharedIndexInformer( - &cache.ListWatch{ - ListFunc: func(options v1.ListOptions) (runtime.Object, error) { - return client.TensorflowV1alpha2().TFJobs(namespace).List(options) - }, - WatchFunc: func(options v1.ListOptions) (watch.Interface, error) { - return client.TensorflowV1alpha2().TFJobs(namespace).Watch(options) - }, - }, - &tensorflow_v1alpha2.TFJob{}, - resyncPeriod, - indexers, - ) -} - -func defaultTFJobInformer(client versioned.Interface, resyncPeriod time.Duration) cache.SharedIndexInformer { - return NewTFJobInformer(client, v1.NamespaceAll, resyncPeriod, cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc}) -} - -func (f *tFJobInformer) Informer() cache.SharedIndexInformer { - return f.factory.InformerFor(&tensorflow_v1alpha2.TFJob{}, defaultTFJobInformer) -} - -func (f *tFJobInformer) Lister() v1alpha2.TFJobLister { - return v1alpha2.NewTFJobLister(f.Informer().GetIndexer()) -} diff --git a/pkg/client/listers/tensorflow/v1alpha2/expansion_generated.go b/pkg/client/listers/tensorflow/v1alpha2/expansion_generated.go deleted file mode 100644 index 6880fff70c..0000000000 --- a/pkg/client/listers/tensorflow/v1alpha2/expansion_generated.go +++ /dev/null @@ -1,27 +0,0 @@ -/* -Copyright 2018 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -// This file was automatically generated by lister-gen - -package v1alpha2 - -// TFJobListerExpansion allows custom methods to be added to -// TFJobLister. -type TFJobListerExpansion interface{} - -// TFJobNamespaceListerExpansion allows custom methods to be added to -// TFJobNamespaceLister. -type TFJobNamespaceListerExpansion interface{} diff --git a/pkg/client/listers/tensorflow/v1alpha2/tfjob.go b/pkg/client/listers/tensorflow/v1alpha2/tfjob.go deleted file mode 100644 index 601b03a7d8..0000000000 --- a/pkg/client/listers/tensorflow/v1alpha2/tfjob.go +++ /dev/null @@ -1,94 +0,0 @@ -/* -Copyright 2018 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -// This file was automatically generated by lister-gen - -package v1alpha2 - -import ( - v1alpha2 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha2" - "k8s.io/apimachinery/pkg/api/errors" - "k8s.io/apimachinery/pkg/labels" - "k8s.io/client-go/tools/cache" -) - -// TFJobLister helps list TFJobs. -type TFJobLister interface { - // List lists all TFJobs in the indexer. 
- List(selector labels.Selector) (ret []*v1alpha2.TFJob, err error) - // TFJobs returns an object that can list and get TFJobs. - TFJobs(namespace string) TFJobNamespaceLister - TFJobListerExpansion -} - -// tFJobLister implements the TFJobLister interface. -type tFJobLister struct { - indexer cache.Indexer -} - -// NewTFJobLister returns a new TFJobLister. -func NewTFJobLister(indexer cache.Indexer) TFJobLister { - return &tFJobLister{indexer: indexer} -} - -// List lists all TFJobs in the indexer. -func (s *tFJobLister) List(selector labels.Selector) (ret []*v1alpha2.TFJob, err error) { - err = cache.ListAll(s.indexer, selector, func(m interface{}) { - ret = append(ret, m.(*v1alpha2.TFJob)) - }) - return ret, err -} - -// TFJobs returns an object that can list and get TFJobs. -func (s *tFJobLister) TFJobs(namespace string) TFJobNamespaceLister { - return tFJobNamespaceLister{indexer: s.indexer, namespace: namespace} -} - -// TFJobNamespaceLister helps list and get TFJobs. -type TFJobNamespaceLister interface { - // List lists all TFJobs in the indexer for a given namespace. - List(selector labels.Selector) (ret []*v1alpha2.TFJob, err error) - // Get retrieves the TFJob from the indexer for a given namespace and name. - Get(name string) (*v1alpha2.TFJob, error) - TFJobNamespaceListerExpansion -} - -// tFJobNamespaceLister implements the TFJobNamespaceLister -// interface. -type tFJobNamespaceLister struct { - indexer cache.Indexer - namespace string -} - -// List lists all TFJobs in the indexer for a given namespace. -func (s tFJobNamespaceLister) List(selector labels.Selector) (ret []*v1alpha2.TFJob, err error) { - err = cache.ListAllByNamespace(s.indexer, s.namespace, selector, func(m interface{}) { - ret = append(ret, m.(*v1alpha2.TFJob)) - }) - return ret, err -} - -// Get retrieves the TFJob from the indexer for a given namespace and name. 
-func (s tFJobNamespaceLister) Get(name string) (*v1alpha2.TFJob, error) { - obj, exists, err := s.indexer.GetByKey(s.namespace + "/" + name) - if err != nil { - return nil, err - } - if !exists { - return nil, errors.NewNotFound(v1alpha2.Resource("tfjob"), name) - } - return obj.(*v1alpha2.TFJob), nil -} From 8d3296e9d639aa1887d18311951d1d02ade19472 Mon Sep 17 00:00:00 2001 From: Penghao Cen Date: Mon, 19 Mar 2018 15:56:09 +0800 Subject: [PATCH 10/24] Make RestartPolicy a property of the ReplicaSpec (#473) --- pkg/apis/tensorflow/v1alpha2/types.go | 28 +++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/pkg/apis/tensorflow/v1alpha2/types.go b/pkg/apis/tensorflow/v1alpha2/types.go index b45267c8b8..e498ae399a 100644 --- a/pkg/apis/tensorflow/v1alpha2/types.go +++ b/pkg/apis/tensorflow/v1alpha2/types.go @@ -51,6 +51,20 @@ type TFJobSpec struct { // "Worker": TFReplicaSpec, // } TFReplicaSpecs map[TFReplicaType]*TFReplicaSpec `json:"tfReplicaSpecs"` +} + +// TFReplicaSpec is a description of the TFReplica +type TFReplicaSpec struct { + // Replicas is the desired number of replicas of the given template. + // If unspecified, defaults to 1. + Replicas *int32 `json:"replicas,omitempty"` + + // Template is the object that describes the pod that + // will be created for this TFReplica. + // We use RestartPolicy in PodTemplateSpec + // to describe how the containers within the pod should be restarted. + // Please set this restart policy carefully according to your code. + Template v1.PodTemplateSpec `json:"template,omitempty"` // Restart policy for all TFReplicas within the TFJob. // One of Always, OnFailure, Never and ExitCode. @@ -77,20 +91,6 @@ const ( RestartPolicyExitCode RestartPolicy = "ExitCode" ) -// TFReplicaSpec is a description of the TFReplica -type TFReplicaSpec struct { - // Replicas is the desired number of replicas of the given template. - // If unspecified, defaults to 1. 
- Replicas *int32 `json:"replicas,omitempty"` - - // Template is the object that describes the pod that - // will be created for this TFReplica. - // We use RestartPolicy in PodTemplateSpec - // to describe how the containers within the pod should be restarted. - // Please set this restart policy carefully according to your code. - Template v1.PodTemplateSpec `json:"template,omitempty"` -} - // TFReplicaType is the type for TFReplica. type TFReplicaType string From a6d5b27a7b94cfcdc840c0f39cfde491243705a1 Mon Sep 17 00:00:00 2001 From: Ce Gao Date: Mon, 19 Mar 2018 17:52:34 +0800 Subject: [PATCH 11/24] test: Add unit test for controller (#467) * utils: Add FakeServiceControl Signed-off-by: Ce Gao * test: Add basic test case Signed-off-by: Ce Gao * travis: Ignore vendored code Signed-off-by: Ce Gao * travis: Ignore vendered code Signed-off-by: Ce Gao --- .travis.yml | 2 +- pkg/controller/controller_test.go | 224 +++++++++++++++++++++++++++++ pkg/controller/controller_utils.go | 52 +++++++ 3 files changed, 277 insertions(+), 1 deletion(-) create mode 100644 pkg/controller/controller_test.go diff --git a/.travis.yml b/.travis.yml index f91e311664..e9e0d53aa5 100644 --- a/.travis.yml +++ b/.travis.yml @@ -29,4 +29,4 @@ script: # For now though we just run all tests in pkg. # And we can not use ** because goveralls uses filepath.Match # to match ignore files and it does not support it. - - goveralls -service=travis-ci -v -package ./pkg/... -ignore "pkg/client/*/*.go,pkg/client/*/*/*.go,pkg/client/*/*/*/*.go,pkg/client/*/*/*/*/*.go,pkg/client/*/*/*/*/*/*.go,pkg/client/*/*/*/*/*/*/*.go,pkg/apis/tensorflow/*/zz_generated.*.go" + - goveralls -service=travis-ci -v -package ./pkg/... 
-ignore "pkg/client/*/*.go,pkg/client/*/*/*.go,pkg/client/*/*/*/*.go,pkg/client/*/*/*/*/*.go,pkg/client/*/*/*/*/*/*.go,pkg/client/*/*/*/*/*/*/*.go,pkg/apis/tensorflow/*/zz_generated.*.go,pkg/controller/controller_utils.go,pkg/controller/controller_ref_manager.go" diff --git a/pkg/controller/controller_test.go b/pkg/controller/controller_test.go new file mode 100644 index 0000000000..7c8f3dfa56 --- /dev/null +++ b/pkg/controller/controller_test.go @@ -0,0 +1,224 @@ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package controller provides a Kubernetes controller for a TFJob resource. 
+ +package controller + +import ( + "fmt" + "testing" + + "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/rand" + kubeinformers "k8s.io/client-go/informers" + kubeclientset "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" + "k8s.io/client-go/tools/cache" + + tfv1alpha2 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha2" + tfjobclientset "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned" + tfjobinformers "github.com/kubeflow/tf-operator/pkg/client/informers/externalversions" +) + +var alwaysReady = func() bool { return true } + +func newTFJobControllerFromClient(kubeClientSet kubeclientset.Interface, tfJobClientSet tfjobclientset.Interface, resyncPeriod ResyncPeriodFunc) (*TFJobController, kubeinformers.SharedInformerFactory, tfjobinformers.SharedInformerFactory) { + kubeInformerFactory := kubeinformers.NewSharedInformerFactory(kubeClientSet, resyncPeriod()) + tfJobInformerFactory := tfjobinformers.NewSharedInformerFactory(tfJobClientSet, resyncPeriod()) + + controller := NewTFJobController(kubeClientSet, tfJobClientSet, kubeInformerFactory, tfJobInformerFactory) + controller.podControl = &FakePodControl{} + // TODO(gaocegege): Add FakeServiceControl. 
+ controller.serviceControl = &FakeServiceControl{} + return controller, kubeInformerFactory, tfJobInformerFactory +} + +func newTFJob(worker, ps int) *tfv1alpha2.TFJob { + tfJob := &tfv1alpha2.TFJob{ + ObjectMeta: metav1.ObjectMeta{ + Name: "foobar", + Namespace: metav1.NamespaceDefault, + }, + Spec: tfv1alpha2.TFJobSpec{ + TFReplicaSpecs: make(map[tfv1alpha2.TFReplicaType]*tfv1alpha2.TFReplicaSpec), + }, + } + + if worker > 0 { + worker := int32(worker) + workerReplicaSpec := &tfv1alpha2.TFReplicaSpec{ + Replicas: &worker, + Template: v1.PodTemplateSpec{ + Spec: v1.PodSpec{ + Containers: []v1.Container{ + v1.Container{ + Image: "foo/bar", + }, + }, + }, + }, + } + tfJob.Spec.TFReplicaSpecs[tfv1alpha2.TFReplicaTypeWorker] = workerReplicaSpec + } + + if ps > 0 { + ps := int32(ps) + psReplicaSpec := &tfv1alpha2.TFReplicaSpec{ + Replicas: &ps, + Template: v1.PodTemplateSpec{ + Spec: v1.PodSpec{ + Containers: []v1.Container{ + v1.Container{ + Image: "foo/bar", + }, + }, + }, + }, + } + tfJob.Spec.TFReplicaSpecs[tfv1alpha2.TFReplicaTypePS] = psReplicaSpec + } + return tfJob +} + +func getKey(tfJob *tfv1alpha2.TFJob, t *testing.T) string { + if key, err := KeyFunc(tfJob); err != nil { + t.Errorf("Unexpected error getting key for job %v: %v", tfJob.Name, err) + return "" + } else { + return key + } +} + +func newPod(name string, tfJob *tfv1alpha2.TFJob) *v1.Pod { + tfjobKey, err := KeyFunc(tfJob) + if err != nil { + fmt.Errorf("Couldn't get key for tfjob object %#v: %v", tfJob, err) + } + + return &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Labels: genLabels(tfjobKey), + Namespace: tfJob.Namespace, + OwnerReferences: []metav1.OwnerReference{*metav1.NewControllerRef(tfJob, controllerKind)}, + }, + } +} + +// create count pods with the given phase for the given tfJob +func newPodList(count int32, status v1.PodPhase, tfJob *tfv1alpha2.TFJob) []v1.Pod { + pods := []v1.Pod{} + for i := int32(0); i < count; i++ { + newPod := newPod(fmt.Sprintf("pod-%v", 
rand.String(10)), tfJob) + newPod.Status = v1.PodStatus{Phase: status} + pods = append(pods, *newPod) + } + return pods +} + +func setPodsStatuses(podIndexer cache.Indexer, tfJob *tfv1alpha2.TFJob, pendingPods, activePods, succeededPods, failedPods int32) { + for _, pod := range newPodList(pendingPods, v1.PodPending, tfJob) { + podIndexer.Add(&pod) + } + for _, pod := range newPodList(activePods, v1.PodRunning, tfJob) { + podIndexer.Add(&pod) + } + for _, pod := range newPodList(succeededPods, v1.PodSucceeded, tfJob) { + podIndexer.Add(&pod) + } + for _, pod := range newPodList(failedPods, v1.PodFailed, tfJob) { + podIndexer.Add(&pod) + } +} + +func TestNormalPath(t *testing.T) { + testCases := map[string]struct { + worker int + ps int + + // pod setup + podControllerError error + jobKeyForget bool + pendingPods int32 + activePods int32 + succeededPods int32 + failedPods int32 + + // TODO(gaocegege): Add service setup. + + // expectations + expectedCreations int32 + expectedDeletions int32 + expectedActive int32 + expectedSucceeded int32 + expectedFailed int32 + // TODO(gaocegege): Add condition check. + // expectedCondition *tfv1alpha2.TFJobConditionType + // expectedConditionReason string + }{ + "Local TFJob created": { + 1, 0, + nil, true, 0, 0, 0, 0, + 1, 0, 1, 0, 0, + }, + } + + for name, tc := range testCases { + // Prepare the clientset and controller for the test. 
+ kubeClientSet := kubeclientset.NewForConfigOrDie(&rest.Config{ + Host: "", + ContentConfig: rest.ContentConfig{ + GroupVersion: &v1.SchemeGroupVersion, + }, + }, + ) + tfJobClientSet := tfjobclientset.NewForConfigOrDie(&rest.Config{ + Host: "", + ContentConfig: rest.ContentConfig{ + GroupVersion: &tfv1alpha2.SchemeGroupVersion, + }, + }, + ) + controller, kubeInformerFactory, tfJobInformerFactory := newTFJobControllerFromClient(kubeClientSet, tfJobClientSet, NoResyncPeriodFunc) + controller.tfJobListerSynced = alwaysReady + controller.podListerSynced = alwaysReady + controller.serviceListerSynced = alwaysReady + + // Run the test logic. + tfJob := newTFJob(tc.worker, tc.ps) + tfJobInformerFactory.Kubeflow().V1alpha2().TFJobs().Informer().GetIndexer().Add(tfJob) + podIndexer := kubeInformerFactory.Core().V1().Pods().Informer().GetIndexer() + setPodsStatuses(podIndexer, tfJob, tc.pendingPods, tc.activePods, tc.succeededPods, tc.failedPods) + + forget, err := controller.syncTFJob(getKey(tfJob, t)) + // We need requeue syncJob task if podController error + if tc.podControllerError != nil { + if err == nil { + t.Errorf("%s: Syncing jobs would return error when podController exception", name) + } + } else { + if err != nil { + t.Errorf("%s: unexpected error when syncing jobs %v", name, err) + } + } + if forget != tc.jobKeyForget { + t.Errorf("%s: unexpected forget value. Expected %v, saw %v\n", name, tc.jobKeyForget, forget) + } + if int32(len(controller.podControl.(*FakePodControl).Templates)) != tc.expectedCreations { + t.Errorf("%s: unexpected number of creates. 
Expected %d, saw %d\n", name, tc.expectedCreations, len(controller.podControl.(*FakePodControl).Templates)) + } + } +} diff --git a/pkg/controller/controller_utils.go b/pkg/controller/controller_utils.go index 1f541180e8..238b1c4118 100644 --- a/pkg/controller/controller_utils.go +++ b/pkg/controller/controller_utils.go @@ -717,3 +717,55 @@ func (r RealServiceControl) createServices(namespace string, service *v1.Service return nil } + +type FakeServiceControl struct { + sync.Mutex + Templates []v1.Service + ControllerRefs []metav1.OwnerReference + DeletePodName []string + Patches [][]byte + Err error + CreateLimit int + CreateCallCount int +} + +var _ ServiceControlInterface = &FakeServiceControl{} + +func (f *FakeServiceControl) PatchService(namespace, name string, data []byte) error { + f.Lock() + defer f.Unlock() + f.Patches = append(f.Patches, data) + if f.Err != nil { + return f.Err + } + return nil +} + +func (f *FakeServiceControl) CreateServices(namespace string, service *v1.Service, object runtime.Object) error { + f.Lock() + defer f.Unlock() + f.CreateCallCount++ + if f.CreateLimit != 0 && f.CreateCallCount > f.CreateLimit { + return fmt.Errorf("Not creating service, limit %d already reached (create call %d)", f.CreateLimit, f.CreateCallCount) + } + f.Templates = append(f.Templates, *service) + if f.Err != nil { + return f.Err + } + return nil +} + +func (f *FakeServiceControl) CreateServicesWithControllerRef(namespace string, service *v1.Service, object runtime.Object, controllerRef *metav1.OwnerReference) error { + f.Lock() + defer f.Unlock() + f.CreateCallCount++ + if f.CreateLimit != 0 && f.CreateCallCount > f.CreateLimit { + return fmt.Errorf("Not creating service, limit %d already reached (create call %d)", f.CreateLimit, f.CreateCallCount) + } + f.Templates = append(f.Templates, *service) + f.ControllerRefs = append(f.ControllerRefs, *controllerRef) + if f.Err != nil { + return f.Err + } + return nil +} From 99825ee87f77b3611e2199d59fb7b53a735d1fb2 
Mon Sep 17 00:00:00 2001 From: Penghao Cen Date: Mon, 19 Mar 2018 18:10:03 +0800 Subject: [PATCH 12/24] Update tfjob status (#472) --- pkg/controller/controller.go | 7 ++--- pkg/controller/controller_pod.go | 39 +++++++++++++++++++++++++++- pkg/controller/controller_service.go | 9 +++---- 3 files changed, 46 insertions(+), 9 deletions(-) diff --git a/pkg/controller/controller.go b/pkg/controller/controller.go index c3db9c398d..b64ba59101 100644 --- a/pkg/controller/controller.go +++ b/pkg/controller/controller.go @@ -329,7 +329,7 @@ func (tc *TFJobController) syncTFJob(key string) (bool, error) { return false, err } - tfjob, err := tc.tfJobLister.TFJobs(namespace).Get(name) + sharedtfjob, err := tc.tfJobLister.TFJobs(namespace).Get(name) if err != nil { if errors.IsNotFound(err) { log.Infof("TFJob has been deleted: %v", key) @@ -339,6 +339,7 @@ func (tc *TFJobController) syncTFJob(key string) (bool, error) { return false, err } + tfjob := sharedtfjob.DeepCopy() tfjobNeedsSync := tc.satisfiedExpectations(tfjob) var reconcileTFJobsErr error @@ -434,8 +435,8 @@ func (tc *TFJobController) updateTFJob(old, cur interface{}) { } func (tc *TFJobController) updateTFJobStatus(tfjob *tfv1alpha2.TFJob) error { - // TODO - return nil + _, err := tc.tfJobClientSet.KubeflowV1alpha2().TFJobs(tfjob.Namespace).Update(tfjob) + return err } // resolveControllerRef returns the tfjob referenced by a ControllerRef, diff --git a/pkg/controller/controller_pod.go b/pkg/controller/controller_pod.go index 7c576acd96..4a87a9f019 100644 --- a/pkg/controller/controller_pod.go +++ b/pkg/controller/controller_pod.go @@ -50,6 +50,8 @@ func (tc *TFJobController) reconcilePods( // Get active pods for this TFReplicaType. 
activePods := filterActivePodsForTFReplicaType(pods, rt) + succeeded, failed := getPodStatus(pods) + diff := len(activePods) - int(*(spec.Replicas)) if diff < 0 { @@ -57,7 +59,7 @@ func (tc *TFJobController) reconcilePods( diffIndexes := getDiffPodIndexes(activePods, *spec.Replicas) if diff+len(diffIndexes) != 0 { // This should never happened. - return fmt.Errorf("diff is not equal to length of diffIndexes") + return fmt.Errorf("pods diff(%d) is not equal to length(%d) of diffIndexes", diff, len(diffIndexes)) } expectationPodsKey := genExpectationPodsKey(tfjobKey, rt) @@ -119,6 +121,23 @@ func (tc *TFJobController) reconcilePods( // TODO(CPH): Need to delete pods. } + if tfjob.Status.TFReplicaStatuses == nil { + tfjob.Status.TFReplicaStatuses = make(map[tfv1alpha2.TFReplicaType]*tfv1alpha2.TFReplicaStatus) + } + + if _, ok := tfjob.Status.TFReplicaStatuses[rtype]; !ok { + tfjob.Status.TFReplicaStatuses[rtype] = &tfv1alpha2.TFReplicaStatus{} + } + + tfjob.Status.TFReplicaStatuses[rtype].Active = int32(len(activePods)) + tfjob.Status.TFReplicaStatuses[rtype].Succeeded = succeeded + tfjob.Status.TFReplicaStatuses[rtype].Failed = failed + + // TODO(CPH): Add check here, no need to update the tfjob if the status hasn't changed since last time. + if err := tc.updateTFJobStatus(tfjob); err != nil { + return err + } + return nil } @@ -277,3 +296,21 @@ func (tc *TFJobController) updatePod(old, cur interface{}) { func (tc *TFJobController) deletePod(obj interface{}) { // TODO(CPH): handle this gracefully. } + +// getPodStatus returns no of succeeded and failed pods running a job +func getPodStatus(pods []*v1.Pod) (succeeded, failed int32) { + succeeded = int32(filterPods(pods, v1.PodSucceeded)) + failed = int32(filterPods(pods, v1.PodFailed)) + return +} + +// filterPods returns pods based on their phase. 
+func filterPods(pods []*v1.Pod, phase v1.PodPhase) int { + result := 0 + for i := range pods { + if phase == pods[i].Status.Phase { + result++ + } + } + return result +} diff --git a/pkg/controller/controller_service.go b/pkg/controller/controller_service.go index 31c9931386..33aa3929e6 100644 --- a/pkg/controller/controller_service.go +++ b/pkg/controller/controller_service.go @@ -57,7 +57,7 @@ func (tc *TFJobController) reconcileServices( diffIndexes := getDiffServiceIndexes(activeServices, *spec.Replicas) if diff+len(diffIndexes) != 0 { // This should never happened. - return fmt.Errorf("diff is not equal to length of diffIndexes") + return fmt.Errorf("services diff(%d) is not equal to length(%d) of diffIndexes", diff, len(diffIndexes)) } expectationServicesKey := genExpectationServicesKey(tfjobKey, rt) @@ -75,10 +75,6 @@ func (tc *TFJobController) reconcileServices( labels[tfReplicaIndexLabel] = index service := &v1.Service{ - ObjectMeta: metav1.ObjectMeta{ - Name: genGeneralName(tfjobKey, rt, index), - Labels: labels, - }, Spec: v1.ServiceSpec{ Selector: labels, Ports: []v1.ServicePort{ @@ -90,6 +86,9 @@ func (tc *TFJobController) reconcileServices( }, } + service.Name = genGeneralName(tfjobKey, rt, index) + service.Labels = labels + err := tc.serviceControl.CreateServicesWithControllerRef(tfjob.Namespace, service, tfjob, controllerRef) if err != nil && errors.IsTimeout(err) { // Service is created but its initialization has timed out. 
From 445733de7a97c4f0cbd6d491dd5f9153679945dd Mon Sep 17 00:00:00 2001 From: Ce Gao Date: Tue, 20 Mar 2018 12:02:55 +0800 Subject: [PATCH 13/24] controller: Update status in time (#476) * controller: Fix the status outdate problem Signed-off-by: Ce Gao * test: Add check for status update Signed-off-by: Ce Gao * test: Remove call for KeyFunc Signed-off-by: Ce Gao * pod: Add comment and remove debug statements Signed-off-by: Ce Gao --- pkg/controller/controller.go | 3 + pkg/controller/controller_pod.go | 9 +-- pkg/controller/controller_test.go | 106 ++++++++++++++++++++++++++---- 3 files changed, 103 insertions(+), 15 deletions(-) diff --git a/pkg/controller/controller.go b/pkg/controller/controller.go index b64ba59101..83b724aa1f 100644 --- a/pkg/controller/controller.go +++ b/pkg/controller/controller.go @@ -104,6 +104,8 @@ type TFJobController struct { // To allow injection of syncTFJob for testing. syncHandler func(tfJobKey string) (bool, error) + updateStatusHandler func(tfjob *tfv1alpha2.TFJob) error + // Listers for TFJob, Pod and Service // tfJobLister can list/get tfjobs from the shared informer's store. tfJobLister tfjoblisters.TFJobLister @@ -191,6 +193,7 @@ func NewTFJobController( // Set sync handler. tc.syncHandler = tc.syncTFJob + tc.updateStatusHandler = tc.updateTFJobStatus // Create tfjob informer. tfJobInformer := tfJobInformerFactory.Kubeflow().V1alpha2().TFJobs() diff --git a/pkg/controller/controller_pod.go b/pkg/controller/controller_pod.go index 4a87a9f019..5acdab7700 100644 --- a/pkg/controller/controller_pod.go +++ b/pkg/controller/controller_pod.go @@ -92,7 +92,6 @@ func (tc *TFJobController) reconcilePods( if tfConfigStr == "" { return nil } - // Add TF_CONFIG environment variable. for _, c := range pTemplate.Spec.Containers { if len(c.Env) == 0 { @@ -114,8 +113,9 @@ func (tc *TFJobController) reconcilePods( // receive any update, and the controller will create a new // pod when the expectation expires. 
return nil + } else if err != nil { + return err } - return err } } else if diff > 0 { // TODO(CPH): Need to delete pods. @@ -129,12 +129,13 @@ func (tc *TFJobController) reconcilePods( tfjob.Status.TFReplicaStatuses[rtype] = &tfv1alpha2.TFReplicaStatus{} } - tfjob.Status.TFReplicaStatuses[rtype].Active = int32(len(activePods)) + // Update the active status since we have created -diff pods during the loop. + tfjob.Status.TFReplicaStatuses[rtype].Active -= int32(diff) tfjob.Status.TFReplicaStatuses[rtype].Succeeded = succeeded tfjob.Status.TFReplicaStatuses[rtype].Failed = failed // TODO(CPH): Add check here, no need to update the tfjob if the status hasn't changed since last time. - if err := tc.updateTFJobStatus(tfjob); err != nil { + if err := tc.updateStatusHandler(tfjob); err != nil { return err } diff --git a/pkg/controller/controller_test.go b/pkg/controller/controller_test.go index 7c8f3dfa56..411913c626 100644 --- a/pkg/controller/controller_test.go +++ b/pkg/controller/controller_test.go @@ -103,9 +103,11 @@ func getKey(tfJob *tfv1alpha2.TFJob, t *testing.T) string { } func newPod(name string, tfJob *tfv1alpha2.TFJob) *v1.Pod { - tfjobKey, err := KeyFunc(tfJob) - if err != nil { - fmt.Errorf("Couldn't get key for tfjob object %#v: %v", tfJob, err) + var tfjobKey string + if len(tfJob.Namespace) > 0 { + tfjobKey = fmt.Sprintf("%s/%s", tfJob.Namespace, tfJob.Name) + } else { + tfjobKey = tfJob.Name } return &v1.Pod{ @@ -144,6 +146,15 @@ func setPodsStatuses(podIndexer cache.Indexer, tfJob *tfv1alpha2.TFJob, pendingP } } +func getCondition(tfJob *tfv1alpha2.TFJob, condition tfv1alpha2.TFJobConditionType, reason string) bool { + for _, v := range tfJob.Status.Conditions { + if v.Type == condition && v.Status == v1.ConditionTrue && v.Reason == reason { + return true + } + } + return false +} + func TestNormalPath(t *testing.T) { testCases := map[string]struct { worker int @@ -162,17 +173,25 @@ func TestNormalPath(t *testing.T) { // expectations 
expectedCreations int32 expectedDeletions int32 - expectedActive int32 - expectedSucceeded int32 - expectedFailed int32 + + expectedActiveWorkerPods int32 + expectedSucceededWorkerPods int32 + expectedFailedWorkerPods int32 + + expectedActivePSPods int32 + expectedSucceededPSPods int32 + expectedFailedPSPods int32 // TODO(gaocegege): Add condition check. - // expectedCondition *tfv1alpha2.TFJobConditionType - // expectedConditionReason string + expectedCondition *tfv1alpha2.TFJobConditionType + expectedConditionReason string }{ "Local TFJob created": { 1, 0, nil, true, 0, 0, 0, 0, - 1, 0, 1, 0, 0, + 1, 0, + 1, 0, 0, + 0, 0, 0, + nil, "", }, } @@ -196,6 +215,11 @@ func TestNormalPath(t *testing.T) { controller.tfJobListerSynced = alwaysReady controller.podListerSynced = alwaysReady controller.serviceListerSynced = alwaysReady + var actual *tfv1alpha2.TFJob + controller.updateStatusHandler = func(tfJob *tfv1alpha2.TFJob) error { + actual = tfJob + return nil + } // Run the test logic. tfJob := newTFJob(tc.worker, tc.ps) @@ -217,8 +241,68 @@ func TestNormalPath(t *testing.T) { if forget != tc.jobKeyForget { t.Errorf("%s: unexpected forget value. Expected %v, saw %v\n", name, tc.jobKeyForget, forget) } - if int32(len(controller.podControl.(*FakePodControl).Templates)) != tc.expectedCreations { - t.Errorf("%s: unexpected number of creates. Expected %d, saw %d\n", name, tc.expectedCreations, len(controller.podControl.(*FakePodControl).Templates)) + + fakePodControl := controller.podControl.(*FakePodControl) + if int32(len(fakePodControl.Templates)) != tc.expectedCreations { + t.Errorf("%s: unexpected number of creates. Expected %d, saw %d\n", name, tc.expectedCreations, len(fakePodControl.Templates)) + } + if int32(len(fakePodControl.DeletePodName)) != tc.expectedDeletions { + t.Errorf("%s: unexpected number of deletes. Expected %d, saw %d\n", name, tc.expectedDeletions, len(fakePodControl.DeletePodName)) + } + // Each create should have an accompanying ControllerRef. 
+ if len(fakePodControl.ControllerRefs) != int(tc.expectedCreations) { + t.Errorf("%s: unexpected number of ControllerRefs. Expected %d, saw %d\n", name, tc.expectedCreations, len(fakePodControl.ControllerRefs)) + } + // Make sure the ControllerRefs are correct. + for _, controllerRef := range fakePodControl.ControllerRefs { + if got, want := controllerRef.APIVersion, tfv1alpha2.SchemeGroupVersion.String(); got != want { + t.Errorf("controllerRef.APIVersion = %q, want %q", got, want) + } + if got, want := controllerRef.Kind, tfv1alpha2.TFJobResourceKind; got != want { + t.Errorf("controllerRef.Kind = %q, want %q", got, want) + } + if got, want := controllerRef.Name, tfJob.Name; got != want { + t.Errorf("controllerRef.Name = %q, want %q", got, want) + } + if got, want := controllerRef.UID, tfJob.UID; got != want { + t.Errorf("controllerRef.UID = %q, want %q", got, want) + } + if controllerRef.Controller == nil || *controllerRef.Controller != true { + t.Errorf("controllerRef.Controller is not set to true") + } + } + // Validate worker status. + if actual.Status.TFReplicaStatuses[tfv1alpha2.TFReplicaTypeWorker] != nil { + if actual.Status.TFReplicaStatuses[tfv1alpha2.TFReplicaTypeWorker].Active != tc.expectedActiveWorkerPods { + t.Errorf("%s: unexpected number of active pods. Expected %d, saw %d\n", name, tc.expectedActiveWorkerPods, actual.Status.TFReplicaStatuses[tfv1alpha2.TFReplicaTypeWorker].Active) + } + if actual.Status.TFReplicaStatuses[tfv1alpha2.TFReplicaTypeWorker].Succeeded != tc.expectedSucceededWorkerPods { + t.Errorf("%s: unexpected number of succeeded pods. Expected %d, saw %d\n", name, tc.expectedSucceededWorkerPods, actual.Status.TFReplicaStatuses[tfv1alpha2.TFReplicaTypeWorker].Succeeded) + } + if actual.Status.TFReplicaStatuses[tfv1alpha2.TFReplicaTypeWorker].Failed != tc.expectedFailedWorkerPods { + t.Errorf("%s: unexpected number of failed pods. 
Expected %d, saw %d\n", name, tc.expectedFailedWorkerPods, actual.Status.TFReplicaStatuses[tfv1alpha2.TFReplicaTypeWorker].Failed) + } + } + // Validate PS status. + if actual.Status.TFReplicaStatuses[tfv1alpha2.TFReplicaTypePS] != nil { + if actual.Status.TFReplicaStatuses[tfv1alpha2.TFReplicaTypePS].Active != tc.expectedActivePSPods { + t.Errorf("%s: unexpected number of active pods. Expected %d, saw %d\n", name, tc.expectedActivePSPods, actual.Status.TFReplicaStatuses[tfv1alpha2.TFReplicaTypePS].Active) + } + if actual.Status.TFReplicaStatuses[tfv1alpha2.TFReplicaTypePS].Succeeded != tc.expectedSucceededPSPods { + t.Errorf("%s: unexpected number of succeeded pods. Expected %d, saw %d\n", name, tc.expectedSucceededPSPods, actual.Status.TFReplicaStatuses[tfv1alpha2.TFReplicaTypePS].Succeeded) + } + if actual.Status.TFReplicaStatuses[tfv1alpha2.TFReplicaTypePS].Failed != tc.expectedFailedPSPods { + t.Errorf("%s: unexpected number of failed pods. Expected %d, saw %d\n", name, tc.expectedFailedPSPods, actual.Status.TFReplicaStatuses[tfv1alpha2.TFReplicaTypePS].Failed) + } + } + // TODO(gaocegege): Set StartTime for the status. + // Validate StartTime. + // if actual.Status.StartTime == nil { + // t.Errorf("%s: .status.startTime was not set", name) + // } + // Validate conditions. + if tc.expectedCondition != nil && !getCondition(actual, *tc.expectedCondition, tc.expectedConditionReason) { + t.Errorf("%s: expected completion condition. 
Got %#v", name, actual.Status.Conditions) } } } From 78660a928ce50b7529d14b87dc8504e8ecb0bc0d Mon Sep 17 00:00:00 2001 From: Ce Gao Date: Tue, 20 Mar 2018 14:00:59 +0800 Subject: [PATCH 14/24] *: Fix some errors in Travis CI (#477) * *: Fix some errors in Travis CI Signed-off-by: Ce Gao * controller: Fix Signed-off-by: Ce Gao --- linter_config.json | 3 ++- pkg/apis/tensorflow/v1alpha2/types.go | 2 +- pkg/controller/controller.go | 1 - pkg/controller/controller_pod.go | 12 ++++++------ pkg/controller/controller_service.go | 9 +++++++-- pkg/controller/controller_test.go | 2 +- 6 files changed, 17 insertions(+), 12 deletions(-) diff --git a/linter_config.json b/linter_config.json index c648faefd5..4f0344baad 100644 --- a/linter_config.json +++ b/linter_config.json @@ -26,7 +26,8 @@ "pkg/apis/tensorflow/v1alpha1/zz_generated.deepcopy.go", "pkg/apis/tensorflow/v1alpha1/zz_generated.defaults.go", "pkg/apis/tensorflow/v1alpha2/zz_generated.deepcopy.go", - "pkg/apis/tensorflow/v1alpha2/zz_generated.defaults.go" + "pkg/apis/tensorflow/v1alpha2/zz_generated.defaults.go", + "pkg/controller/controller_utils.go" ], "Deadline": "300s", "Skip": ["pkg/client"] diff --git a/pkg/apis/tensorflow/v1alpha2/types.go b/pkg/apis/tensorflow/v1alpha2/types.go index e498ae399a..cc81c99bc3 100644 --- a/pkg/apis/tensorflow/v1alpha2/types.go +++ b/pkg/apis/tensorflow/v1alpha2/types.go @@ -139,7 +139,7 @@ type TFJobStatus struct { // TFReplicaStatus represents the current observed state of the TFReplica. type TFReplicaStatus struct { // The number of actively running pods. - Active int32 `json:"active,omitempty""` + Active int32 `json:"active,omitempty"` // The number of pods which reached phase Succeeded. 
Succeeded int32 `json:"succeeded,omitempty"` diff --git a/pkg/controller/controller.go b/pkg/controller/controller.go index 83b724aa1f..2b9e7e9f43 100644 --- a/pkg/controller/controller.go +++ b/pkg/controller/controller.go @@ -56,7 +56,6 @@ const ( hit = "hit" noHit = "no-hit" - defaultPort = 2222 defaultPortStr = "2222" ) diff --git a/pkg/controller/controller_pod.go b/pkg/controller/controller_pod.go index 5acdab7700..477bb6a1ae 100644 --- a/pkg/controller/controller_pod.go +++ b/pkg/controller/controller_pod.go @@ -63,7 +63,10 @@ func (tc *TFJobController) reconcilePods( } expectationPodsKey := genExpectationPodsKey(tfjobKey, rt) - tc.expectations.ExpectCreations(expectationPodsKey, int(diff)) + err := tc.expectations.ExpectCreations(expectationPodsKey, diff) + if err != nil { + return err + } for _, index := range diffIndexes { log.Infof("need to create new pod: %s-%s", rt, index) @@ -119,6 +122,7 @@ func (tc *TFJobController) reconcilePods( } } else if diff > 0 { // TODO(CPH): Need to delete pods. + log.Infof("need to delete pod but it is not implemented yet") } if tfjob.Status.TFReplicaStatuses == nil { @@ -135,11 +139,7 @@ func (tc *TFJobController) reconcilePods( tfjob.Status.TFReplicaStatuses[rtype].Failed = failed // TODO(CPH): Add check here, no need to update the tfjob if the status hasn't changed since last time. - if err := tc.updateStatusHandler(tfjob); err != nil { - return err - } - - return nil + return tc.updateStatusHandler(tfjob) } // getDiffPodIndexes checks and gets diff indexes from desired and current. 
diff --git a/pkg/controller/controller_service.go b/pkg/controller/controller_service.go index 33aa3929e6..2bd35a72b0 100644 --- a/pkg/controller/controller_service.go +++ b/pkg/controller/controller_service.go @@ -61,7 +61,10 @@ func (tc *TFJobController) reconcileServices( } expectationServicesKey := genExpectationServicesKey(tfjobKey, rt) - tc.expectations.ExpectCreations(expectationServicesKey, int(diff)) + err := tc.expectations.ExpectCreations(expectationServicesKey, diff) + if err != nil { + return err + } for _, index := range diffIndexes { log.Infof("need to create new service: %s-%s", rt, index) @@ -99,11 +102,13 @@ func (tc *TFJobController) reconcileServices( // receive any update, and the controller will create a new // pod when the expectation expires. return nil + } else if err != nil { + return err } - return err } } else if diff > 0 { // TODO(CPH): Need to delete pods. + log.Infof("need to delete service but it is not implemented yet") } return nil diff --git a/pkg/controller/controller_test.go b/pkg/controller/controller_test.go index 411913c626..1b4b9b2ba9 100644 --- a/pkg/controller/controller_test.go +++ b/pkg/controller/controller_test.go @@ -267,7 +267,7 @@ func TestNormalPath(t *testing.T) { if got, want := controllerRef.UID, tfJob.UID; got != want { t.Errorf("controllerRef.UID = %q, want %q", got, want) } - if controllerRef.Controller == nil || *controllerRef.Controller != true { + if controllerRef.Controller == nil || !*controllerRef.Controller { t.Errorf("controllerRef.Controller is not set to true") } } From e7e6005900e63169726689bca074f4cf64f3726d Mon Sep 17 00:00:00 2001 From: Ce Gao Date: Tue, 20 Mar 2018 15:47:59 +0800 Subject: [PATCH 15/24] controller: Add internal state test (#480) * controller: Add internal state test Signed-off-by: Ce Gao * test: Remove useless log Signed-off-by: Ce Gao --- pkg/controller/controller_pod.go | 3 +- pkg/controller/controller_test.go | 150 ++++++++++++++++++++---------- 2 files changed, 103 
insertions(+), 50 deletions(-) diff --git a/pkg/controller/controller_pod.go b/pkg/controller/controller_pod.go index 477bb6a1ae..e870a3fd24 100644 --- a/pkg/controller/controller_pod.go +++ b/pkg/controller/controller_pod.go @@ -78,6 +78,7 @@ func (tc *TFJobController) reconcilePods( pTemplate := spec.Template.DeepCopy() labels := genLabels(tfjobKey) + // Set type and index for the worker. labels[tfReplicaTypeLabel] = rt labels[tfReplicaIndexLabel] = index @@ -134,7 +135,7 @@ func (tc *TFJobController) reconcilePods( } // Update the active status since we have created -diff pods during the loop. - tfjob.Status.TFReplicaStatuses[rtype].Active -= int32(diff) + tfjob.Status.TFReplicaStatuses[rtype].Active = int32(len(activePods) - diff) tfjob.Status.TFReplicaStatuses[rtype].Succeeded = succeeded tfjob.Status.TFReplicaStatuses[rtype].Failed = failed diff --git a/pkg/controller/controller_test.go b/pkg/controller/controller_test.go index 1b4b9b2ba9..4c29b1d0c4 100644 --- a/pkg/controller/controller_test.go +++ b/pkg/controller/controller_test.go @@ -22,7 +22,6 @@ import ( "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/util/rand" kubeinformers "k8s.io/client-go/informers" kubeclientset "k8s.io/client-go/kubernetes" "k8s.io/client-go/rest" @@ -33,6 +32,13 @@ import ( tfjobinformers "github.com/kubeflow/tf-operator/pkg/client/informers/externalversions" ) +const ( + testImageName = "test-image-for-kubeflow-tf-operator:latest" + testTFJobName = "test-tfjob" + labelWorker = "worker" + labelPS = "ps" +) + var alwaysReady = func() bool { return true } func newTFJobControllerFromClient(kubeClientSet kubeclientset.Interface, tfJobClientSet tfjobclientset.Interface, resyncPeriod ResyncPeriodFunc) (*TFJobController, kubeinformers.SharedInformerFactory, tfjobinformers.SharedInformerFactory) { @@ -46,10 +52,22 @@ func newTFJobControllerFromClient(kubeClientSet kubeclientset.Interface, tfJobCl return controller, kubeInformerFactory, 
tfJobInformerFactory } +func newTFReplicaSpecTemplate() v1.PodTemplateSpec { + return v1.PodTemplateSpec{ + Spec: v1.PodSpec{ + Containers: []v1.Container{ + v1.Container{ + Image: testImageName, + }, + }, + }, + } +} + func newTFJob(worker, ps int) *tfv1alpha2.TFJob { tfJob := &tfv1alpha2.TFJob{ ObjectMeta: metav1.ObjectMeta{ - Name: "foobar", + Name: testTFJobName, Namespace: metav1.NamespaceDefault, }, Spec: tfv1alpha2.TFJobSpec{ @@ -61,15 +79,7 @@ func newTFJob(worker, ps int) *tfv1alpha2.TFJob { worker := int32(worker) workerReplicaSpec := &tfv1alpha2.TFReplicaSpec{ Replicas: &worker, - Template: v1.PodTemplateSpec{ - Spec: v1.PodSpec{ - Containers: []v1.Container{ - v1.Container{ - Image: "foo/bar", - }, - }, - }, - }, + Template: newTFReplicaSpecTemplate(), } tfJob.Spec.TFReplicaSpecs[tfv1alpha2.TFReplicaTypeWorker] = workerReplicaSpec } @@ -78,15 +88,7 @@ func newTFJob(worker, ps int) *tfv1alpha2.TFJob { ps := int32(ps) psReplicaSpec := &tfv1alpha2.TFReplicaSpec{ Replicas: &ps, - Template: v1.PodTemplateSpec{ - Spec: v1.PodSpec{ - Containers: []v1.Container{ - v1.Container{ - Image: "foo/bar", - }, - }, - }, - }, + Template: newTFReplicaSpecTemplate(), } tfJob.Spec.TFReplicaSpecs[tfv1alpha2.TFReplicaTypePS] = psReplicaSpec } @@ -102,47 +104,47 @@ func getKey(tfJob *tfv1alpha2.TFJob, t *testing.T) string { } } -func newPod(name string, tfJob *tfv1alpha2.TFJob) *v1.Pod { - var tfjobKey string - if len(tfJob.Namespace) > 0 { - tfjobKey = fmt.Sprintf("%s/%s", tfJob.Namespace, tfJob.Name) - } else { - tfjobKey = tfJob.Name - } - +func newBasePod(name string, tfJob *tfv1alpha2.TFJob, t *testing.T) *v1.Pod { return &v1.Pod{ ObjectMeta: metav1.ObjectMeta{ Name: name, - Labels: genLabels(tfjobKey), + Labels: genLabels(getKey(tfJob, t)), Namespace: tfJob.Namespace, OwnerReferences: []metav1.OwnerReference{*metav1.NewControllerRef(tfJob, controllerKind)}, }, } } +func newPod(tfJob *tfv1alpha2.TFJob, typ string, index int, t *testing.T) *v1.Pod { + pod := 
newBasePod(fmt.Sprintf("%s-%d", typ, index), tfJob, t) + pod.Labels[tfReplicaTypeLabel] = typ + pod.Labels[tfReplicaIndexLabel] = fmt.Sprintf("%d", index) + return pod +} + // create count pods with the given phase for the given tfJob -func newPodList(count int32, status v1.PodPhase, tfJob *tfv1alpha2.TFJob) []v1.Pod { - pods := []v1.Pod{} +func newPodList(count int32, status v1.PodPhase, tfJob *tfv1alpha2.TFJob, typ string, t *testing.T) []*v1.Pod { + pods := []*v1.Pod{} for i := int32(0); i < count; i++ { - newPod := newPod(fmt.Sprintf("pod-%v", rand.String(10)), tfJob) + newPod := newPod(tfJob, typ, int(i), t) newPod.Status = v1.PodStatus{Phase: status} - pods = append(pods, *newPod) + pods = append(pods, newPod) } return pods } -func setPodsStatuses(podIndexer cache.Indexer, tfJob *tfv1alpha2.TFJob, pendingPods, activePods, succeededPods, failedPods int32) { - for _, pod := range newPodList(pendingPods, v1.PodPending, tfJob) { - podIndexer.Add(&pod) +func setPodsStatuses(podIndexer cache.Indexer, tfJob *tfv1alpha2.TFJob, typ string, pendingPods, activePods, succeededPods, failedPods int32, t *testing.T) { + for _, pod := range newPodList(pendingPods, v1.PodPending, tfJob, typ, t) { + podIndexer.Add(pod) } - for _, pod := range newPodList(activePods, v1.PodRunning, tfJob) { - podIndexer.Add(&pod) + for _, pod := range newPodList(activePods, v1.PodRunning, tfJob, typ, t) { + podIndexer.Add(pod) } - for _, pod := range newPodList(succeededPods, v1.PodSucceeded, tfJob) { - podIndexer.Add(&pod) + for _, pod := range newPodList(succeededPods, v1.PodSucceeded, tfJob, typ, t) { + podIndexer.Add(pod) } - for _, pod := range newPodList(failedPods, v1.PodFailed, tfJob) { - podIndexer.Add(&pod) + for _, pod := range newPodList(failedPods, v1.PodFailed, tfJob, typ, t) { + podIndexer.Add(pod) } } @@ -163,10 +165,16 @@ func TestNormalPath(t *testing.T) { // pod setup podControllerError error jobKeyForget bool - pendingPods int32 - activePods int32 - succeededPods int32 - 
failedPods int32 + + pendingWorkerPods int32 + activeWorkerPods int32 + succeededWorkerPods int32 + failedWorkerPods int32 + + pendingPSPods int32 + activePSPods int32 + succeededPSPods int32 + failedPSPods int32 // TODO(gaocegege): Add service setup. @@ -181,18 +189,61 @@ func TestNormalPath(t *testing.T) { expectedActivePSPods int32 expectedSucceededPSPods int32 expectedFailedPSPods int32 + // TODO(gaocegege): Add condition check. expectedCondition *tfv1alpha2.TFJobConditionType expectedConditionReason string }{ - "Local TFJob created": { + "Local TFJob is created": { 1, 0, - nil, true, 0, 0, 0, 0, + nil, true, + 0, 0, 0, 0, + 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, nil, "", }, + "Distributed TFJob (4 workers, 2 PS) is created": { + 4, 2, + nil, true, + 0, 0, 0, 0, + 0, 0, 0, 0, + 6, 0, + 4, 0, 0, + 2, 0, 0, + nil, "", + }, + "Distributed TFJob (4 workers, 2 PS) is created and all replicas are pending": { + 4, 2, + nil, true, + 4, 0, 0, 0, + 2, 0, 0, 0, + 0, 0, + 4, 0, 0, + 2, 0, 0, + nil, "", + }, + "Distributed TFJob (4 workers, 2 PS) is created and all replicas are running": { + 4, 2, + nil, true, + 0, 4, 0, 0, + 0, 2, 0, 0, + 0, 0, + 4, 0, 0, + 2, 0, 0, + nil, "", + }, + "Distributed TFJob (4 workers, 2 PS) is created, 2 workers, 1 PS are pending": { + 4, 2, + nil, true, + 2, 0, 0, 0, + 1, 0, 0, 0, + 3, 0, + 4, 0, 0, + 2, 0, 0, + nil, "", + }, } for name, tc := range testCases { @@ -225,7 +276,8 @@ func TestNormalPath(t *testing.T) { tfJob := newTFJob(tc.worker, tc.ps) tfJobInformerFactory.Kubeflow().V1alpha2().TFJobs().Informer().GetIndexer().Add(tfJob) podIndexer := kubeInformerFactory.Core().V1().Pods().Informer().GetIndexer() - setPodsStatuses(podIndexer, tfJob, tc.pendingPods, tc.activePods, tc.succeededPods, tc.failedPods) + setPodsStatuses(podIndexer, tfJob, labelWorker, tc.pendingWorkerPods, tc.activeWorkerPods, tc.succeededWorkerPods, tc.failedWorkerPods, t) + setPodsStatuses(podIndexer, tfJob, labelPS, tc.pendingPSPods, tc.activePSPods, 
tc.succeededPSPods, tc.failedPSPods, t) forget, err := controller.syncTFJob(getKey(tfJob, t)) // We need requeue syncJob task if podController error From 326d3f0572852cdd6077092960a5b22966accb90 Mon Sep 17 00:00:00 2001 From: Ce Gao Date: Wed, 21 Mar 2018 13:53:03 +0800 Subject: [PATCH 16/24] controller: Separate ps and worker pods (#481) * controller: Separate ps and worker pods Signed-off-by: Ce Gao * test: Remove log Signed-off-by: Ce Gao * test: Fix travis Signed-off-by: Ce Gao --- pkg/controller/controller_pod.go | 24 ++++++++-------- pkg/controller/controller_service.go | 2 +- pkg/controller/controller_test.go | 42 ++++++++++++++++++++++------ 3 files changed, 45 insertions(+), 23 deletions(-) diff --git a/pkg/controller/controller_pod.go b/pkg/controller/controller_pod.go index e870a3fd24..65fd1c73a3 100644 --- a/pkg/controller/controller_pod.go +++ b/pkg/controller/controller_pod.go @@ -46,17 +46,18 @@ func (tc *TFJobController) reconcilePods( // Convert TFReplicaType to lower string. rt := strings.ToLower(string(rtype)) - - // Get active pods for this TFReplicaType. - activePods := filterActivePodsForTFReplicaType(pods, rt) - + // Get all pods for the type rt. + pods = filterPodsForTFReplicaType(pods, rt) + activePods := FilterActivePods(pods) succeeded, failed := getPodStatus(pods) - diff := len(activePods) - int(*(spec.Replicas)) + // Expect to have `replicas - succeeded` pods alive. + expected := *spec.Replicas - succeeded + diff := len(activePods) - int(expected) if diff < 0 { // Need to create new pods. - diffIndexes := getDiffPodIndexes(activePods, *spec.Replicas) + diffIndexes := getDiffPodIndexes(activePods, expected) if diff+len(diffIndexes) != 0 { // This should never happened. return fmt.Errorf("pods diff(%d) is not equal to length(%d) of diffIndexes", diff, len(diffIndexes)) @@ -135,7 +136,7 @@ func (tc *TFJobController) reconcilePods( } // Update the active status since we have created -diff pods during the loop. 
- tfjob.Status.TFReplicaStatuses[rtype].Active = int32(len(activePods) - diff) + tfjob.Status.TFReplicaStatuses[rtype].Active = expected tfjob.Status.TFReplicaStatuses[rtype].Succeeded = succeeded tfjob.Status.TFReplicaStatuses[rtype].Failed = failed @@ -214,11 +215,8 @@ func (tc *TFJobController) getPodsForTFJob(tfjob *tfv1alpha2.TFJob) ([]*v1.Pod, return cm.ClaimPods(pods) } -// filterActivePodsForTFReplicaType returns pods that have not terminated, -// and belong to a TFReplicaType. -func filterActivePodsForTFReplicaType(pods []*v1.Pod, tfReplicaType string) []*v1.Pod { - activePods := FilterActivePods(pods) - +// filterPodsForTFReplicaType returns pods belong to a TFReplicaType. +func filterPodsForTFReplicaType(pods []*v1.Pod, tfReplicaType string) []*v1.Pod { var result []*v1.Pod tfReplicaSelector := &metav1.LabelSelector{ @@ -227,7 +225,7 @@ func filterActivePodsForTFReplicaType(pods []*v1.Pod, tfReplicaType string) []*v tfReplicaSelector.MatchLabels[tfReplicaTypeLabel] = tfReplicaType - for _, pod := range activePods { + for _, pod := range pods { selector, _ := metav1.LabelSelectorAsSelector(tfReplicaSelector) if !selector.Matches(labels.Set(pod.Labels)) { continue diff --git a/pkg/controller/controller_service.go b/pkg/controller/controller_service.go index 2bd35a72b0..d06e5473c7 100644 --- a/pkg/controller/controller_service.go +++ b/pkg/controller/controller_service.go @@ -107,7 +107,7 @@ func (tc *TFJobController) reconcileServices( } } } else if diff > 0 { - // TODO(CPH): Need to delete pods. + // TODO(CPH): Need to delete service. 
log.Infof("need to delete service but it is not implemented yet") } diff --git a/pkg/controller/controller_test.go b/pkg/controller/controller_test.go index 4c29b1d0c4..d59c0fe97b 100644 --- a/pkg/controller/controller_test.go +++ b/pkg/controller/controller_test.go @@ -123,10 +123,10 @@ func newPod(tfJob *tfv1alpha2.TFJob, typ string, index int, t *testing.T) *v1.Po } // create count pods with the given phase for the given tfJob -func newPodList(count int32, status v1.PodPhase, tfJob *tfv1alpha2.TFJob, typ string, t *testing.T) []*v1.Pod { +func newPodList(count int32, status v1.PodPhase, tfJob *tfv1alpha2.TFJob, typ string, start int32, t *testing.T) []*v1.Pod { pods := []*v1.Pod{} for i := int32(0); i < count; i++ { - newPod := newPod(tfJob, typ, int(i), t) + newPod := newPod(tfJob, typ, int(start+i), t) newPod.Status = v1.PodStatus{Phase: status} pods = append(pods, newPod) } @@ -134,16 +134,20 @@ func newPodList(count int32, status v1.PodPhase, tfJob *tfv1alpha2.TFJob, typ st } func setPodsStatuses(podIndexer cache.Indexer, tfJob *tfv1alpha2.TFJob, typ string, pendingPods, activePods, succeededPods, failedPods int32, t *testing.T) { - for _, pod := range newPodList(pendingPods, v1.PodPending, tfJob, typ, t) { + var index int32 + for _, pod := range newPodList(pendingPods, v1.PodPending, tfJob, typ, index, t) { podIndexer.Add(pod) } - for _, pod := range newPodList(activePods, v1.PodRunning, tfJob, typ, t) { + index += pendingPods + for _, pod := range newPodList(activePods, v1.PodRunning, tfJob, typ, index, t) { podIndexer.Add(pod) } - for _, pod := range newPodList(succeededPods, v1.PodSucceeded, tfJob, typ, t) { + index += activePods + for _, pod := range newPodList(succeededPods, v1.PodSucceeded, tfJob, typ, index, t) { podIndexer.Add(pod) } - for _, pod := range newPodList(failedPods, v1.PodFailed, tfJob, typ, t) { + index += succeededPods + for _, pod := range newPodList(failedPods, v1.PodFailed, tfJob, typ, index, t) { podIndexer.Add(pod) } } @@ -163,8 
+167,8 @@ func TestNormalPath(t *testing.T) { ps int // pod setup - podControllerError error - jobKeyForget bool + ControllerError error + jobKeyForget bool pendingWorkerPods int32 activeWorkerPods int32 @@ -244,6 +248,26 @@ func TestNormalPath(t *testing.T) { 2, 0, 0, nil, "", }, + "Distributed TFJob (4 workers, 2 PS) is created, 2 workers, 1 PS are pending, 1 worker is running": { + 4, 2, + nil, true, + 2, 1, 0, 0, + 1, 0, 0, 0, + 2, 0, + 4, 0, 0, + 2, 0, 0, + nil, "", + }, + "Distributed TFJob (4 workers, 2 PS) is succeeded": { + 4, 2, + nil, true, + 0, 0, 4, 0, + 0, 0, 2, 0, + 0, 0, + 0, 4, 0, + 0, 2, 0, + nil, "", + }, } for name, tc := range testCases { @@ -281,7 +305,7 @@ func TestNormalPath(t *testing.T) { forget, err := controller.syncTFJob(getKey(tfJob, t)) // We need requeue syncJob task if podController error - if tc.podControllerError != nil { + if tc.ControllerError != nil { if err == nil { t.Errorf("%s: Syncing jobs would return error when podController exception", name) } From 683c6d408747554941a147c83292861f77701b8e Mon Sep 17 00:00:00 2001 From: Ce Gao Date: Wed, 21 Mar 2018 14:17:02 +0800 Subject: [PATCH 17/24] controller: Add check for service and fix service (#482) * controller: Separate ps and worker pods Signed-off-by: Ce Gao * test: Remove log Signed-off-by: Ce Gao * test: Add Signed-off-by: Ce Gao --- pkg/controller/controller_test.go | 80 ++++++++++++++++++++++++------- 1 file changed, 63 insertions(+), 17 deletions(-) diff --git a/pkg/controller/controller_test.go b/pkg/controller/controller_test.go index d59c0fe97b..602213de5a 100644 --- a/pkg/controller/controller_test.go +++ b/pkg/controller/controller_test.go @@ -152,6 +152,36 @@ func setPodsStatuses(podIndexer cache.Indexer, tfJob *tfv1alpha2.TFJob, typ stri } } +func newService(tfJob *tfv1alpha2.TFJob, typ string, index int, t *testing.T) *v1.Service { + service := &v1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: fmt.Sprintf("%s-%d", typ, index), + Labels: 
genLabels(getKey(tfJob, t)), + Namespace: tfJob.Namespace, + OwnerReferences: []metav1.OwnerReference{*metav1.NewControllerRef(tfJob, controllerKind)}, + }, + } + service.Labels[tfReplicaTypeLabel] = typ + service.Labels[tfReplicaIndexLabel] = fmt.Sprintf("%d", index) + return service +} + +// create count pods with the given phase for the given tfJob +func newServiceList(count int32, tfJob *tfv1alpha2.TFJob, typ string, t *testing.T) []*v1.Service { + services := []*v1.Service{} + for i := int32(0); i < count; i++ { + newService := newService(tfJob, typ, int(i), t) + services = append(services, newService) + } + return services +} + +func setServices(serviceIndexer cache.Indexer, tfJob *tfv1alpha2.TFJob, typ string, activeWorkerServices int32, t *testing.T) { + for _, service := range newServiceList(activeWorkerServices, tfJob, typ, t) { + serviceIndexer.Add(service) + } +} + func getCondition(tfJob *tfv1alpha2.TFJob, condition tfv1alpha2.TFJobConditionType, reason string) bool { for _, v := range tfJob.Status.Conditions { if v.Type == condition && v.Status == v1.ConditionTrue && v.Reason == reason { @@ -180,11 +210,13 @@ func TestNormalPath(t *testing.T) { succeededPSPods int32 failedPSPods int32 - // TODO(gaocegege): Add service setup. + activeWorkerServices int32 + activePSServices int32 // expectations - expectedCreations int32 - expectedDeletions int32 + expectedPodCreations int32 + expectedPodDeletions int32 + expectedServiceCreations int32 expectedActiveWorkerPods int32 expectedSucceededWorkerPods int32 @@ -194,7 +226,6 @@ func TestNormalPath(t *testing.T) { expectedSucceededPSPods int32 expectedFailedPSPods int32 - // TODO(gaocegege): Add condition check. 
expectedCondition *tfv1alpha2.TFJobConditionType expectedConditionReason string }{ @@ -203,7 +234,8 @@ func TestNormalPath(t *testing.T) { nil, true, 0, 0, 0, 0, 0, 0, 0, 0, - 1, 0, + 0, 0, + 1, 0, 1, 1, 0, 0, 0, 0, 0, nil, "", @@ -213,7 +245,8 @@ func TestNormalPath(t *testing.T) { nil, true, 0, 0, 0, 0, 0, 0, 0, 0, - 6, 0, + 0, 0, + 6, 0, 6, 4, 0, 0, 2, 0, 0, nil, "", @@ -223,7 +256,8 @@ func TestNormalPath(t *testing.T) { nil, true, 4, 0, 0, 0, 2, 0, 0, 0, - 0, 0, + 4, 2, + 0, 0, 0, 4, 0, 0, 2, 0, 0, nil, "", @@ -233,7 +267,8 @@ func TestNormalPath(t *testing.T) { nil, true, 0, 4, 0, 0, 0, 2, 0, 0, - 0, 0, + 4, 2, + 0, 0, 0, 4, 0, 0, 2, 0, 0, nil, "", @@ -243,7 +278,8 @@ func TestNormalPath(t *testing.T) { nil, true, 2, 0, 0, 0, 1, 0, 0, 0, - 3, 0, + 2, 1, + 3, 0, 3, 4, 0, 0, 2, 0, 0, nil, "", @@ -253,7 +289,8 @@ func TestNormalPath(t *testing.T) { nil, true, 2, 1, 0, 0, 1, 0, 0, 0, - 2, 0, + 3, 1, + 2, 0, 2, 4, 0, 0, 2, 0, 0, nil, "", @@ -263,7 +300,8 @@ func TestNormalPath(t *testing.T) { nil, true, 0, 0, 4, 0, 0, 0, 2, 0, - 0, 0, + 4, 2, + 0, 0, 0, 0, 4, 0, 0, 2, 0, nil, "", @@ -303,6 +341,10 @@ func TestNormalPath(t *testing.T) { setPodsStatuses(podIndexer, tfJob, labelWorker, tc.pendingWorkerPods, tc.activeWorkerPods, tc.succeededWorkerPods, tc.failedWorkerPods, t) setPodsStatuses(podIndexer, tfJob, labelPS, tc.pendingPSPods, tc.activePSPods, tc.succeededPSPods, tc.failedPSPods, t) + serviceIndexer := kubeInformerFactory.Core().V1().Services().Informer().GetIndexer() + setServices(serviceIndexer, tfJob, labelWorker, tc.activeWorkerServices, t) + setServices(serviceIndexer, tfJob, labelPS, tc.activePSServices, t) + forget, err := controller.syncTFJob(getKey(tfJob, t)) // We need requeue syncJob task if podController error if tc.ControllerError != nil { @@ -319,15 +361,19 @@ func TestNormalPath(t *testing.T) { } fakePodControl := controller.podControl.(*FakePodControl) - if int32(len(fakePodControl.Templates)) != tc.expectedCreations { - t.Errorf("%s: 
unexpected number of creates. Expected %d, saw %d\n", name, tc.expectedCreations, len(fakePodControl.Templates)) + fakeServiceControl := controller.serviceControl.(*FakeServiceControl) + if int32(len(fakePodControl.Templates)) != tc.expectedPodCreations { + t.Errorf("%s: unexpected number of pod creates. Expected %d, saw %d\n", name, tc.expectedPodCreations, len(fakePodControl.Templates)) + } + if int32(len(fakeServiceControl.Templates)) != tc.expectedServiceCreations { + t.Errorf("%s: unexpected number of service creates. Expected %d, saw %d\n", name, tc.expectedServiceCreations, len(fakeServiceControl.Templates)) } - if int32(len(fakePodControl.DeletePodName)) != tc.expectedDeletions { - t.Errorf("%s: unexpected number of deletes. Expected %d, saw %d\n", name, tc.expectedDeletions, len(fakePodControl.DeletePodName)) + if int32(len(fakePodControl.DeletePodName)) != tc.expectedPodDeletions { + t.Errorf("%s: unexpected number of pod deletes. Expected %d, saw %d\n", name, tc.expectedPodDeletions, len(fakePodControl.DeletePodName)) } // Each create should have an accompanying ControllerRef. - if len(fakePodControl.ControllerRefs) != int(tc.expectedCreations) { - t.Errorf("%s: unexpected number of ControllerRefs. Expected %d, saw %d\n", name, tc.expectedCreations, len(fakePodControl.ControllerRefs)) + if len(fakePodControl.ControllerRefs) != int(tc.expectedPodCreations) { + t.Errorf("%s: unexpected number of ControllerRefs. Expected %d, saw %d\n", name, tc.expectedPodCreations, len(fakePodControl.ControllerRefs)) } // Make sure the ControllerRefs are correct. 
for _, controllerRef := range fakePodControl.ControllerRefs { From 7c157932898279f7bc7d596a0ad571f013deb8b3 Mon Sep 17 00:00:00 2001 From: Penghao Cen Date: Wed, 21 Mar 2018 15:32:03 +0800 Subject: [PATCH 18/24] Add sleep and random exit image for e2e test (#487) --- test/e2e/sleep-and-random-exit/Dockerfile | 5 +++++ .../sleep-and-random-exit/sleep-and-random-exit.sh | 12 ++++++++++++ 2 files changed, 17 insertions(+) create mode 100644 test/e2e/sleep-and-random-exit/Dockerfile create mode 100755 test/e2e/sleep-and-random-exit/sleep-and-random-exit.sh diff --git a/test/e2e/sleep-and-random-exit/Dockerfile b/test/e2e/sleep-and-random-exit/Dockerfile new file mode 100644 index 0000000000..1ffca7bb34 --- /dev/null +++ b/test/e2e/sleep-and-random-exit/Dockerfile @@ -0,0 +1,5 @@ +FROM ubuntu:16.04 + +COPY ./sleep_and_random_exit.sh ~/ + +ENTRYPOINT ["~/sleep_and_random_exit.sh"] diff --git a/test/e2e/sleep-and-random-exit/sleep-and-random-exit.sh b/test/e2e/sleep-and-random-exit/sleep-and-random-exit.sh new file mode 100755 index 0000000000..14c5ead560 --- /dev/null +++ b/test/e2e/sleep-and-random-exit/sleep-and-random-exit.sh @@ -0,0 +1,12 @@ +#!/bin/sh + +# This script will sleep a random seconds, +# Then exit a random code + +random_sleep=`shuf -i 10-30 -n 1` +echo "sleep $random_sleep" +sleep $random_sleep + +random_exit=`shuf -i 0-3 -n 1` +echo "exit $random_exit" +exit $random_exit From 0453c0aeb350aab01b3e4eae20490e1c21e98478 Mon Sep 17 00:00:00 2001 From: Ce Gao Date: Wed, 21 Mar 2018 15:38:03 +0800 Subject: [PATCH 19/24] controller: Add defaulter (#483) * defaulter: Add Signed-off-by: Ce Gao * default: Add Signed-off-by: Ce Gao * default: Fix Signed-off-by: Ce Gao * defaulter: Remove image default value Signed-off-by: Ce Gao * example: Add minimal example Signed-off-by: Ce Gao * utils.go: Add copyright holder Signed-off-by: Ce Gao * defaulter: Remove restartpolicy Signed-off-by: Ce Gao --- examples/simple_tf_job.yaml | 13 ++ hack/update-codegen.sh | 6 +- 
pkg/apis/tensorflow/v1alpha2/constants.go | 3 + pkg/apis/tensorflow/v1alpha2/defaults.go | 55 ++++++++ pkg/apis/tensorflow/v1alpha2/defaults_test.go | 118 ++++++++++++++++++ pkg/apis/tensorflow/v1alpha2/register.go | 1 + pkg/apis/tensorflow/v1alpha2/utils.go | 35 ++++++ .../v1alpha2/zz_generated.defaults.go | 45 +++++++ pkg/controller/controller.go | 11 +- 9 files changed, 284 insertions(+), 3 deletions(-) create mode 100644 examples/simple_tf_job.yaml create mode 100644 pkg/apis/tensorflow/v1alpha2/defaults.go create mode 100644 pkg/apis/tensorflow/v1alpha2/defaults_test.go create mode 100644 pkg/apis/tensorflow/v1alpha2/utils.go create mode 100644 pkg/apis/tensorflow/v1alpha2/zz_generated.defaults.go diff --git a/examples/simple_tf_job.yaml b/examples/simple_tf_job.yaml new file mode 100644 index 0000000000..c77e350c24 --- /dev/null +++ b/examples/simple_tf_job.yaml @@ -0,0 +1,13 @@ +apiVersion: "kubeflow.org/v1alpha2" +kind: "TFJob" +metadata: + name: "simple-job" +spec: + tfReplicaSpecs: + Worker: + template: + spec: + containers: + - name: worker-busybox + image: busybox + command: ["sleep", "30000"] diff --git a/hack/update-codegen.sh b/hack/update-codegen.sh index d73dd7e5fd..5fe42b523c 100755 --- a/hack/update-codegen.sh +++ b/hack/update-codegen.sh @@ -28,7 +28,11 @@ CODEGEN_PKG=${CODEGEN_PKG:-$(cd ${SCRIPT_ROOT}; ls -d -1 ./vendor/k8s.io/code-ge # --output-base because this script should also be able to run inside the vendor dir of # k8s.io/kubernetes. The output-base is needed for the generators to output into the vendor dir # instead of the $GOPATH directly. For normal projects this can be dropped. 
-${CODEGEN_PKG}/generate-groups.sh "defaulter,deepcopy,client,informer,lister" \ +${CODEGEN_PKG}/generate-groups.sh "all" \ github.com/kubeflow/tf-operator/pkg/client github.com/kubeflow/tf-operator/pkg/apis \ tensorflow:v1alpha2 \ --go-header-file ${SCRIPT_ROOT}/hack/boilerplate/boilerplate.go.txt + +# Notice: The code in code-generator does not generate defaulter by default. +echo "Generating defaulters" +${GOPATH}/bin/defaulter-gen --input-dirs github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha2 -O zz_generated.defaults "$@" diff --git a/pkg/apis/tensorflow/v1alpha2/constants.go b/pkg/apis/tensorflow/v1alpha2/constants.go index 2f712de58d..e1f2aab4fc 100644 --- a/pkg/apis/tensorflow/v1alpha2/constants.go +++ b/pkg/apis/tensorflow/v1alpha2/constants.go @@ -17,4 +17,7 @@ package v1alpha2 const ( // EnvKubeflowNamespace is ENV for kubeflow namespace specified by user. EnvKubeflowNamespace = "KUBEFLOW_NAMESPACE" + + defaultPortName = "tfjob-port" + defaultPort = 2222 ) diff --git a/pkg/apis/tensorflow/v1alpha2/defaults.go b/pkg/apis/tensorflow/v1alpha2/defaults.go new file mode 100644 index 0000000000..5d574f9d70 --- /dev/null +++ b/pkg/apis/tensorflow/v1alpha2/defaults.go @@ -0,0 +1,55 @@ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package v1alpha2 + +import ( + "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/runtime" +) + +// Int32 is a helper routine that allocates a new int32 value +// to store v and returns a pointer to it. +func Int32(v int32) *int32 { + return &v +} + +func addDefaultingFuncs(scheme *runtime.Scheme) error { + return RegisterDefaults(scheme) +} + +func setDefaultPort(spec *v1.PodSpec) { + for i := range spec.Containers { + if len(spec.Containers[i].Ports) == 0 { + spec.Containers[i].Ports = append(spec.Containers[i].Ports, v1.ContainerPort{ + Name: defaultPortName, + ContainerPort: defaultPort, + }) + } + } +} + +func setDefaultReplicas(spec *TFReplicaSpec) { + if spec.Replicas == nil { + spec.Replicas = Int32(1) + } +} + +// SetDefaults_TFJob sets any unspecified values to defaults. +func SetDefaults_TFJob(tfjob *TFJob) { + for _, spec := range tfjob.Spec.TFReplicaSpecs { + setDefaultReplicas(spec) + setDefaultPort(&spec.Template.Spec) + } +} diff --git a/pkg/apis/tensorflow/v1alpha2/defaults_test.go b/pkg/apis/tensorflow/v1alpha2/defaults_test.go new file mode 100644 index 0000000000..48b81dca60 --- /dev/null +++ b/pkg/apis/tensorflow/v1alpha2/defaults_test.go @@ -0,0 +1,118 @@ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package v1alpha2 + +import ( + "reflect" + "testing" + + "k8s.io/api/core/v1" +) + +const ( + testImage = "test-image:latest" +) + +func expectedTFJob() *TFJob { + return &TFJob{ + Spec: TFJobSpec{ + TFReplicaSpecs: map[TFReplicaType]*TFReplicaSpec{ + TFReplicaTypeWorker: &TFReplicaSpec{ + Replicas: Int32(1), + RestartPolicy: RestartPolicyAlways, + Template: v1.PodTemplateSpec{ + Spec: v1.PodSpec{ + Containers: []v1.Container{ + v1.Container{ + Image: testImage, + Ports: []v1.ContainerPort{ + v1.ContainerPort{ + Name: defaultPortName, + ContainerPort: defaultPort, + }, + }, + }, + }, + }, + }, + }, + }, + }, + } +} + +func TestSetDefaultTFJob(t *testing.T) { + testCases := map[string]struct { + original *TFJob + expected *TFJob + }{ + "set replicas": { + original: &TFJob{ + Spec: TFJobSpec{ + TFReplicaSpecs: map[TFReplicaType]*TFReplicaSpec{ + TFReplicaTypeWorker: &TFReplicaSpec{ + RestartPolicy: RestartPolicyAlways, + Template: v1.PodTemplateSpec{ + Spec: v1.PodSpec{ + Containers: []v1.Container{ + v1.Container{ + Image: testImage, + Ports: []v1.ContainerPort{ + v1.ContainerPort{ + Name: defaultPortName, + ContainerPort: defaultPort, + }, + }, + }, + }, + }, + }, + }, + }, + }, + }, + expected: expectedTFJob(), + }, + "set default port": { + original: &TFJob{ + Spec: TFJobSpec{ + TFReplicaSpecs: map[TFReplicaType]*TFReplicaSpec{ + TFReplicaTypeWorker: &TFReplicaSpec{ + Replicas: Int32(1), + RestartPolicy: RestartPolicyAlways, + Template: v1.PodTemplateSpec{ + Spec: v1.PodSpec{ + Containers: []v1.Container{ + v1.Container{ + Image: testImage, + }, + }, + }, + }, + }, + }, + }, + }, + expected: expectedTFJob(), + }, + } + + for name, tc := range testCases { + SetDefaults_TFJob(tc.original) + if !reflect.DeepEqual(tc.original, tc.expected) { + t.Errorf("%s: Want\n%v; Got\n %v", name, Pformat(tc.expected), Pformat(tc.original)) + } + } +} diff --git a/pkg/apis/tensorflow/v1alpha2/register.go b/pkg/apis/tensorflow/v1alpha2/register.go index 694b971720..6a00938b03 
100644 --- a/pkg/apis/tensorflow/v1alpha2/register.go +++ b/pkg/apis/tensorflow/v1alpha2/register.go @@ -45,6 +45,7 @@ func init() { // generated functions takes place in the generated files. The separation // makes the code compile even when the generated files are missing. localSchemeBuilder.Register(addKnownTypes) + localSchemeBuilder.Register(addDefaultingFuncs) } // Resource takes an unqualified resource and returns a Group-qualified GroupResource. diff --git a/pkg/apis/tensorflow/v1alpha2/utils.go b/pkg/apis/tensorflow/v1alpha2/utils.go new file mode 100644 index 0000000000..8e4a6fe69b --- /dev/null +++ b/pkg/apis/tensorflow/v1alpha2/utils.go @@ -0,0 +1,35 @@ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package v1alpha2 + +import ( + "encoding/json" + "fmt" + + log "github.com/sirupsen/logrus" +) + +// Pformat returns a pretty format output of any value that can be marshalled to JSON. 
+func Pformat(value interface{}) string { + if s, ok := value.(string); ok { + return s + } + valueJSON, err := json.MarshalIndent(value, "", " ") + if err != nil { + log.Warningf("Couldn't pretty format %v, error: %v", value, err) + return fmt.Sprintf("%v", value) + } + return string(valueJSON) +} diff --git a/pkg/apis/tensorflow/v1alpha2/zz_generated.defaults.go b/pkg/apis/tensorflow/v1alpha2/zz_generated.defaults.go new file mode 100644 index 0000000000..9170f67600 --- /dev/null +++ b/pkg/apis/tensorflow/v1alpha2/zz_generated.defaults.go @@ -0,0 +1,45 @@ +// +build !ignore_autogenerated + +/* +Copyright 2018 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// This file was autogenerated by defaulter-gen. Do not edit it manually! + +package v1alpha2 + +import ( + runtime "k8s.io/apimachinery/pkg/runtime" +) + +// RegisterDefaults adds defaulters functions to the given scheme. +// Public to allow building arbitrary schemes. +// All generated defaulters are covering - they call all nested defaulters. 
+func RegisterDefaults(scheme *runtime.Scheme) error { + scheme.AddTypeDefaultingFunc(&TFJob{}, func(obj interface{}) { SetObjectDefaults_TFJob(obj.(*TFJob)) }) + scheme.AddTypeDefaultingFunc(&TFJobList{}, func(obj interface{}) { SetObjectDefaults_TFJobList(obj.(*TFJobList)) }) + return nil +} + +func SetObjectDefaults_TFJob(in *TFJob) { + SetDefaults_TFJob(in) +} + +func SetObjectDefaults_TFJobList(in *TFJobList) { + for i := range in.Items { + a := &in.Items[i] + SetObjectDefaults_TFJob(a) + } +} diff --git a/pkg/controller/controller.go b/pkg/controller/controller.go index 2b9e7e9f43..8b5c829771 100644 --- a/pkg/controller/controller.go +++ b/pkg/controller/controller.go @@ -22,7 +22,6 @@ import ( "time" log "github.com/sirupsen/logrus" - "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -199,7 +198,7 @@ func NewTFJobController( // Set up an event handler for when tfjob resources change. tfJobInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ - AddFunc: tc.enqueueTFJob, + AddFunc: tc.addTFJob, UpdateFunc: tc.updateTFJob, // This will enter the sync loop and no-op, // because the tfjob has been deleted from the store. @@ -429,6 +428,14 @@ func genLabels(tfjobKey string) map[string]string { } } +// When a pod is added, set the defaults and enqueue the current tfjob. +func (tc *TFJobController) addTFJob(obj interface{}) { + tfjob := obj.(*tfv1alpha2.TFJob) + log.Infof("Adding tfjob: %s", tfjob.Name) + scheme.Scheme.Default(tfjob) + tc.enqueueTFJob(obj) +} + // When a pod is updated, enqueue the current tfjob. 
func (tc *TFJobController) updateTFJob(old, cur interface{}) { oldTFJob := old.(*tfv1alpha2.TFJob) From 70f2a8de87a938a0d0780efbb270c1bb7b20d24a Mon Sep 17 00:00:00 2001 From: Ce Gao Date: Wed, 21 Mar 2018 19:37:03 +0800 Subject: [PATCH 20/24] test: Add test cases (#488) * test: Add run test case Signed-off-by: Ce Gao * test: Add enqueue test Signed-off-by: Ce Gao * pod_test: Add Signed-off-by: Ce Gao * service_test: Add Signed-off-by: Ce Gao --- pkg/controller/controller_pod.go | 4 +- pkg/controller/controller_pod_test.go | 126 +++++++++++++++++ pkg/controller/controller_service_test.go | 112 +++++++++++++++ pkg/controller/controller_test.go | 164 +++++++++++----------- 4 files changed, 322 insertions(+), 84 deletions(-) create mode 100644 pkg/controller/controller_pod_test.go create mode 100644 pkg/controller/controller_service_test.go diff --git a/pkg/controller/controller_pod.go b/pkg/controller/controller_pod.go index 65fd1c73a3..da5cd747fa 100644 --- a/pkg/controller/controller_pod.go +++ b/pkg/controller/controller_pod.go @@ -253,16 +253,18 @@ func (tc *TFJobController) addPod(obj interface{}) { if controllerRef := metav1.GetControllerOf(pod); controllerRef != nil { tfjob := tc.resolveControllerRef(pod.Namespace, controllerRef) if tfjob == nil { + log.Info("This pod's tfjob does not exists") return } tfjobKey, err := KeyFunc(tfjob) if err != nil { + log.Infof("Failed to get the key of the tfjob: %v", err) return } if _, ok := pod.Labels[tfReplicaTypeLabel]; !ok { - log.Infof("This pod maybe not created by tf-operator") + log.Info("This pod maybe not created by tf-operator") return } diff --git a/pkg/controller/controller_pod_test.go b/pkg/controller/controller_pod_test.go new file mode 100644 index 0000000000..27ebcb759e --- /dev/null +++ b/pkg/controller/controller_pod_test.go @@ -0,0 +1,126 @@ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance 
with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package controller provides a Kubernetes controller for a TFJob resource. + +package controller + +import ( + "fmt" + "testing" + "time" + + "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + kubeclientset "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" + "k8s.io/client-go/tools/cache" + + tfv1alpha2 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha2" + tfjobclientset "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned" +) + +func newBasePod(name string, tfJob *tfv1alpha2.TFJob, t *testing.T) *v1.Pod { + return &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Labels: genLabels(getKey(tfJob, t)), + Namespace: tfJob.Namespace, + OwnerReferences: []metav1.OwnerReference{*metav1.NewControllerRef(tfJob, controllerKind)}, + }, + } +} + +func newPod(tfJob *tfv1alpha2.TFJob, typ string, index int, t *testing.T) *v1.Pod { + pod := newBasePod(fmt.Sprintf("%s-%d", typ, index), tfJob, t) + pod.Labels[tfReplicaTypeLabel] = typ + pod.Labels[tfReplicaIndexLabel] = fmt.Sprintf("%d", index) + return pod +} + +// create count pods with the given phase for the given tfJob +func newPodList(count int32, status v1.PodPhase, tfJob *tfv1alpha2.TFJob, typ string, start int32, t *testing.T) []*v1.Pod { + pods := []*v1.Pod{} + for i := int32(0); i < count; i++ { + newPod := newPod(tfJob, typ, int(start+i), t) + newPod.Status = v1.PodStatus{Phase: status} + pods = append(pods, newPod) + } + return pods +} + +func setPodsStatuses(podIndexer cache.Indexer, tfJob *tfv1alpha2.TFJob, typ string, 
pendingPods, activePods, succeededPods, failedPods int32, t *testing.T) { + var index int32 + for _, pod := range newPodList(pendingPods, v1.PodPending, tfJob, typ, index, t) { + podIndexer.Add(pod) + } + index += pendingPods + for _, pod := range newPodList(activePods, v1.PodRunning, tfJob, typ, index, t) { + podIndexer.Add(pod) + } + index += activePods + for _, pod := range newPodList(succeededPods, v1.PodSucceeded, tfJob, typ, index, t) { + podIndexer.Add(pod) + } + index += succeededPods + for _, pod := range newPodList(failedPods, v1.PodFailed, tfJob, typ, index, t) { + podIndexer.Add(pod) + } +} + +func TestAddPod(t *testing.T) { + // Prepare the clientset and controller for the test. + kubeClientSet := kubeclientset.NewForConfigOrDie(&rest.Config{ + Host: "", + ContentConfig: rest.ContentConfig{ + GroupVersion: &v1.SchemeGroupVersion, + }, + }, + ) + tfJobClientSet := tfjobclientset.NewForConfigOrDie(&rest.Config{ + Host: "", + ContentConfig: rest.ContentConfig{ + GroupVersion: &tfv1alpha2.SchemeGroupVersion, + }, + }, + ) + controller, _, tfJobInformerFactory := newTFJobControllerFromClient(kubeClientSet, tfJobClientSet, NoResyncPeriodFunc) + controller.tfJobListerSynced = alwaysReady + controller.podListerSynced = alwaysReady + controller.serviceListerSynced = alwaysReady + tfJobIndexer := tfJobInformerFactory.Kubeflow().V1alpha2().TFJobs().Informer().GetIndexer() + + stopCh := make(chan struct{}) + run := func(<-chan struct{}) { + controller.Run(threadCount, stopCh) + } + go run(stopCh) + + var key string + controller.syncHandler = func(tfJobKey string) (bool, error) { + key = tfJobKey + return true, nil + } + + tfJob := newTFJob(1, 0) + tfJobIndexer.Add(tfJob) + pod := newPod(tfJob, labelWorker, 0, t) + + controller.addPod(pod) + time.Sleep(sleepInterval) + if key != getKey(tfJob, t) { + t.Errorf("Failed to enqueue the TFJob %s: expected %s, got %s", tfJob.Name, getKey(tfJob, t), key) + } + close(stopCh) +} diff --git 
a/pkg/controller/controller_service_test.go b/pkg/controller/controller_service_test.go new file mode 100644 index 0000000000..edcf40b3b1 --- /dev/null +++ b/pkg/controller/controller_service_test.go @@ -0,0 +1,112 @@ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package controller provides a Kubernetes controller for a TFJob resource. + +package controller + +import ( + "fmt" + "testing" + "time" + + "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + kubeclientset "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" + "k8s.io/client-go/tools/cache" + + tfv1alpha2 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha2" + tfjobclientset "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned" +) + +func newBaseService(name string, tfJob *tfv1alpha2.TFJob, t *testing.T) *v1.Service { + return &v1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Labels: genLabels(getKey(tfJob, t)), + Namespace: tfJob.Namespace, + OwnerReferences: []metav1.OwnerReference{*metav1.NewControllerRef(tfJob, controllerKind)}, + }, + } +} + +func newService(tfJob *tfv1alpha2.TFJob, typ string, index int, t *testing.T) *v1.Service { + service := newBaseService(fmt.Sprintf("%s-%d", typ, index), tfJob, t) + service.Labels[tfReplicaTypeLabel] = typ + service.Labels[tfReplicaIndexLabel] = fmt.Sprintf("%d", index) + return service +} + +// create count pods with the given phase for the given tfJob 
+func newServiceList(count int32, tfJob *tfv1alpha2.TFJob, typ string, t *testing.T) []*v1.Service { + services := []*v1.Service{} + for i := int32(0); i < count; i++ { + newService := newService(tfJob, typ, int(i), t) + services = append(services, newService) + } + return services +} + +func setServices(serviceIndexer cache.Indexer, tfJob *tfv1alpha2.TFJob, typ string, activeWorkerServices int32, t *testing.T) { + for _, service := range newServiceList(activeWorkerServices, tfJob, typ, t) { + serviceIndexer.Add(service) + } +} + +func TestAddService(t *testing.T) { + // Prepare the clientset and controller for the test. + kubeClientSet := kubeclientset.NewForConfigOrDie(&rest.Config{ + Host: "", + ContentConfig: rest.ContentConfig{ + GroupVersion: &v1.SchemeGroupVersion, + }, + }, + ) + tfJobClientSet := tfjobclientset.NewForConfigOrDie(&rest.Config{ + Host: "", + ContentConfig: rest.ContentConfig{ + GroupVersion: &tfv1alpha2.SchemeGroupVersion, + }, + }, + ) + controller, _, tfJobInformerFactory := newTFJobControllerFromClient(kubeClientSet, tfJobClientSet, NoResyncPeriodFunc) + controller.tfJobListerSynced = alwaysReady + controller.podListerSynced = alwaysReady + controller.serviceListerSynced = alwaysReady + tfJobIndexer := tfJobInformerFactory.Kubeflow().V1alpha2().TFJobs().Informer().GetIndexer() + + stopCh := make(chan struct{}) + run := func(<-chan struct{}) { + controller.Run(threadCount, stopCh) + } + go run(stopCh) + + var key string + controller.syncHandler = func(tfJobKey string) (bool, error) { + key = tfJobKey + return true, nil + } + + tfJob := newTFJob(1, 0) + tfJobIndexer.Add(tfJob) + service := newService(tfJob, labelWorker, 0, t) + + controller.addService(service) + time.Sleep(sleepInterval) + if key != getKey(tfJob, t) { + t.Errorf("Failed to enqueue the TFJob %s: expected %s, got %s", tfJob.Name, getKey(tfJob, t), key) + } + close(stopCh) +} diff --git a/pkg/controller/controller_test.go b/pkg/controller/controller_test.go index 
602213de5a..54fddb105a 100644 --- a/pkg/controller/controller_test.go +++ b/pkg/controller/controller_test.go @@ -17,15 +17,14 @@ package controller import ( - "fmt" "testing" + "time" "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" kubeinformers "k8s.io/client-go/informers" kubeclientset "k8s.io/client-go/kubernetes" "k8s.io/client-go/rest" - "k8s.io/client-go/tools/cache" tfv1alpha2 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha2" tfjobclientset "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned" @@ -37,6 +36,9 @@ const ( testTFJobName = "test-tfjob" labelWorker = "worker" labelPS = "ps" + + sleepInterval = 2 * time.Second + threadCount = 1 ) var alwaysReady = func() bool { return true } @@ -96,90 +98,12 @@ func newTFJob(worker, ps int) *tfv1alpha2.TFJob { } func getKey(tfJob *tfv1alpha2.TFJob, t *testing.T) string { - if key, err := KeyFunc(tfJob); err != nil { + key, err := KeyFunc(tfJob) + if err != nil { t.Errorf("Unexpected error getting key for job %v: %v", tfJob.Name, err) return "" - } else { - return key - } -} - -func newBasePod(name string, tfJob *tfv1alpha2.TFJob, t *testing.T) *v1.Pod { - return &v1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: name, - Labels: genLabels(getKey(tfJob, t)), - Namespace: tfJob.Namespace, - OwnerReferences: []metav1.OwnerReference{*metav1.NewControllerRef(tfJob, controllerKind)}, - }, - } -} - -func newPod(tfJob *tfv1alpha2.TFJob, typ string, index int, t *testing.T) *v1.Pod { - pod := newBasePod(fmt.Sprintf("%s-%d", typ, index), tfJob, t) - pod.Labels[tfReplicaTypeLabel] = typ - pod.Labels[tfReplicaIndexLabel] = fmt.Sprintf("%d", index) - return pod -} - -// create count pods with the given phase for the given tfJob -func newPodList(count int32, status v1.PodPhase, tfJob *tfv1alpha2.TFJob, typ string, start int32, t *testing.T) []*v1.Pod { - pods := []*v1.Pod{} - for i := int32(0); i < count; i++ { - newPod := newPod(tfJob, typ, int(start+i), t) - newPod.Status = 
v1.PodStatus{Phase: status} - pods = append(pods, newPod) - } - return pods -} - -func setPodsStatuses(podIndexer cache.Indexer, tfJob *tfv1alpha2.TFJob, typ string, pendingPods, activePods, succeededPods, failedPods int32, t *testing.T) { - var index int32 - for _, pod := range newPodList(pendingPods, v1.PodPending, tfJob, typ, index, t) { - podIndexer.Add(pod) - } - index += pendingPods - for _, pod := range newPodList(activePods, v1.PodRunning, tfJob, typ, index, t) { - podIndexer.Add(pod) - } - index += activePods - for _, pod := range newPodList(succeededPods, v1.PodSucceeded, tfJob, typ, index, t) { - podIndexer.Add(pod) - } - index += succeededPods - for _, pod := range newPodList(failedPods, v1.PodFailed, tfJob, typ, index, t) { - podIndexer.Add(pod) - } -} - -func newService(tfJob *tfv1alpha2.TFJob, typ string, index int, t *testing.T) *v1.Service { - service := &v1.Service{ - ObjectMeta: metav1.ObjectMeta{ - Name: fmt.Sprintf("%s-%d", typ, index), - Labels: genLabels(getKey(tfJob, t)), - Namespace: tfJob.Namespace, - OwnerReferences: []metav1.OwnerReference{*metav1.NewControllerRef(tfJob, controllerKind)}, - }, - } - service.Labels[tfReplicaTypeLabel] = typ - service.Labels[tfReplicaIndexLabel] = fmt.Sprintf("%d", index) - return service -} - -// create count pods with the given phase for the given tfJob -func newServiceList(count int32, tfJob *tfv1alpha2.TFJob, typ string, t *testing.T) []*v1.Service { - services := []*v1.Service{} - for i := int32(0); i < count; i++ { - newService := newService(tfJob, typ, int(i), t) - services = append(services, newService) - } - return services -} - -func setServices(serviceIndexer cache.Indexer, tfJob *tfv1alpha2.TFJob, typ string, activeWorkerServices int32, t *testing.T) { - for _, service := range newServiceList(activeWorkerServices, tfJob, typ, t) { - serviceIndexer.Add(service) } + return key } func getCondition(tfJob *tfv1alpha2.TFJob, condition tfv1alpha2.TFJobConditionType, reason string) bool { @@ -428,3 
+352,77 @@ func TestNormalPath(t *testing.T) { } } } + +func TestRun(t *testing.T) { + // Prepare the clientset and controller for the test. + kubeClientSet := kubeclientset.NewForConfigOrDie(&rest.Config{ + Host: "", + ContentConfig: rest.ContentConfig{ + GroupVersion: &v1.SchemeGroupVersion, + }, + }, + ) + tfJobClientSet := tfjobclientset.NewForConfigOrDie(&rest.Config{ + Host: "", + ContentConfig: rest.ContentConfig{ + GroupVersion: &tfv1alpha2.SchemeGroupVersion, + }, + }, + ) + controller, _, _ := newTFJobControllerFromClient(kubeClientSet, tfJobClientSet, NoResyncPeriodFunc) + controller.tfJobListerSynced = alwaysReady + controller.podListerSynced = alwaysReady + controller.serviceListerSynced = alwaysReady + + stopCh := make(chan struct{}) + go func() { + time.Sleep(sleepInterval) + close(stopCh) + }() + err := controller.Run(threadCount, stopCh) + if err != nil { + t.Errorf("Failed to run: %v", err) + } +} + +func TestAddTFJob(t *testing.T) { + // Prepare the clientset and controller for the test. 
+ kubeClientSet := kubeclientset.NewForConfigOrDie(&rest.Config{ + Host: "", + ContentConfig: rest.ContentConfig{ + GroupVersion: &v1.SchemeGroupVersion, + }, + }, + ) + tfJobClientSet := tfjobclientset.NewForConfigOrDie(&rest.Config{ + Host: "", + ContentConfig: rest.ContentConfig{ + GroupVersion: &tfv1alpha2.SchemeGroupVersion, + }, + }, + ) + controller, _, _ := newTFJobControllerFromClient(kubeClientSet, tfJobClientSet, NoResyncPeriodFunc) + controller.tfJobListerSynced = alwaysReady + controller.podListerSynced = alwaysReady + controller.serviceListerSynced = alwaysReady + + stopCh := make(chan struct{}) + run := func(<-chan struct{}) { + controller.Run(threadCount, stopCh) + } + go run(stopCh) + + var key string + controller.syncHandler = func(tfJobKey string) (bool, error) { + key = tfJobKey + return true, nil + } + + tfJob := newTFJob(1, 0) + controller.addTFJob(tfJob) + time.Sleep(sleepInterval) + if key != getKey(tfJob, t) { + t.Errorf("Failed to enqueue the TFJob %s: expected %s, got %s", tfJob.Name, getKey(tfJob, t), key) + } + close(stopCh) +} From caf21fad8937032a3301a1feb9b22f9d58ef3445 Mon Sep 17 00:00:00 2001 From: Ce Gao Date: Thu, 22 Mar 2018 14:37:59 +0800 Subject: [PATCH 21/24] refactor: Set v2 as v1alpha2 and keep v1alpha1 Signed-off-by: Ce Gao --- cmd/tf-operator.v2/app/options/options.go | 52 ++ cmd/tf-operator.v2/app/server.go | 162 ++++++ cmd/tf-operator.v2/main.go | 49 ++ cmd/tf-operator/app/options/options.go | 33 +- cmd/tf-operator/app/server.go | 99 ++-- cmd/tf-operator/main.go | 6 +- hack/update-codegen.sh | 6 +- linter_config.json | 3 +- pkg/apis/tensorflow/helper/helpers.go | 119 ++++ pkg/apis/tensorflow/helper/helpers_test.go | 248 +++++++++ pkg/apis/tensorflow/v1alpha1/defaults.go | 58 ++ pkg/apis/tensorflow/v1alpha1/defaults_test.go | 118 ++++ pkg/apis/tensorflow/v1alpha1/doc.go | 20 + pkg/apis/tensorflow/v1alpha1/register.go | 60 +++ pkg/apis/tensorflow/v1alpha1/types.go | 193 +++++++ .../v1alpha1/zz_generated.deepcopy.go | 405 
++++++++++++++ .../v1alpha1/zz_generated.defaults.go | 45 ++ pkg/apis/tensorflow/validation/validation.go | 79 +++ .../tensorflow/validation/validation_test.go | 113 ++++ pkg/client/clientset/versioned/clientset.go | 26 +- .../versioned/fake/clientset_generated.go | 17 +- .../clientset/versioned/fake/register.go | 2 + .../clientset/versioned/scheme/register.go | 2 + .../versioned/typed/kubeflow/v1alpha1/doc.go | 18 + .../typed/kubeflow/v1alpha1/fake/doc.go | 18 + .../v1alpha1/fake/fake_kubeflow_client.go | 35 ++ .../kubeflow/v1alpha1/fake/fake_tfjob.go | 123 +++++ .../kubeflow/v1alpha1/generated_expansion.go | 16 + .../kubeflow/v1alpha1/kubeflow_client.go | 85 +++ .../typed/kubeflow/v1alpha1/tfjob.go | 152 ++++++ .../informers/externalversions/generic.go | 7 +- .../externalversions/kubeflow/interface.go | 8 + .../kubeflow/v1alpha1/interface.go | 41 ++ .../kubeflow/v1alpha1/tfjob.go | 71 +++ .../kubeflow/v1alpha1/expansion_generated.go | 25 + pkg/client/listers/kubeflow/v1alpha1/tfjob.go | 92 ++++ pkg/controller.v2/controller.go | 484 +++++++++++++++++ .../controller_pod.go | 0 .../controller_pod_test.go | 0 .../controller_ref_manager.go | 0 .../controller_service.go | 0 .../controller_service_test.go | 0 .../controller_tensorflow.go | 0 .../controller_test.go | 0 .../controller_utils.go | 0 pkg/controller/controller.go | 476 +++++----------- pkg/trainer/labels.go | 33 ++ pkg/trainer/replicas.go | 509 ++++++++++++++++++ pkg/trainer/replicas_test.go | 368 +++++++++++++ pkg/trainer/training.go | 468 ++++++++++++++++ pkg/trainer/training_test.go | 489 +++++++++++++++++ pkg/util/k8sutil/k8sutil.go | 120 +++++ pkg/util/util.go | 74 +++ 53 files changed, 5200 insertions(+), 427 deletions(-) create mode 100644 cmd/tf-operator.v2/app/options/options.go create mode 100644 cmd/tf-operator.v2/app/server.go create mode 100644 cmd/tf-operator.v2/main.go create mode 100644 pkg/apis/tensorflow/helper/helpers.go create mode 100644 pkg/apis/tensorflow/helper/helpers_test.go create 
mode 100644 pkg/apis/tensorflow/v1alpha1/defaults.go create mode 100644 pkg/apis/tensorflow/v1alpha1/defaults_test.go create mode 100644 pkg/apis/tensorflow/v1alpha1/doc.go create mode 100644 pkg/apis/tensorflow/v1alpha1/register.go create mode 100644 pkg/apis/tensorflow/v1alpha1/types.go create mode 100644 pkg/apis/tensorflow/v1alpha1/zz_generated.deepcopy.go create mode 100644 pkg/apis/tensorflow/v1alpha1/zz_generated.defaults.go create mode 100644 pkg/apis/tensorflow/validation/validation.go create mode 100644 pkg/apis/tensorflow/validation/validation_test.go create mode 100644 pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/doc.go create mode 100644 pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/fake/doc.go create mode 100644 pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/fake/fake_kubeflow_client.go create mode 100644 pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/fake/fake_tfjob.go create mode 100644 pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/generated_expansion.go create mode 100644 pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/kubeflow_client.go create mode 100644 pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/tfjob.go create mode 100644 pkg/client/informers/externalversions/kubeflow/v1alpha1/interface.go create mode 100644 pkg/client/informers/externalversions/kubeflow/v1alpha1/tfjob.go create mode 100644 pkg/client/listers/kubeflow/v1alpha1/expansion_generated.go create mode 100644 pkg/client/listers/kubeflow/v1alpha1/tfjob.go create mode 100644 pkg/controller.v2/controller.go rename pkg/{controller => controller.v2}/controller_pod.go (100%) rename pkg/{controller => controller.v2}/controller_pod_test.go (100%) rename pkg/{controller => controller.v2}/controller_ref_manager.go (100%) rename pkg/{controller => controller.v2}/controller_service.go (100%) rename pkg/{controller => controller.v2}/controller_service_test.go (100%) rename pkg/{controller => controller.v2}/controller_tensorflow.go 
(100%) rename pkg/{controller => controller.v2}/controller_test.go (100%) rename pkg/{controller => controller.v2}/controller_utils.go (100%) create mode 100644 pkg/trainer/labels.go create mode 100644 pkg/trainer/replicas.go create mode 100644 pkg/trainer/replicas_test.go create mode 100644 pkg/trainer/training.go create mode 100644 pkg/trainer/training_test.go create mode 100644 pkg/util/k8sutil/k8sutil.go create mode 100644 pkg/util/util.go diff --git a/cmd/tf-operator.v2/app/options/options.go b/cmd/tf-operator.v2/app/options/options.go new file mode 100644 index 0000000000..f7e152a13d --- /dev/null +++ b/cmd/tf-operator.v2/app/options/options.go @@ -0,0 +1,52 @@ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package options + +import ( + "flag" +) + +// ServerOption is the main context object for the controller manager. +type ServerOption struct { + Kubeconfig string + MasterURL string + Threadiness int + PrintVersion bool + JSONLogFormat bool +} + +// NewServerOption creates a new CMServer with a default config. +func NewServerOption() *ServerOption { + s := ServerOption{} + return &s +} + +// AddFlags adds flags for a specific CMServer to the specified FlagSet. 
+func (s *ServerOption) AddFlags(fs *flag.FlagSet) { + fs.StringVar(&s.Kubeconfig, "kubeconfig", "~/.kube/config", + `Path to a kubeconfig, only required if out-of-cluster.`) + + fs.StringVar(&s.MasterURL, "master", "", + `The url of the Kubernetes API server, + will overrides any value in kubeconfig, only required if out-of-cluster.`) + + fs.IntVar(&s.Threadiness, "threadiness", 2, + `How many threads to process the main logic`) + + fs.BoolVar(&s.PrintVersion, "version", false, "Show version and quit") + + fs.BoolVar(&s.JSONLogFormat, "json-log-format", true, + "Set true to use json style log format. Set false to use plaintext style log format") +} diff --git a/cmd/tf-operator.v2/app/server.go b/cmd/tf-operator.v2/app/server.go new file mode 100644 index 0000000000..0adcdfad0f --- /dev/null +++ b/cmd/tf-operator.v2/app/server.go @@ -0,0 +1,162 @@ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package app + +import ( + "fmt" + "os" + "time" + + log "github.com/sirupsen/logrus" + + "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + kubeinformers "k8s.io/client-go/informers" + kubeclientset "k8s.io/client-go/kubernetes" + restclientset "k8s.io/client-go/rest" + "k8s.io/client-go/tools/clientcmd" + election "k8s.io/client-go/tools/leaderelection" + "k8s.io/client-go/tools/leaderelection/resourcelock" + "k8s.io/client-go/tools/record" + + "github.com/kubeflow/tf-operator/cmd/tf-operator.v2/app/options" + "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha2" + tfjobclientset "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned" + "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned/scheme" + tfjobinformers "github.com/kubeflow/tf-operator/pkg/client/informers/externalversions" + controller "github.com/kubeflow/tf-operator/pkg/controller.v2" + "github.com/kubeflow/tf-operator/pkg/util/signals" + "github.com/kubeflow/tf-operator/version" +) + +var ( + // leader election config + leaseDuration = 15 * time.Second + renewDuration = 5 * time.Second + retryPeriod = 3 * time.Second +) + +const RecommendedKubeConfigPathEnv = "KUBECONFIG" + +func Run(opt *options.ServerOption) error { + + // Check if the -version flag was passed and, if so, print the version and exit. + if opt.PrintVersion { + version.PrintVersionAndExit() + } + + namespace := os.Getenv(v1alpha2.EnvKubeflowNamespace) + if len(namespace) == 0 { + log.Infof("KUBEFLOW_NAMESPACE not set, using default namespace") + namespace = metav1.NamespaceDefault + } + + // To help debugging, immediately log version. + log.Infof("%+v", version.Info()) + + // Set up signals so we handle the first shutdown signal gracefully. + stopCh := signals.SetupSignalHandler() + + // Note: ENV KUBECONFIG will overwrite user defined Kubeconfig option. 
+ if len(os.Getenv(RecommendedKubeConfigPathEnv)) > 0 { + // use the current context in kubeconfig + // This is very useful for running locally. + opt.Kubeconfig = os.Getenv(RecommendedKubeConfigPathEnv) + } + + // Get kubernetes config. + kcfg, err := clientcmd.BuildConfigFromFlags(opt.MasterURL, opt.Kubeconfig) + if err != nil { + log.Fatalf("Error building kubeconfig: %s", err.Error()) + } + + // Create clients. + kubeClientSet, leaderElectionClientSet, tfJobClientSet, err := createClientSets(kcfg) + if err != nil { + return err + } + + // Create informer factory. + kubeInformerFactory := kubeinformers.NewSharedInformerFactory(kubeClientSet, time.Second*30) + tfJobInformerFactory := tfjobinformers.NewSharedInformerFactory(tfJobClientSet, time.Second*30) + + // Create tf controller. + tc := controller.NewTFJobController(kubeClientSet, tfJobClientSet, kubeInformerFactory, tfJobInformerFactory) + + // Start informer goroutines. + go kubeInformerFactory.Start(stopCh) + go tfJobInformerFactory.Start(stopCh) + + // Set leader election start function. + run := func(<-chan struct{}) { + tc.Run(opt.Threadiness, stopCh) + } + + id, err := os.Hostname() + if err != nil { + return fmt.Errorf("Failed to get hostname: %v", err) + } + + // Prepare event clients. + eventBroadcaster := record.NewBroadcaster() + recorder := eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "tf-operator"}) + + rl := &resourcelock.EndpointsLock{ + EndpointsMeta: metav1.ObjectMeta{ + Namespace: namespace, + Name: "tf-operator", + }, + Client: leaderElectionClientSet.CoreV1(), + LockConfig: resourcelock.ResourceLockConfig{ + Identity: id, + EventRecorder: recorder, + }, + } + + // Start leader election. 
+ election.RunOrDie(election.LeaderElectionConfig{ + Lock: rl, + LeaseDuration: leaseDuration, + RenewDeadline: renewDuration, + RetryPeriod: retryPeriod, + Callbacks: election.LeaderCallbacks{ + OnStartedLeading: run, + OnStoppedLeading: func() { + log.Fatalf("leader election lost") + }, + }, + }) + + return nil +} + +func createClientSets(config *restclientset.Config) (kubeclientset.Interface, kubeclientset.Interface, tfjobclientset.Interface, error) { + kubeClientSet, err := kubeclientset.NewForConfig(restclientset.AddUserAgent(config, "tf-operator")) + if err != nil { + return nil, nil, nil, err + } + + leaderElectionClientSet, err := kubeclientset.NewForConfig(restclientset.AddUserAgent(config, "leader-election")) + if err != nil { + return nil, nil, nil, err + } + + tfJobClientSet, err := tfjobclientset.NewForConfig(config) + if err != nil { + return nil, nil, nil, err + } + + return kubeClientSet, leaderElectionClientSet, tfJobClientSet, nil +} diff --git a/cmd/tf-operator.v2/main.go b/cmd/tf-operator.v2/main.go new file mode 100644 index 0000000000..b862565efd --- /dev/null +++ b/cmd/tf-operator.v2/main.go @@ -0,0 +1,49 @@ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package main + +import ( + "flag" + + "github.com/onrik/logrus/filename" + log "github.com/sirupsen/logrus" + + "github.com/kubeflow/tf-operator/cmd/tf-operator.v2/app" + "github.com/kubeflow/tf-operator/cmd/tf-operator.v2/app/options" +) + +func init() { + // Add filename as one of the fields of the structured log message. + filenameHook := filename.NewHook() + filenameHook.Field = "filename" + log.AddHook(filenameHook) +} + +func main() { + s := options.NewServerOption() + s.AddFlags(flag.CommandLine) + + flag.Parse() + + if s.JSONLogFormat { + // Output logs in a json format so that it can be parsed by services like Stackdriver. + log.SetFormatter(&log.JSONFormatter{}) + } + + if err := app.Run(s); err != nil { + log.Fatalf("%v\n", err) + } + +} diff --git a/cmd/tf-operator/app/options/options.go b/cmd/tf-operator/app/options/options.go index f7e152a13d..5f7cb90e43 100644 --- a/cmd/tf-operator/app/options/options.go +++ b/cmd/tf-operator/app/options/options.go @@ -16,15 +16,17 @@ package options import ( "flag" + "time" ) // ServerOption is the main context object for the controller manager. type ServerOption struct { - Kubeconfig string - MasterURL string - Threadiness int - PrintVersion bool - JSONLogFormat bool + ChaosLevel int + ControllerConfigFile string + PrintVersion bool + GCInterval time.Duration + JsonLogFormat bool + EnableGangScheduling bool } // NewServerOption creates a new CMServer with a default config. @@ -33,20 +35,13 @@ func NewServerOption() *ServerOption { return &s } -// AddFlags adds flags for a specific CMServer to the specified FlagSet. 
+// AddFlags adds flags for a specific CMServer to the specified FlagSet func (s *ServerOption) AddFlags(fs *flag.FlagSet) { - fs.StringVar(&s.Kubeconfig, "kubeconfig", "~/.kube/config", - `Path to a kubeconfig, only required if out-of-cluster.`) - - fs.StringVar(&s.MasterURL, "master", "", - `The url of the Kubernetes API server, - will overrides any value in kubeconfig, only required if out-of-cluster.`) - - fs.IntVar(&s.Threadiness, "threadiness", 2, - `How many threads to process the main logic`) - + // chaos level will be removed once we have a formal tool to inject failures. + fs.IntVar(&s.ChaosLevel, "chaos-level", -1, "DO NOT USE IN PRODUCTION - level of chaos injected into the TFJob created by the operator.") fs.BoolVar(&s.PrintVersion, "version", false, "Show version and quit") - - fs.BoolVar(&s.JSONLogFormat, "json-log-format", true, - "Set true to use json style log format. Set false to use plaintext style log format") + fs.DurationVar(&s.GCInterval, "gc-interval", 10*time.Minute, "GC interval") + fs.StringVar(&s.ControllerConfigFile, "controller-config-file", "", "Path to file containing the controller config.") + fs.BoolVar(&s.JsonLogFormat, "json-log-format", true, "Set true to use json style log format. 
Set false to use plaintext style log format") + fs.BoolVar(&s.EnableGangScheduling, "enable-gang-scheduling", false, "Set true to enable gang scheduling by kube-arbitrator.") } diff --git a/cmd/tf-operator/app/server.go b/cmd/tf-operator/app/server.go index 811cebfb4d..f4794d3dcb 100644 --- a/cmd/tf-operator/app/server.go +++ b/cmd/tf-operator/app/server.go @@ -16,40 +16,37 @@ package app import ( "fmt" + "io/ioutil" "os" "time" + "github.com/ghodss/yaml" log "github.com/sirupsen/logrus" - "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - kubeinformers "k8s.io/client-go/informers" - kubeclientset "k8s.io/client-go/kubernetes" - restclientset "k8s.io/client-go/rest" - "k8s.io/client-go/tools/clientcmd" + clientset "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" election "k8s.io/client-go/tools/leaderelection" "k8s.io/client-go/tools/leaderelection/resourcelock" "k8s.io/client-go/tools/record" "github.com/kubeflow/tf-operator/cmd/tf-operator/app/options" - "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha2" - tfjobclientset "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned" + "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha1" + tfjobclient "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned" "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned/scheme" - tfjobinformers "github.com/kubeflow/tf-operator/pkg/client/informers/externalversions" + informers "github.com/kubeflow/tf-operator/pkg/client/informers/externalversions" "github.com/kubeflow/tf-operator/pkg/controller" - "github.com/kubeflow/tf-operator/pkg/util/signals" + "github.com/kubeflow/tf-operator/pkg/util" + "github.com/kubeflow/tf-operator/pkg/util/k8sutil" "github.com/kubeflow/tf-operator/version" ) var ( - // leader election config leaseDuration = 15 * time.Second renewDuration = 5 * time.Second retryPeriod = 3 * time.Second ) -const RecommendedKubeConfigPathEnv = "KUBECONFIG" - func Run(opt *options.ServerOption) error 
{ // Check if the -version flag was passed and, if so, print the version and exit. @@ -57,51 +54,40 @@ func Run(opt *options.ServerOption) error { version.PrintVersionAndExit() } - namespace := os.Getenv(v1alpha2.EnvKubeflowNamespace) + namespace := os.Getenv(util.EnvKubeflowNamespace) if len(namespace) == 0 { log.Infof("KUBEFLOW_NAMESPACE not set, using default namespace") namespace = metav1.NamespaceDefault } - // To help debugging, immediately log version. + // To help debugging, immediately log version log.Infof("%+v", version.Info()) - // Set up signals so we handle the first shutdown signal gracefully. - stopCh := signals.SetupSignalHandler() - - // Note: ENV KUBECONFIG will overwrite user defined Kubeconfig option. - if len(os.Getenv(RecommendedKubeConfigPathEnv)) > 0 { - // use the current context in kubeconfig - // This is very useful for running locally. - opt.Kubeconfig = os.Getenv(RecommendedKubeConfigPathEnv) - } - - // Get kubernetes config. - kcfg, err := clientcmd.BuildConfigFromFlags(opt.MasterURL, opt.Kubeconfig) + config, err := k8sutil.GetClusterConfig() if err != nil { - log.Fatalf("Error building kubeconfig: %s", err.Error()) + return err } - // Create clients. - kubeClientSet, leaderElectionClientSet, tfJobClientSet, err := createClientSets(kcfg) + kubeClient, leaderElectionClient, tfJobClient, err := createClients(config) if err != nil { return err } - // Create informer factory. - kubeInformerFactory := kubeinformers.NewSharedInformerFactory(kubeClientSet, time.Second*30) - tfJobInformerFactory := tfjobinformers.NewSharedInformerFactory(tfJobClientSet, time.Second*30) + controllerConfig := readControllerConfig(opt.ControllerConfigFile) - // Create tf controller. - tc := controller.NewTFJobController(kubeClientSet, tfJobClientSet, kubeInformerFactory, tfJobInformerFactory) + neverStop := make(chan struct{}) + defer close(neverStop) - // Start informer goroutines. 
- go kubeInformerFactory.Start(stopCh) - go tfJobInformerFactory.Start(stopCh) + tfJobInformerFactory := informers.NewSharedInformerFactory(tfJobClient, time.Second*30) + controller, err := controller.New(kubeClient, tfJobClient, *controllerConfig, tfJobInformerFactory, opt.EnableGangScheduling) + if err != nil { + return err + } + + go tfJobInformerFactory.Start(neverStop) - // Set leader election start function. - run := func(<-chan struct{}) { - tc.Run(opt.Threadiness, stopCh) + run := func(stopCh <-chan struct{}) { + controller.Run(1, stopCh) } id, err := os.Hostname() @@ -118,14 +104,13 @@ func Run(opt *options.ServerOption) error { Namespace: namespace, Name: "tf-operator", }, - Client: leaderElectionClientSet.CoreV1(), + Client: leaderElectionClient.CoreV1(), LockConfig: resourcelock.ResourceLockConfig{ Identity: id, EventRecorder: recorder, }, } - // Start leader election. election.RunOrDie(election.LeaderElectionConfig{ Lock: rl, LeaseDuration: leaseDuration, @@ -142,21 +127,41 @@ func Run(opt *options.ServerOption) error { return nil } -func createClientSets(config *restclientset.Config) (kubeclientset.Interface, kubeclientset.Interface, tfjobclientset.Interface, error) { - kubeClientSet, err := kubeclientset.NewForConfig(restclientset.AddUserAgent(config, "tf-operator")) +func readControllerConfig(controllerConfigFile string) *v1alpha1.ControllerConfig { + controllerConfig := &v1alpha1.ControllerConfig{} + if controllerConfigFile != "" { + log.Infof("Loading controller config from %v.", controllerConfigFile) + data, err := ioutil.ReadFile(controllerConfigFile) + if err != nil { + log.Fatalf("Could not read file: %v. 
Error: %v", controllerConfigFile, err) + return controllerConfig + } + err = yaml.Unmarshal(data, controllerConfig) + if err != nil { + log.Fatalf("Could not parse controller config; Error: %v\n", err) + } + log.Infof("ControllerConfig: %v", util.Pformat(controllerConfig)) + } else { + log.Info("No controller_config_file provided; using empty config.") + } + return controllerConfig +} + +func createClients(config *rest.Config) (clientset.Interface, clientset.Interface, tfjobclient.Interface, error) { + kubeClient, err := clientset.NewForConfig(rest.AddUserAgent(config, "tfjob_operator")) if err != nil { return nil, nil, nil, err } - leaderElectionClientSet, err := kubeclientset.NewForConfig(restclientset.AddUserAgent(config, "leader-election")) + leaderElectionClient, err := clientset.NewForConfig(rest.AddUserAgent(config, "leader-election")) if err != nil { return nil, nil, nil, err } - tfJobClientSet, err := tfjobclientset.NewForConfig(config) + tfJobClient, err := tfjobclient.NewForConfig(config) if err != nil { return nil, nil, nil, err } - return kubeClientSet, leaderElectionClientSet, tfJobClientSet, nil + return kubeClient, leaderElectionClient, tfJobClient, nil } diff --git a/cmd/tf-operator/main.go b/cmd/tf-operator/main.go index d4fcfe4bef..d01b288526 100644 --- a/cmd/tf-operator/main.go +++ b/cmd/tf-operator/main.go @@ -25,7 +25,7 @@ import ( ) func init() { - // Add filename as one of the fields of the structured log message. + // Add filename as one of the fields of the structured log message filenameHook := filename.NewHook() filenameHook.Field = "filename" log.AddHook(filenameHook) @@ -37,8 +37,8 @@ func main() { flag.Parse() - if s.JSONLogFormat { - // Output logs in a json format so that it can be parsed by services like Stackdriver. 
+ if s.JsonLogFormat { + // Output logs in a json format so that it can be parsed by services like Stackdriver log.SetFormatter(&log.JSONFormatter{}) } diff --git a/hack/update-codegen.sh b/hack/update-codegen.sh index 5fe42b523c..3906654b63 100755 --- a/hack/update-codegen.sh +++ b/hack/update-codegen.sh @@ -30,9 +30,13 @@ CODEGEN_PKG=${CODEGEN_PKG:-$(cd ${SCRIPT_ROOT}; ls -d -1 ./vendor/k8s.io/code-ge # instead of the $GOPATH directly. For normal projects this can be dropped. ${CODEGEN_PKG}/generate-groups.sh "all" \ github.com/kubeflow/tf-operator/pkg/client github.com/kubeflow/tf-operator/pkg/apis \ - tensorflow:v1alpha2 \ + tensorflow:v1alpha1,v1alpha2 \ --go-header-file ${SCRIPT_ROOT}/hack/boilerplate/boilerplate.go.txt +# Notice: The code in code-generator does not generate defaulter by default. +echo "Generating defaulters" +${GOPATH}/bin/defaulter-gen --input-dirs github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha1 -O zz_generated.defaults "$@" + # Notice: The code in code-generator does not generate defaulter by default. 
echo "Generating defaulters" ${GOPATH}/bin/defaulter-gen --input-dirs github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha2 -O zz_generated.defaults "$@" diff --git a/linter_config.json b/linter_config.json index 4f0344baad..8bacccbebf 100644 --- a/linter_config.json +++ b/linter_config.json @@ -27,7 +27,8 @@ "pkg/apis/tensorflow/v1alpha1/zz_generated.defaults.go", "pkg/apis/tensorflow/v1alpha2/zz_generated.deepcopy.go", "pkg/apis/tensorflow/v1alpha2/zz_generated.defaults.go", - "pkg/controller/controller_utils.go" + "pkg/controller/controller_utils.go", + "pkg/controller.v2/controller_utils.go" ], "Deadline": "300s", "Skip": ["pkg/client"] diff --git a/pkg/apis/tensorflow/helper/helpers.go b/pkg/apis/tensorflow/helper/helpers.go new file mode 100644 index 0000000000..632194e2c4 --- /dev/null +++ b/pkg/apis/tensorflow/helper/helpers.go @@ -0,0 +1,119 @@ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package helper + +import ( + "fmt" + + tfv1 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha1" + "github.com/kubeflow/tf-operator/pkg/util" + "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime/schema" +) + +var ( + groupVersionKind = schema.GroupVersionKind{ + Group: tfv1.GroupName, + Version: tfv1.GroupVersion, + Kind: tfv1.TFJobResourceKind, + } +) + +// AsOwner make OwnerReference according to the parameter +func AsOwner(tfJob *tfv1.TFJob) metav1.OwnerReference { + trueVar := true + // Both api.OwnerReference and metatypes.OwnerReference are combined into that. + return metav1.OwnerReference{ + APIVersion: groupVersionKind.GroupVersion().String(), + Kind: groupVersionKind.Kind, + Name: tfJob.ObjectMeta.Name, + UID: tfJob.ObjectMeta.UID, + Controller: &trueVar, + BlockOwnerDeletion: &trueVar, + } +} + +// ConfigureAcceleratorsForTFJobSpec adds any accelerator specific configuration to the pods. +func ConfigureAcceleratorsForTFJobSpec(c *tfv1.TFJobSpec, accelerators map[string]tfv1.AcceleratorConfig) error { + for _, r := range c.ReplicaSpecs { + if r.Template == nil { + return fmt.Errorf("Replica is missing Template; %v", util.Pformat(r)) + } + for i, c := range r.Template.Spec.Containers { + if c.Name == tfv1.DefaultTFContainer { + // Identify the accelerators attached to this container. + a := map[string]tfv1.AcceleratorConfig{} + + lists := []v1.ResourceList{c.Resources.Limits, c.Resources.Requests} + for _, resources := range lists { + for name, _ := range resources { + + if _, ok := accelerators[string(name)]; !ok { + continue + } + + // Add the expected mounts to the pods. + a[string(name)] = accelerators[string(name)] + } + } + + // Add accelerator information to the pod. 
+ for _, config := range a { + for _, v := range config.Volumes { + r.Template.Spec.Volumes = append(r.Template.Spec.Volumes, + v1.Volume{ + Name: v.Name, + VolumeSource: v1.VolumeSource{ + HostPath: &v1.HostPathVolumeSource{ + Path: v.HostPath, + }, + }, + }) + c.VolumeMounts = append(c.VolumeMounts, v1.VolumeMount{ + Name: v.Name, + MountPath: v.MountPath, + }) + } + + for _, envVar := range config.EnvVars { + c.Env = append(c.Env, v1.EnvVar{ + Name: envVar.Name, + Value: envVar.Value, + }) + } + } + r.Template.Spec.Containers[i] = c + break + } + } + } + return nil +} + +// Cleanup cleans up user passed spec, e.g. defaulting, transforming fields. +// TODO: move this to admission controller +func Cleanup(c *tfv1.TFJobSpec) { + // TODO(jlewi): Add logic to cleanup user provided spec; e.g. by filling in defaults. + // We should have default container images so user doesn't have to provide these. +} + +func CRDName() string { + return fmt.Sprintf("%s.%s", tfv1.CRDKindPlural, tfv1.CRDGroup) +} + +func scalingReason(from, to int) string { + return fmt.Sprintf("Current cluster size: %d, desired cluster size: %d", from, to) +} diff --git a/pkg/apis/tensorflow/helper/helpers_test.go b/pkg/apis/tensorflow/helper/helpers_test.go new file mode 100644 index 0000000000..9380555eaa --- /dev/null +++ b/pkg/apis/tensorflow/helper/helpers_test.go @@ -0,0 +1,248 @@ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package helper + +import ( + "reflect" + "testing" + + "github.com/gogo/protobuf/proto" + tfv1 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha1" + "github.com/kubeflow/tf-operator/pkg/util" + "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" +) + +func TestAddAccelertor(t *testing.T) { + type testCase struct { + in *tfv1.TFJobSpec + expected *tfv1.TFJobSpec + config map[string]tfv1.AcceleratorConfig + } + + testCases := []testCase{ + // Case 1 checks that we look at requests. + { + in: &tfv1.TFJobSpec{ + ReplicaSpecs: []*tfv1.TFReplicaSpec{ + { + Replicas: proto.Int32(2), + TFPort: proto.Int32(10), + Template: &v1.PodTemplateSpec{ + Spec: v1.PodSpec{ + Containers: []v1.Container{ + { + Name: "tensorflow", + Resources: v1.ResourceRequirements{ + Requests: map[v1.ResourceName]resource.Quantity{ + "nvidia-gpu": resource.MustParse("1"), + }, + }, + }, + }, + }, + }, + TFReplicaType: tfv1.PS, + }, + }, + }, + expected: &tfv1.TFJobSpec{ + ReplicaSpecs: []*tfv1.TFReplicaSpec{ + { + Replicas: proto.Int32(2), + TFPort: proto.Int32(10), + Template: &v1.PodTemplateSpec{ + Spec: v1.PodSpec{ + Containers: []v1.Container{ + { + Name: "tensorflow", + Resources: v1.ResourceRequirements{ + Requests: map[v1.ResourceName]resource.Quantity{ + "nvidia-gpu": resource.MustParse("1"), + }, + }, + VolumeMounts: []v1.VolumeMount{ + { + Name: "cuda-lib", + MountPath: "/usr/local/cuda", + }, + }, + }, + }, + Volumes: []v1.Volume{ + { + Name: "cuda-lib", + VolumeSource: v1.VolumeSource{ + HostPath: &v1.HostPathVolumeSource{ + Path: "/home/cuda", + }, + }, + }, + }, + }, + }, + TFReplicaType: tfv1.PS, + }, + }, + }, + config: map[string]tfv1.AcceleratorConfig{ + "nvidia-gpu": tfv1.AcceleratorConfig{ + Volumes: []tfv1.AcceleratorVolume{ + { + Name: "cuda-lib", + HostPath: "/home/cuda", + MountPath: "/usr/local/cuda", + }, + }, + }, + }, + }, + // Case 2 checks that we look at limit. 
+ { + in: &tfv1.TFJobSpec{ + ReplicaSpecs: []*tfv1.TFReplicaSpec{ + { + Replicas: proto.Int32(2), + TFPort: proto.Int32(10), + Template: &v1.PodTemplateSpec{ + Spec: v1.PodSpec{ + Containers: []v1.Container{ + { + Name: "tensorflow", + Resources: v1.ResourceRequirements{ + Limits: map[v1.ResourceName]resource.Quantity{ + "nvidia-gpu": resource.MustParse("1"), + }, + }, + }, + }, + }, + }, + TFReplicaType: tfv1.PS, + }, + }, + }, + expected: &tfv1.TFJobSpec{ + ReplicaSpecs: []*tfv1.TFReplicaSpec{ + { + Replicas: proto.Int32(2), + TFPort: proto.Int32(10), + Template: &v1.PodTemplateSpec{ + Spec: v1.PodSpec{ + Containers: []v1.Container{ + { + Name: "tensorflow", + Resources: v1.ResourceRequirements{ + Limits: map[v1.ResourceName]resource.Quantity{ + "nvidia-gpu": resource.MustParse("1"), + }, + }, + VolumeMounts: []v1.VolumeMount{ + { + Name: "cuda-lib", + MountPath: "/usr/local/cuda", + }, + }, + }, + }, + Volumes: []v1.Volume{ + { + Name: "cuda-lib", + VolumeSource: v1.VolumeSource{ + HostPath: &v1.HostPathVolumeSource{ + Path: "/home/cuda", + }, + }, + }, + }, + }, + }, + TFReplicaType: tfv1.PS, + }, + }, + }, + config: map[string]tfv1.AcceleratorConfig{ + "nvidia-gpu": tfv1.AcceleratorConfig{ + Volumes: []tfv1.AcceleratorVolume{ + { + Name: "cuda-lib", + HostPath: "/home/cuda", + MountPath: "/usr/local/cuda", + }, + }, + }, + }, + }, + // Case 3 no GPUs + { + in: &tfv1.TFJobSpec{ + ReplicaSpecs: []*tfv1.TFReplicaSpec{ + { + Replicas: proto.Int32(2), + TFPort: proto.Int32(10), + Template: &v1.PodTemplateSpec{ + Spec: v1.PodSpec{ + Containers: []v1.Container{ + { + Name: "tensorflow", + }, + }, + }, + }, + TFReplicaType: tfv1.PS, + }, + }, + }, + expected: &tfv1.TFJobSpec{ + ReplicaSpecs: []*tfv1.TFReplicaSpec{ + { + Replicas: proto.Int32(2), + TFPort: proto.Int32(10), + Template: &v1.PodTemplateSpec{ + Spec: v1.PodSpec{ + Containers: []v1.Container{ + { + Name: "tensorflow", + }, + }, + }, + }, + TFReplicaType: tfv1.PS, + }, + }, + }, + config: 
map[string]tfv1.AcceleratorConfig{ + "nvidia-gpu": tfv1.AcceleratorConfig{ + Volumes: []tfv1.AcceleratorVolume{ + { + Name: "cuda-lib", + HostPath: "/home/cuda", + MountPath: "/usr/local/cuda", + }, + }, + }, + }, + }, + } + + for _, c := range testCases { + if err := ConfigureAcceleratorsForTFJobSpec(c.in, c.config); err != nil { + t.Errorf("ConfigureAccelerators error; %v", err) + } + if !reflect.DeepEqual(c.in, c.expected) { + t.Errorf("Want\n%v; Got\n %v", util.Pformat(c.expected), util.Pformat(c.in)) + } + } +} diff --git a/pkg/apis/tensorflow/v1alpha1/defaults.go b/pkg/apis/tensorflow/v1alpha1/defaults.go new file mode 100644 index 0000000000..d87976993f --- /dev/null +++ b/pkg/apis/tensorflow/v1alpha1/defaults.go @@ -0,0 +1,58 @@ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package v1alpha1 + +import ( + "github.com/golang/protobuf/proto" + "k8s.io/apimachinery/pkg/runtime" +) + +func addDefaultingFuncs(scheme *runtime.Scheme) error { + return RegisterDefaults(scheme) +} + +// SetDefaults_TFJob sets any unspecified values to defaults +func SetDefaults_TFJob(obj *TFJob) { + c := &obj.Spec + + if c.TFImage == "" { + c.TFImage = DefaultTFImage + } + + // Fill in defaults for any replica fields (port, replica type, replica count) left unset.
+ for _, r := range c.ReplicaSpecs { + + if r.TFPort == nil { + r.TFPort = proto.Int32(TFPort) + } + + if string(r.TFReplicaType) == "" { + r.TFReplicaType = MASTER + } + + if r.Replicas == nil { + r.Replicas = proto.Int32(Replicas) + } + } + if c.TerminationPolicy == nil { + c.TerminationPolicy = &TerminationPolicySpec{ + Chief: &ChiefSpec{ + ReplicaName: "MASTER", + ReplicaIndex: 0, + }, + } + } + +} diff --git a/pkg/apis/tensorflow/v1alpha1/defaults_test.go b/pkg/apis/tensorflow/v1alpha1/defaults_test.go new file mode 100644 index 0000000000..419a647ea0 --- /dev/null +++ b/pkg/apis/tensorflow/v1alpha1/defaults_test.go @@ -0,0 +1,118 @@ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package v1alpha1 + +import ( + "reflect" + "testing" + + "github.com/gogo/protobuf/proto" + "github.com/kubeflow/tf-operator/pkg/util" + "k8s.io/api/core/v1" +) + +func TestSetDefaults_TFJob(t *testing.T) { + type testCase struct { + in *TFJob + expected *TFJob + } + + testCases := []testCase{ + { + in: &TFJob{ + Spec: TFJobSpec{ + ReplicaSpecs: []*TFReplicaSpec{ + { + Template: &v1.PodTemplateSpec{ + Spec: v1.PodSpec{ + Containers: []v1.Container{ + { + Name: "tensorflow", + }, + }, + }, + }, + }, + }, + TFImage: "tensorflow/tensorflow:1.3.0", + }, + }, + expected: &TFJob{ + Spec: TFJobSpec{ + ReplicaSpecs: []*TFReplicaSpec{ + { + Replicas: proto.Int32(1), + TFPort: proto.Int32(2222), + Template: &v1.PodTemplateSpec{ + Spec: v1.PodSpec{ + Containers: []v1.Container{ + { + Name: "tensorflow", + }, + }, + }, + }, + TFReplicaType: MASTER, + }, + }, + TFImage: "tensorflow/tensorflow:1.3.0", + TerminationPolicy: &TerminationPolicySpec{ + Chief: &ChiefSpec{ + ReplicaName: "MASTER", + ReplicaIndex: 0, + }, + }, + }, + }, + }, + { + in: &TFJob{ + Spec: TFJobSpec{ + ReplicaSpecs: []*TFReplicaSpec{ + { + TFReplicaType: PS, + }, + }, + TFImage: "tensorflow/tensorflow:1.3.0", + }, + }, + expected: &TFJob{ + Spec: TFJobSpec{ + ReplicaSpecs: []*TFReplicaSpec{ + { + Replicas: proto.Int32(1), + TFPort: proto.Int32(2222), + TFReplicaType: PS, + }, + }, + TFImage: "tensorflow/tensorflow:1.3.0", + TerminationPolicy: &TerminationPolicySpec{ + Chief: &ChiefSpec{ + ReplicaName: "MASTER", + ReplicaIndex: 0, + }, + }, + }, + }, + }, + } + + for _, c := range testCases { + SetDefaults_TFJob(c.in) + if !reflect.DeepEqual(c.in, c.expected) { + t.Errorf("Want\n%v; Got\n %v", util.Pformat(c.expected), util.Pformat(c.in)) + } + } +} diff --git a/pkg/apis/tensorflow/v1alpha1/doc.go b/pkg/apis/tensorflow/v1alpha1/doc.go new file mode 100644 index 0000000000..92db83ef13 --- /dev/null +++ b/pkg/apis/tensorflow/v1alpha1/doc.go @@ -0,0 +1,20 @@ +// Copyright 2018 The Kubeflow Authors +// +// 
Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +k8s:deepcopy-gen=package,register +// +k8s:defaulter-gen=TypeMeta + +// Package v1alpha1 is the v1alpha1 version of the API. +// +groupName=kubeflow.org +package v1alpha1 diff --git a/pkg/apis/tensorflow/v1alpha1/register.go b/pkg/apis/tensorflow/v1alpha1/register.go new file mode 100644 index 0000000000..1fe6fad84c --- /dev/null +++ b/pkg/apis/tensorflow/v1alpha1/register.go @@ -0,0 +1,60 @@ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package v1alpha1 + +import ( + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" +) + +var ( + SchemeBuilder = runtime.NewSchemeBuilder(addKnownTypes) + AddToScheme = SchemeBuilder.AddToScheme +) + +const ( + // GroupName is the group name used in this package. + GroupName = "kubeflow.org" + // TFJobResourceKind is the kind name.
+ TFJobResourceKind = "TFJob" + // GroupVersion is the version. + GroupVersion = "v1alpha1" +) + +// SchemeGroupVersion is the group version used to register these objects. +var SchemeGroupVersion = schema.GroupVersion{Group: GroupName, Version: CRDVersion} + +func init() { + // We only register manually written functions here. The registration of the + // generated functions takes place in the generated files. The separation + // makes the code compile even when the generated files are missing. + SchemeBuilder.Register(addDefaultingFuncs) +} + +// Resource takes an unqualified resource and returns a Group-qualified GroupResource. +func Resource(resource string) schema.GroupResource { + return SchemeGroupVersion.WithResource(resource).GroupResource() +} + +// addKnownTypes adds the set of types defined in this package to the supplied scheme. +func addKnownTypes(scheme *runtime.Scheme) error { + scheme.AddKnownTypes(SchemeGroupVersion, + &TFJob{}, + &TFJobList{}, + ) + metav1.AddToGroupVersion(scheme, SchemeGroupVersion) + return nil +} diff --git a/pkg/apis/tensorflow/v1alpha1/types.go b/pkg/apis/tensorflow/v1alpha1/types.go new file mode 100644 index 0000000000..26e26d3c45 --- /dev/null +++ b/pkg/apis/tensorflow/v1alpha1/types.go @@ -0,0 +1,193 @@ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package v1alpha1 + +import ( + "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +const ( + CRDKind = "tfjob" + CRDKindPlural = "tfjobs" + CRDGroup = "kubeflow.org" + CRDVersion = "v1alpha1" + // Value of the APP label that gets applied to a lot of entities. + AppLabel = "tensorflow-job" + // Defaults for the Spec + TFPort = 2222 + Replicas = 1 +) + +// +genclient +// +genclient:noStatus +// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object +// +resource:path=tfjob + +// TFJob describes tfjob info +type TFJob struct { + metav1.TypeMeta `json:",inline"` + metav1.ObjectMeta `json:"metadata,omitempty"` + Spec TFJobSpec `json:"spec"` + Status TFJobStatus `json:"status"` +} + +type TFJobSpec struct { + // TODO(jlewi): Can we get rid of this and use some value from Kubernetes or a random ID. + RuntimeId string + + // ReplicaSpecs specifies the TF replicas to run. + ReplicaSpecs []*TFReplicaSpec `json:"replicaSpecs"` + + // TFImage defines the tensorflow docker image that should be used for default parameter server + TFImage string `json:"tfImage,omitempty"` + + // TerminationPolicy specifies the condition that the tfjob should be considered finished. + TerminationPolicy *TerminationPolicySpec `json:"terminationPolicy,omitempty"` + + // SchedulerName specifies the name of scheduler which should handle the TFJob + SchedulerName string `json:"schedulerName,omitempty"` +} + +type TerminationPolicySpec struct { + // Chief policy waits for a particular process (which is the chief) to exit. + Chief *ChiefSpec `json:"chief,omitempty"` +} + +type ChiefSpec struct { + ReplicaName string `json:"replicaName"` + ReplicaIndex int `json:"replicaIndex"` +} + +// TFReplicaType determines how a set of TF processes are handled.
+type TFReplicaType string + +const ( + MASTER TFReplicaType = "MASTER" + PS TFReplicaType = "PS" + WORKER TFReplicaType = "WORKER" +) + +const ( + DefaultTFContainer string = "tensorflow" + DefaultTFImage string = "tensorflow/tensorflow:1.3.0" +) + +// TODO(jlewi): We probably want to add a name field. This would allow us to have more than 1 type of each worker. +// This might be useful if you wanted to have a separate set of workers to do eval. +type TFReplicaSpec struct { + // Replicas is the number of desired replicas. + // This is a pointer to distinguish between explicit zero and unspecified. + // Defaults to 1. + // More info: http://kubernetes.io/docs/user-guide/replication-controller#what-is-a-replication-controller + // +optional + Replicas *int32 `json:"replicas,omitempty" protobuf:"varint,1,opt,name=replicas"` + Template *v1.PodTemplateSpec `json:"template,omitempty" protobuf:"bytes,3,opt,name=template"` + // TFPort is the port to use for TF services. + TFPort *int32 `json:"tfPort,omitempty" protobuf:"varint,1,opt,name=tfPort"` + TFReplicaType `json:"tfReplicaType"` +} + +type TFJobPhase string + +const ( + TFJobPhaseNone TFJobPhase = "" + TFJobPhaseCreating TFJobPhase = "Creating" + TFJobPhaseRunning TFJobPhase = "Running" + TFJobPhaseCleanUp TFJobPhase = "CleanUp" + TFJobPhaseFailed TFJobPhase = "Failed" + TFJobPhaseDone TFJobPhase = "Done" +) + +type State string + +const ( + StateUnknown State = "Unknown" + StateRunning State = "Running" + StateSucceeded State = "Succeeded" + StateFailed State = "Failed" +) + +type TFJobStatus struct { + // Phase is the TFJob running phase + Phase TFJobPhase `json:"phase"` + Reason string `json:"reason"` + + // State indicates the state of the job. + State State `json:"state"` + + // ReplicaStatuses specifies the status of each TF replica. 
+ ReplicaStatuses []*TFReplicaStatus `json:"replicaStatuses"` +} + +type ReplicaState string + +const ( + ReplicaStateUnknown ReplicaState = "Unknown" + ReplicaStateRunning ReplicaState = "Running" + ReplicaStateFailed ReplicaState = "Failed" + ReplicaStateSucceeded ReplicaState = "Succeeded" +) + +type TFReplicaStatus struct { + TFReplicaType `json:"tf_replica_type"` + + // State is the overall state of the replica + State ReplicaState `json:"state"` + + // ReplicasStates provides the number of replicas in each status. + ReplicasStates map[ReplicaState]int +} + +// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object +// +resource:path=tfjobs + +// TFJobList is a list of TFJobs clusters. +type TFJobList struct { + metav1.TypeMeta `json:",inline"` + // Standard list metadata + // More info: http://releases.k8s.io/HEAD/docs/devel/api-conventions.md#metadata + metav1.ListMeta `json:"metadata,omitempty"` + // Items is a list of TFJobs + Items []TFJob `json:"items"` +} + +type ControllerConfig struct { + // Accelerators is a map from the name of the accelerator to the config for that accelerator. + // This should match the value specified as a container limit. + // e.g. alpha.kubernetes.io/nvidia-gpu + Accelerators map[string]AcceleratorConfig + + // Path to the file containing the grpc server source + GrpcServerFilePath string +} + +// AcceleratorVolume represents a host path that must be mounted into +// each container that needs to use GPUs. 
+type AcceleratorVolume struct { + Name string + HostPath string + MountPath string +} + +type AcceleratorConfig struct { + Volumes []AcceleratorVolume + EnvVars []EnvironmentVariableConfig +} + +type EnvironmentVariableConfig struct { + Name string + Value string +} diff --git a/pkg/apis/tensorflow/v1alpha1/zz_generated.deepcopy.go b/pkg/apis/tensorflow/v1alpha1/zz_generated.deepcopy.go new file mode 100644 index 0000000000..739628b8ef --- /dev/null +++ b/pkg/apis/tensorflow/v1alpha1/zz_generated.deepcopy.go @@ -0,0 +1,405 @@ +// +build !ignore_autogenerated + +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// This file was autogenerated by deepcopy-gen. Do not edit it manually! + +package v1alpha1 + +import ( + v1 "k8s.io/api/core/v1" + conversion "k8s.io/apimachinery/pkg/conversion" + runtime "k8s.io/apimachinery/pkg/runtime" + reflect "reflect" +) + +func init() { + SchemeBuilder.Register(RegisterDeepCopies) +} + +// RegisterDeepCopies adds deep-copy functions to the given scheme. Public +// to allow building arbitrary schemes. +// +// Deprecated: deepcopy registration will go away when static deepcopy is fully implemented. 
+func RegisterDeepCopies(scheme *runtime.Scheme) error { + return scheme.AddGeneratedDeepCopyFuncs( + conversion.GeneratedDeepCopyFunc{Fn: func(in interface{}, out interface{}, c *conversion.Cloner) error { + in.(*AcceleratorConfig).DeepCopyInto(out.(*AcceleratorConfig)) + return nil + }, InType: reflect.TypeOf(&AcceleratorConfig{})}, + conversion.GeneratedDeepCopyFunc{Fn: func(in interface{}, out interface{}, c *conversion.Cloner) error { + in.(*AcceleratorVolume).DeepCopyInto(out.(*AcceleratorVolume)) + return nil + }, InType: reflect.TypeOf(&AcceleratorVolume{})}, + conversion.GeneratedDeepCopyFunc{Fn: func(in interface{}, out interface{}, c *conversion.Cloner) error { + in.(*ChiefSpec).DeepCopyInto(out.(*ChiefSpec)) + return nil + }, InType: reflect.TypeOf(&ChiefSpec{})}, + conversion.GeneratedDeepCopyFunc{Fn: func(in interface{}, out interface{}, c *conversion.Cloner) error { + in.(*ControllerConfig).DeepCopyInto(out.(*ControllerConfig)) + return nil + }, InType: reflect.TypeOf(&ControllerConfig{})}, + conversion.GeneratedDeepCopyFunc{Fn: func(in interface{}, out interface{}, c *conversion.Cloner) error { + in.(*EnvironmentVariableConfig).DeepCopyInto(out.(*EnvironmentVariableConfig)) + return nil + }, InType: reflect.TypeOf(&EnvironmentVariableConfig{})}, + conversion.GeneratedDeepCopyFunc{Fn: func(in interface{}, out interface{}, c *conversion.Cloner) error { + in.(*TFJob).DeepCopyInto(out.(*TFJob)) + return nil + }, InType: reflect.TypeOf(&TFJob{})}, + conversion.GeneratedDeepCopyFunc{Fn: func(in interface{}, out interface{}, c *conversion.Cloner) error { + in.(*TFJobList).DeepCopyInto(out.(*TFJobList)) + return nil + }, InType: reflect.TypeOf(&TFJobList{})}, + conversion.GeneratedDeepCopyFunc{Fn: func(in interface{}, out interface{}, c *conversion.Cloner) error { + in.(*TFJobSpec).DeepCopyInto(out.(*TFJobSpec)) + return nil + }, InType: reflect.TypeOf(&TFJobSpec{})}, + conversion.GeneratedDeepCopyFunc{Fn: func(in interface{}, out interface{}, c 
*conversion.Cloner) error { + in.(*TFJobStatus).DeepCopyInto(out.(*TFJobStatus)) + return nil + }, InType: reflect.TypeOf(&TFJobStatus{})}, + conversion.GeneratedDeepCopyFunc{Fn: func(in interface{}, out interface{}, c *conversion.Cloner) error { + in.(*TFReplicaSpec).DeepCopyInto(out.(*TFReplicaSpec)) + return nil + }, InType: reflect.TypeOf(&TFReplicaSpec{})}, + conversion.GeneratedDeepCopyFunc{Fn: func(in interface{}, out interface{}, c *conversion.Cloner) error { + in.(*TFReplicaStatus).DeepCopyInto(out.(*TFReplicaStatus)) + return nil + }, InType: reflect.TypeOf(&TFReplicaStatus{})}, + conversion.GeneratedDeepCopyFunc{Fn: func(in interface{}, out interface{}, c *conversion.Cloner) error { + in.(*TerminationPolicySpec).DeepCopyInto(out.(*TerminationPolicySpec)) + return nil + }, InType: reflect.TypeOf(&TerminationPolicySpec{})}, + ) +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *AcceleratorConfig) DeepCopyInto(out *AcceleratorConfig) { + *out = *in + if in.Volumes != nil { + in, out := &in.Volumes, &out.Volumes + *out = make([]AcceleratorVolume, len(*in)) + copy(*out, *in) + } + if in.EnvVars != nil { + in, out := &in.EnvVars, &out.EnvVars + *out = make([]EnvironmentVariableConfig, len(*in)) + copy(*out, *in) + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AcceleratorConfig. +func (in *AcceleratorConfig) DeepCopy() *AcceleratorConfig { + if in == nil { + return nil + } + out := new(AcceleratorConfig) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *AcceleratorVolume) DeepCopyInto(out *AcceleratorVolume) { + *out = *in + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AcceleratorVolume. 
+func (in *AcceleratorVolume) DeepCopy() *AcceleratorVolume { + if in == nil { + return nil + } + out := new(AcceleratorVolume) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ChiefSpec) DeepCopyInto(out *ChiefSpec) { + *out = *in + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ChiefSpec. +func (in *ChiefSpec) DeepCopy() *ChiefSpec { + if in == nil { + return nil + } + out := new(ChiefSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ControllerConfig) DeepCopyInto(out *ControllerConfig) { + *out = *in + if in.Accelerators != nil { + in, out := &in.Accelerators, &out.Accelerators + *out = make(map[string]AcceleratorConfig, len(*in)) + for key, val := range *in { + newVal := new(AcceleratorConfig) + val.DeepCopyInto(newVal) + (*out)[key] = *newVal + } + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ControllerConfig. +func (in *ControllerConfig) DeepCopy() *ControllerConfig { + if in == nil { + return nil + } + out := new(ControllerConfig) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *EnvironmentVariableConfig) DeepCopyInto(out *EnvironmentVariableConfig) { + *out = *in + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new EnvironmentVariableConfig. +func (in *EnvironmentVariableConfig) DeepCopy() *EnvironmentVariableConfig { + if in == nil { + return nil + } + out := new(EnvironmentVariableConfig) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. 
in must be non-nil. +func (in *TFJob) DeepCopyInto(out *TFJob) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + in.Spec.DeepCopyInto(&out.Spec) + in.Status.DeepCopyInto(&out.Status) + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TFJob. +func (in *TFJob) DeepCopy() *TFJob { + if in == nil { + return nil + } + out := new(TFJob) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *TFJob) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } else { + return nil + } +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *TFJobList) DeepCopyInto(out *TFJobList) { + *out = *in + out.TypeMeta = in.TypeMeta + out.ListMeta = in.ListMeta + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]TFJob, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TFJobList. +func (in *TFJobList) DeepCopy() *TFJobList { + if in == nil { + return nil + } + out := new(TFJobList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *TFJobList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } else { + return nil + } +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *TFJobSpec) DeepCopyInto(out *TFJobSpec) { + *out = *in + if in.ReplicaSpecs != nil { + in, out := &in.ReplicaSpecs, &out.ReplicaSpecs + *out = make([]*TFReplicaSpec, len(*in)) + for i := range *in { + if (*in)[i] == nil { + (*out)[i] = nil + } else { + (*out)[i] = new(TFReplicaSpec) + (*in)[i].DeepCopyInto((*out)[i]) + } + } + } + if in.TerminationPolicy != nil { + in, out := &in.TerminationPolicy, &out.TerminationPolicy + if *in == nil { + *out = nil + } else { + *out = new(TerminationPolicySpec) + (*in).DeepCopyInto(*out) + } + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TFJobSpec. +func (in *TFJobSpec) DeepCopy() *TFJobSpec { + if in == nil { + return nil + } + out := new(TFJobSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *TFJobStatus) DeepCopyInto(out *TFJobStatus) { + *out = *in + if in.ReplicaStatuses != nil { + in, out := &in.ReplicaStatuses, &out.ReplicaStatuses + *out = make([]*TFReplicaStatus, len(*in)) + for i := range *in { + if (*in)[i] == nil { + (*out)[i] = nil + } else { + (*out)[i] = new(TFReplicaStatus) + (*in)[i].DeepCopyInto((*out)[i]) + } + } + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TFJobStatus. +func (in *TFJobStatus) DeepCopy() *TFJobStatus { + if in == nil { + return nil + } + out := new(TFJobStatus) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *TFReplicaSpec) DeepCopyInto(out *TFReplicaSpec) { + *out = *in + if in.Replicas != nil { + in, out := &in.Replicas, &out.Replicas + if *in == nil { + *out = nil + } else { + *out = new(int32) + **out = **in + } + } + if in.Template != nil { + in, out := &in.Template, &out.Template + if *in == nil { + *out = nil + } else { + *out = new(v1.PodTemplateSpec) + (*in).DeepCopyInto(*out) + } + } + if in.TFPort != nil { + in, out := &in.TFPort, &out.TFPort + if *in == nil { + *out = nil + } else { + *out = new(int32) + **out = **in + } + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TFReplicaSpec. +func (in *TFReplicaSpec) DeepCopy() *TFReplicaSpec { + if in == nil { + return nil + } + out := new(TFReplicaSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *TFReplicaStatus) DeepCopyInto(out *TFReplicaStatus) { + *out = *in + if in.ReplicasStates != nil { + in, out := &in.ReplicasStates, &out.ReplicasStates + *out = make(map[ReplicaState]int, len(*in)) + for key, val := range *in { + (*out)[key] = val + } + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TFReplicaStatus. +func (in *TFReplicaStatus) DeepCopy() *TFReplicaStatus { + if in == nil { + return nil + } + out := new(TFReplicaStatus) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *TerminationPolicySpec) DeepCopyInto(out *TerminationPolicySpec) { + *out = *in + if in.Chief != nil { + in, out := &in.Chief, &out.Chief + if *in == nil { + *out = nil + } else { + *out = new(ChiefSpec) + **out = **in + } + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TerminationPolicySpec. 
+func (in *TerminationPolicySpec) DeepCopy() *TerminationPolicySpec { + if in == nil { + return nil + } + out := new(TerminationPolicySpec) + in.DeepCopyInto(out) + return out +} diff --git a/pkg/apis/tensorflow/v1alpha1/zz_generated.defaults.go b/pkg/apis/tensorflow/v1alpha1/zz_generated.defaults.go new file mode 100644 index 0000000000..2d3a981187 --- /dev/null +++ b/pkg/apis/tensorflow/v1alpha1/zz_generated.defaults.go @@ -0,0 +1,45 @@ +// +build !ignore_autogenerated + +/* +Copyright 2018 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// This file was autogenerated by defaulter-gen. Do not edit it manually! + +package v1alpha1 + +import ( + runtime "k8s.io/apimachinery/pkg/runtime" +) + +// RegisterDefaults adds defaulters functions to the given scheme. +// Public to allow building arbitrary schemes. +// All generated defaulters are covering - they call all nested defaulters. 
+func RegisterDefaults(scheme *runtime.Scheme) error {
+	scheme.AddTypeDefaultingFunc(&TFJob{}, func(obj interface{}) { SetObjectDefaults_TFJob(obj.(*TFJob)) })
+	scheme.AddTypeDefaultingFunc(&TFJobList{}, func(obj interface{}) { SetObjectDefaults_TFJobList(obj.(*TFJobList)) })
+	return nil
+}
+
+func SetObjectDefaults_TFJob(in *TFJob) {
+	SetDefaults_TFJob(in)
+}
+
+func SetObjectDefaults_TFJobList(in *TFJobList) {
+	for i := range in.Items {
+		a := &in.Items[i]
+		SetObjectDefaults_TFJob(a)
+	}
+}
diff --git a/pkg/apis/tensorflow/validation/validation.go b/pkg/apis/tensorflow/validation/validation.go
new file mode 100644
index 0000000000..fa0f012319
--- /dev/null
+++ b/pkg/apis/tensorflow/validation/validation.go
@@ -0,0 +1,99 @@
+// Copyright 2018 The Kubeflow Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package validation
+
+import (
+	"errors"
+	"fmt"
+
+	tfv1 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha1"
+	"github.com/kubeflow/tf-operator/pkg/util"
+)
+
+// ValidateTFJobSpec checks that the TFJobSpec is valid.
+//
+// It returns an error when the spec or one of its replica entries is nil,
+// when the termination policy or its chief is unset, when a replica lacks
+// a Template or TFPort, declares an unknown TFReplicaType, or has no
+// container named tfv1.DefaultTFContainer, and when no replica matches the
+// chief's ReplicaName. It returns nil otherwise.
+func ValidateTFJobSpec(c *tfv1.TFJobSpec) error {
+	if c == nil {
+		return errors.New("tfJobSpec can't be nil")
+	}
+
+	if c.TerminationPolicy == nil || c.TerminationPolicy.Chief == nil {
+		return fmt.Errorf("invalid termination policy: %v", c.TerminationPolicy)
+	}
+
+	// The set of accepted replica types is the same for every replica, so
+	// build it once instead of on each loop iteration.
+	validReplicaTypes := []tfv1.TFReplicaType{tfv1.MASTER, tfv1.PS, tfv1.WORKER}
+
+	chiefExists := false
+
+	// Check that each replica has a TensorFlow container and a chief.
+	for _, r := range c.ReplicaSpecs {
+		// ReplicaSpecs holds pointers, so an entry can be nil; reject it
+		// here instead of panicking on the field accesses below.
+		if r == nil {
+			return errors.New("tfReplicaSpec can't be nil")
+		}
+
+		if r.Template == nil {
+			return fmt.Errorf("Replica is missing Template; %v", util.Pformat(r))
+		}
+
+		if r.TFReplicaType == tfv1.TFReplicaType(c.TerminationPolicy.Chief.ReplicaName) {
+			chiefExists = true
+		}
+
+		if r.TFPort == nil {
+			return errors.New("tfReplicaSpec.TFPort can't be nil.")
+		}
+
+		// Make sure the replica type is valid.
+		isValidReplicaType := false
+		for _, t := range validReplicaTypes {
+			if t == r.TFReplicaType {
+				isValidReplicaType = true
+				break
+			}
+		}
+
+		if !isValidReplicaType {
+			return fmt.Errorf("tfReplicaSpec.TFReplicaType is %v but must be one of %v", r.TFReplicaType, validReplicaTypes)
+		}
+
+		// The inner loop variable is named container so it does not shadow
+		// the spec parameter c.
+		found := false
+		for _, container := range r.Template.Spec.Containers {
+			if container.Name == tfv1.DefaultTFContainer {
+				found = true
+				break
+			}
+		}
+		if !found {
+			return fmt.Errorf("Replica type %v is missing a container named %s", r.TFReplicaType, tfv1.DefaultTFContainer)
+		}
+	}
+
+	if !chiefExists {
+		return fmt.Errorf("Missing ReplicaSpec for chief: %v", c.TerminationPolicy.Chief.ReplicaName)
+	}
+
+	return nil
+}
diff --git a/pkg/apis/tensorflow/validation/validation_test.go b/pkg/apis/tensorflow/validation/validation_test.go
new file mode 100644
index 0000000000..670c0b4517
--- /dev/null
+++ b/pkg/apis/tensorflow/validation/validation_test.go
@@ -0,0 +1,116 @@
+// Copyright 2018 The Kubeflow Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package validation
+
+import (
+	"testing"
+
+	tfv1 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha1"
+
+	"github.com/gogo/protobuf/proto"
+	"k8s.io/api/core/v1"
+)
+
+// TestValidate runs ValidateTFJobSpec over specs that have first been passed
+// through tfv1.SetObjectDefaults_TFJob and compares whether an error was
+// returned against each case's expectingError flag.
+func TestValidate(t *testing.T) {
+	type testCase struct {
+		in             *tfv1.TFJobSpec
+		expectingError bool
+	}
+
+	testCases := []testCase{
+		{
+			in: &tfv1.TFJobSpec{
+				ReplicaSpecs: []*tfv1.TFReplicaSpec{
+					{
+						Template: &v1.PodTemplateSpec{
+							Spec: v1.PodSpec{
+								Containers: []v1.Container{
+									{
+										Name: "tensorflow",
+									},
+								},
+							},
+						},
+						TFReplicaType: tfv1.MASTER,
+						Replicas:      proto.Int32(1),
+					},
+				},
+				TFImage: "tensorflow/tensorflow:1.3.0",
+			},
+			expectingError: false,
+		},
+		{
+			in: &tfv1.TFJobSpec{
+				ReplicaSpecs: []*tfv1.TFReplicaSpec{
+					{
+						Template: &v1.PodTemplateSpec{
+							Spec: v1.PodSpec{
+								Containers: []v1.Container{
+									{
+										Name: "tensorflow",
+									},
+								},
+							},
+						},
+						TFReplicaType: tfv1.WORKER,
+						Replicas:      proto.Int32(1),
+					},
+				},
+				TFImage: "tensorflow/tensorflow:1.3.0",
+			},
+			expectingError: true,
+		},
+		{
+			in: &tfv1.TFJobSpec{
+				ReplicaSpecs: []*tfv1.TFReplicaSpec{
+					{
+						Template: &v1.PodTemplateSpec{
+							Spec: v1.PodSpec{
+								Containers: []v1.Container{
+									{
+										Name: "tensorflow",
+									},
+								},
+							},
+						},
+						TFReplicaType: tfv1.WORKER,
+						Replicas:      proto.Int32(1),
+					},
+				},
+				TFImage: "tensorflow/tensorflow:1.3.0",
+				TerminationPolicy: &tfv1.TerminationPolicySpec{
+					Chief: &tfv1.ChiefSpec{
+						ReplicaName:  "WORKER",
+						ReplicaIndex: 0,
+					},
+				},
+			},
+			expectingError: false,
+		},
+	}
+
+	for i, c := range testCases {
+		job := &tfv1.TFJob{
+			Spec: *c.in,
+		}
+		tfv1.SetObjectDefaults_TFJob(job)
+		if err := ValidateTFJobSpec(&job.Spec); (err != nil) != c.expectingError {
+			t.Errorf("case %d: unexpected validation result: err=%v, expectingError=%v", i, err, c.expectingError)
+		}
+	}
+}
diff --git a/pkg/client/clientset/versioned/clientset.go b/pkg/client/clientset/versioned/clientset.go
index 897759d9c3..c7340772ae 100644
---
a/pkg/client/clientset/versioned/clientset.go +++ b/pkg/client/clientset/versioned/clientset.go @@ -15,6 +15,7 @@ package versioned import ( glog "github.com/golang/glog" + kubeflowv1alpha1 "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1" kubeflowv1alpha2 "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned/typed/kubeflow/v1alpha2" discovery "k8s.io/client-go/discovery" rest "k8s.io/client-go/rest" @@ -23,26 +24,33 @@ import ( type Interface interface { Discovery() discovery.DiscoveryInterface - KubeflowV1alpha2() kubeflowv1alpha2.KubeflowV1alpha2Interface + KubeflowV1alpha1() kubeflowv1alpha1.KubeflowV1alpha1Interface // Deprecated: please explicitly pick a version if possible. - Kubeflow() kubeflowv1alpha2.KubeflowV1alpha2Interface + Kubeflow() kubeflowv1alpha1.KubeflowV1alpha1Interface + KubeflowV1alpha2() kubeflowv1alpha2.KubeflowV1alpha2Interface } // Clientset contains the clients for groups. Each group has exactly one // version included in a Clientset. type Clientset struct { *discovery.DiscoveryClient + kubeflowV1alpha1 *kubeflowv1alpha1.KubeflowV1alpha1Client kubeflowV1alpha2 *kubeflowv1alpha2.KubeflowV1alpha2Client } -// KubeflowV1alpha2 retrieves the KubeflowV1alpha2Client -func (c *Clientset) KubeflowV1alpha2() kubeflowv1alpha2.KubeflowV1alpha2Interface { - return c.kubeflowV1alpha2 +// KubeflowV1alpha1 retrieves the KubeflowV1alpha1Client +func (c *Clientset) KubeflowV1alpha1() kubeflowv1alpha1.KubeflowV1alpha1Interface { + return c.kubeflowV1alpha1 } // Deprecated: Kubeflow retrieves the default version of KubeflowClient. // Please explicitly pick a version. 
-func (c *Clientset) Kubeflow() kubeflowv1alpha2.KubeflowV1alpha2Interface { +func (c *Clientset) Kubeflow() kubeflowv1alpha1.KubeflowV1alpha1Interface { + return c.kubeflowV1alpha1 +} + +// KubeflowV1alpha2 retrieves the KubeflowV1alpha2Client +func (c *Clientset) KubeflowV1alpha2() kubeflowv1alpha2.KubeflowV1alpha2Interface { return c.kubeflowV1alpha2 } @@ -62,6 +70,10 @@ func NewForConfig(c *rest.Config) (*Clientset, error) { } var cs Clientset var err error + cs.kubeflowV1alpha1, err = kubeflowv1alpha1.NewForConfig(&configShallowCopy) + if err != nil { + return nil, err + } cs.kubeflowV1alpha2, err = kubeflowv1alpha2.NewForConfig(&configShallowCopy) if err != nil { return nil, err @@ -79,6 +91,7 @@ func NewForConfig(c *rest.Config) (*Clientset, error) { // panics if there is an error in the config. func NewForConfigOrDie(c *rest.Config) *Clientset { var cs Clientset + cs.kubeflowV1alpha1 = kubeflowv1alpha1.NewForConfigOrDie(c) cs.kubeflowV1alpha2 = kubeflowv1alpha2.NewForConfigOrDie(c) cs.DiscoveryClient = discovery.NewDiscoveryClientForConfigOrDie(c) @@ -88,6 +101,7 @@ func NewForConfigOrDie(c *rest.Config) *Clientset { // New creates a new Clientset for the given RESTClient. 
func New(c rest.Interface) *Clientset { var cs Clientset + cs.kubeflowV1alpha1 = kubeflowv1alpha1.New(c) cs.kubeflowV1alpha2 = kubeflowv1alpha2.New(c) cs.DiscoveryClient = discovery.NewDiscoveryClient(c) diff --git a/pkg/client/clientset/versioned/fake/clientset_generated.go b/pkg/client/clientset/versioned/fake/clientset_generated.go index ba187d2627..cf557518d6 100644 --- a/pkg/client/clientset/versioned/fake/clientset_generated.go +++ b/pkg/client/clientset/versioned/fake/clientset_generated.go @@ -15,6 +15,8 @@ package fake import ( clientset "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned" + kubeflowv1alpha1 "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1" + fakekubeflowv1alpha1 "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/fake" kubeflowv1alpha2 "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned/typed/kubeflow/v1alpha2" fakekubeflowv1alpha2 "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned/typed/kubeflow/v1alpha2/fake" "k8s.io/apimachinery/pkg/runtime" @@ -57,12 +59,17 @@ func (c *Clientset) Discovery() discovery.DiscoveryInterface { var _ clientset.Interface = &Clientset{} -// KubeflowV1alpha2 retrieves the KubeflowV1alpha2Client -func (c *Clientset) KubeflowV1alpha2() kubeflowv1alpha2.KubeflowV1alpha2Interface { - return &fakekubeflowv1alpha2.FakeKubeflowV1alpha2{Fake: &c.Fake} +// KubeflowV1alpha1 retrieves the KubeflowV1alpha1Client +func (c *Clientset) KubeflowV1alpha1() kubeflowv1alpha1.KubeflowV1alpha1Interface { + return &fakekubeflowv1alpha1.FakeKubeflowV1alpha1{Fake: &c.Fake} } -// Kubeflow retrieves the KubeflowV1alpha2Client -func (c *Clientset) Kubeflow() kubeflowv1alpha2.KubeflowV1alpha2Interface { +// Kubeflow retrieves the KubeflowV1alpha1Client +func (c *Clientset) Kubeflow() kubeflowv1alpha1.KubeflowV1alpha1Interface { + return &fakekubeflowv1alpha1.FakeKubeflowV1alpha1{Fake: &c.Fake} +} + +// KubeflowV1alpha2 
retrieves the KubeflowV1alpha2Client +func (c *Clientset) KubeflowV1alpha2() kubeflowv1alpha2.KubeflowV1alpha2Interface { return &fakekubeflowv1alpha2.FakeKubeflowV1alpha2{Fake: &c.Fake} } diff --git a/pkg/client/clientset/versioned/fake/register.go b/pkg/client/clientset/versioned/fake/register.go index cb575beb0d..9cb3a32be6 100644 --- a/pkg/client/clientset/versioned/fake/register.go +++ b/pkg/client/clientset/versioned/fake/register.go @@ -14,6 +14,7 @@ package fake import ( + kubeflowv1alpha1 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha1" kubeflowv1alpha2 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha2" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" runtime "k8s.io/apimachinery/pkg/runtime" @@ -45,6 +46,7 @@ func init() { // After this, RawExtensions in Kubernetes types will serialize kube-aggregator types // correctly. func AddToScheme(scheme *runtime.Scheme) { + kubeflowv1alpha1.AddToScheme(scheme) kubeflowv1alpha2.AddToScheme(scheme) } diff --git a/pkg/client/clientset/versioned/scheme/register.go b/pkg/client/clientset/versioned/scheme/register.go index f72adde908..13de9af81a 100644 --- a/pkg/client/clientset/versioned/scheme/register.go +++ b/pkg/client/clientset/versioned/scheme/register.go @@ -14,6 +14,7 @@ package scheme import ( + kubeflowv1alpha1 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha1" kubeflowv1alpha2 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha2" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" runtime "k8s.io/apimachinery/pkg/runtime" @@ -45,6 +46,7 @@ func init() { // After this, RawExtensions in Kubernetes types will serialize kube-aggregator types // correctly. 
func AddToScheme(scheme *runtime.Scheme) { + kubeflowv1alpha1.AddToScheme(scheme) kubeflowv1alpha2.AddToScheme(scheme) } diff --git a/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/doc.go b/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/doc.go new file mode 100644 index 0000000000..8d24212e9c --- /dev/null +++ b/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/doc.go @@ -0,0 +1,18 @@ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// This package is generated by client-gen with custom arguments. + +// This package has the automatically generated typed clients. +package v1alpha1 diff --git a/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/fake/doc.go b/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/fake/doc.go new file mode 100644 index 0000000000..41d860c548 --- /dev/null +++ b/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/fake/doc.go @@ -0,0 +1,18 @@ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +// This package is generated by client-gen with custom arguments. + +// Package fake has the automatically generated clients. +package fake diff --git a/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/fake/fake_kubeflow_client.go b/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/fake/fake_kubeflow_client.go new file mode 100644 index 0000000000..26f189df83 --- /dev/null +++ b/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/fake/fake_kubeflow_client.go @@ -0,0 +1,35 @@ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +package fake + +import ( + v1alpha1 "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1" + rest "k8s.io/client-go/rest" + testing "k8s.io/client-go/testing" +) + +type FakeKubeflowV1alpha1 struct { + *testing.Fake +} + +func (c *FakeKubeflowV1alpha1) TFJobs(namespace string) v1alpha1.TFJobInterface { + return &FakeTFJobs{c, namespace} +} + +// RESTClient returns a RESTClient that is used to communicate +// with API server by this client implementation. 
+func (c *FakeKubeflowV1alpha1) RESTClient() rest.Interface { + var ret *rest.RESTClient + return ret +} diff --git a/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/fake/fake_tfjob.go b/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/fake/fake_tfjob.go new file mode 100644 index 0000000000..f42aab7738 --- /dev/null +++ b/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/fake/fake_tfjob.go @@ -0,0 +1,123 @@ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +package fake + +import ( + v1alpha1 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha1" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + labels "k8s.io/apimachinery/pkg/labels" + schema "k8s.io/apimachinery/pkg/runtime/schema" + types "k8s.io/apimachinery/pkg/types" + watch "k8s.io/apimachinery/pkg/watch" + testing "k8s.io/client-go/testing" +) + +// FakeTFJobs implements TFJobInterface +type FakeTFJobs struct { + Fake *FakeKubeflowV1alpha1 + ns string +} + +var tfjobsResource = schema.GroupVersionResource{Group: "kubeflow.org", Version: "v1alpha1", Resource: "tfjobs"} + +var tfjobsKind = schema.GroupVersionKind{Group: "kubeflow.org", Version: "v1alpha1", Kind: "TFJob"} + +// Get takes name of the tFJob, and returns the corresponding tFJob object, and an error if there is any. +func (c *FakeTFJobs) Get(name string, options v1.GetOptions) (result *v1alpha1.TFJob, err error) { + obj, err := c.Fake. 
+ Invokes(testing.NewGetAction(tfjobsResource, c.ns, name), &v1alpha1.TFJob{}) + + if obj == nil { + return nil, err + } + return obj.(*v1alpha1.TFJob), err +} + +// List takes label and field selectors, and returns the list of TFJobs that match those selectors. +func (c *FakeTFJobs) List(opts v1.ListOptions) (result *v1alpha1.TFJobList, err error) { + obj, err := c.Fake. + Invokes(testing.NewListAction(tfjobsResource, tfjobsKind, c.ns, opts), &v1alpha1.TFJobList{}) + + if obj == nil { + return nil, err + } + + label, _, _ := testing.ExtractFromListOptions(opts) + if label == nil { + label = labels.Everything() + } + list := &v1alpha1.TFJobList{} + for _, item := range obj.(*v1alpha1.TFJobList).Items { + if label.Matches(labels.Set(item.Labels)) { + list.Items = append(list.Items, item) + } + } + return list, err +} + +// Watch returns a watch.Interface that watches the requested tFJobs. +func (c *FakeTFJobs) Watch(opts v1.ListOptions) (watch.Interface, error) { + return c.Fake. + InvokesWatch(testing.NewWatchAction(tfjobsResource, c.ns, opts)) + +} + +// Create takes the representation of a tFJob and creates it. Returns the server's representation of the tFJob, and an error, if there is any. +func (c *FakeTFJobs) Create(tFJob *v1alpha1.TFJob) (result *v1alpha1.TFJob, err error) { + obj, err := c.Fake. + Invokes(testing.NewCreateAction(tfjobsResource, c.ns, tFJob), &v1alpha1.TFJob{}) + + if obj == nil { + return nil, err + } + return obj.(*v1alpha1.TFJob), err +} + +// Update takes the representation of a tFJob and updates it. Returns the server's representation of the tFJob, and an error, if there is any. +func (c *FakeTFJobs) Update(tFJob *v1alpha1.TFJob) (result *v1alpha1.TFJob, err error) { + obj, err := c.Fake. + Invokes(testing.NewUpdateAction(tfjobsResource, c.ns, tFJob), &v1alpha1.TFJob{}) + + if obj == nil { + return nil, err + } + return obj.(*v1alpha1.TFJob), err +} + +// Delete takes name of the tFJob and deletes it. Returns an error if one occurs. 
+func (c *FakeTFJobs) Delete(name string, options *v1.DeleteOptions) error { + _, err := c.Fake. + Invokes(testing.NewDeleteAction(tfjobsResource, c.ns, name), &v1alpha1.TFJob{}) + + return err +} + +// DeleteCollection deletes a collection of objects. +func (c *FakeTFJobs) DeleteCollection(options *v1.DeleteOptions, listOptions v1.ListOptions) error { + action := testing.NewDeleteCollectionAction(tfjobsResource, c.ns, listOptions) + + _, err := c.Fake.Invokes(action, &v1alpha1.TFJobList{}) + return err +} + +// Patch applies the patch and returns the patched tFJob. +func (c *FakeTFJobs) Patch(name string, pt types.PatchType, data []byte, subresources ...string) (result *v1alpha1.TFJob, err error) { + obj, err := c.Fake. + Invokes(testing.NewPatchSubresourceAction(tfjobsResource, c.ns, name, data, subresources...), &v1alpha1.TFJob{}) + + if obj == nil { + return nil, err + } + return obj.(*v1alpha1.TFJob), err +} diff --git a/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/generated_expansion.go b/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/generated_expansion.go new file mode 100644 index 0000000000..609abde95d --- /dev/null +++ b/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/generated_expansion.go @@ -0,0 +1,16 @@ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+package v1alpha1 + +type TFJobExpansion interface{} diff --git a/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/kubeflow_client.go b/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/kubeflow_client.go new file mode 100644 index 0000000000..dfb97df405 --- /dev/null +++ b/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/kubeflow_client.go @@ -0,0 +1,85 @@ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +package v1alpha1 + +import ( + v1alpha1 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha1" + "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned/scheme" + serializer "k8s.io/apimachinery/pkg/runtime/serializer" + rest "k8s.io/client-go/rest" +) + +type KubeflowV1alpha1Interface interface { + RESTClient() rest.Interface + TFJobsGetter +} + +// KubeflowV1alpha1Client is used to interact with features provided by the kubeflow.org group. +type KubeflowV1alpha1Client struct { + restClient rest.Interface +} + +func (c *KubeflowV1alpha1Client) TFJobs(namespace string) TFJobInterface { + return newTFJobs(c, namespace) +} + +// NewForConfig creates a new KubeflowV1alpha1Client for the given config. 
+func NewForConfig(c *rest.Config) (*KubeflowV1alpha1Client, error) { + config := *c + if err := setConfigDefaults(&config); err != nil { + return nil, err + } + client, err := rest.RESTClientFor(&config) + if err != nil { + return nil, err + } + return &KubeflowV1alpha1Client{client}, nil +} + +// NewForConfigOrDie creates a new KubeflowV1alpha1Client for the given config and +// panics if there is an error in the config. +func NewForConfigOrDie(c *rest.Config) *KubeflowV1alpha1Client { + client, err := NewForConfig(c) + if err != nil { + panic(err) + } + return client +} + +// New creates a new KubeflowV1alpha1Client for the given RESTClient. +func New(c rest.Interface) *KubeflowV1alpha1Client { + return &KubeflowV1alpha1Client{c} +} + +func setConfigDefaults(config *rest.Config) error { + gv := v1alpha1.SchemeGroupVersion + config.GroupVersion = &gv + config.APIPath = "/apis" + config.NegotiatedSerializer = serializer.DirectCodecFactory{CodecFactory: scheme.Codecs} + + if config.UserAgent == "" { + config.UserAgent = rest.DefaultKubernetesUserAgent() + } + + return nil +} + +// RESTClient returns a RESTClient that is used to communicate +// with API server by this client implementation. +func (c *KubeflowV1alpha1Client) RESTClient() rest.Interface { + if c == nil { + return nil + } + return c.restClient +} diff --git a/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/tfjob.go b/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/tfjob.go new file mode 100644 index 0000000000..87d02e3869 --- /dev/null +++ b/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/tfjob.go @@ -0,0 +1,152 @@ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +package v1alpha1 + +import ( + v1alpha1 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha1" + scheme "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned/scheme" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + types "k8s.io/apimachinery/pkg/types" + watch "k8s.io/apimachinery/pkg/watch" + rest "k8s.io/client-go/rest" +) + +// TFJobsGetter has a method to return a TFJobInterface. +// A group's client should implement this interface. +type TFJobsGetter interface { + TFJobs(namespace string) TFJobInterface +} + +// TFJobInterface has methods to work with TFJob resources. +type TFJobInterface interface { + Create(*v1alpha1.TFJob) (*v1alpha1.TFJob, error) + Update(*v1alpha1.TFJob) (*v1alpha1.TFJob, error) + Delete(name string, options *v1.DeleteOptions) error + DeleteCollection(options *v1.DeleteOptions, listOptions v1.ListOptions) error + Get(name string, options v1.GetOptions) (*v1alpha1.TFJob, error) + List(opts v1.ListOptions) (*v1alpha1.TFJobList, error) + Watch(opts v1.ListOptions) (watch.Interface, error) + Patch(name string, pt types.PatchType, data []byte, subresources ...string) (result *v1alpha1.TFJob, err error) + TFJobExpansion +} + +// tFJobs implements TFJobInterface +type tFJobs struct { + client rest.Interface + ns string +} + +// newTFJobs returns a TFJobs +func newTFJobs(c *KubeflowV1alpha1Client, namespace string) *tFJobs { + return &tFJobs{ + client: c.RESTClient(), + ns: namespace, + } +} + +// Get takes name of the tFJob, and returns the corresponding tFJob object, and an error if there is any. 
+func (c *tFJobs) Get(name string, options v1.GetOptions) (result *v1alpha1.TFJob, err error) { + result = &v1alpha1.TFJob{} + err = c.client.Get(). + Namespace(c.ns). + Resource("tfjobs"). + Name(name). + VersionedParams(&options, scheme.ParameterCodec). + Do(). + Into(result) + return +} + +// List takes label and field selectors, and returns the list of TFJobs that match those selectors. +func (c *tFJobs) List(opts v1.ListOptions) (result *v1alpha1.TFJobList, err error) { + result = &v1alpha1.TFJobList{} + err = c.client.Get(). + Namespace(c.ns). + Resource("tfjobs"). + VersionedParams(&opts, scheme.ParameterCodec). + Do(). + Into(result) + return +} + +// Watch returns a watch.Interface that watches the requested tFJobs. +func (c *tFJobs) Watch(opts v1.ListOptions) (watch.Interface, error) { + opts.Watch = true + return c.client.Get(). + Namespace(c.ns). + Resource("tfjobs"). + VersionedParams(&opts, scheme.ParameterCodec). + Watch() +} + +// Create takes the representation of a tFJob and creates it. Returns the server's representation of the tFJob, and an error, if there is any. +func (c *tFJobs) Create(tFJob *v1alpha1.TFJob) (result *v1alpha1.TFJob, err error) { + result = &v1alpha1.TFJob{} + err = c.client.Post(). + Namespace(c.ns). + Resource("tfjobs"). + Body(tFJob). + Do(). + Into(result) + return +} + +// Update takes the representation of a tFJob and updates it. Returns the server's representation of the tFJob, and an error, if there is any. +func (c *tFJobs) Update(tFJob *v1alpha1.TFJob) (result *v1alpha1.TFJob, err error) { + result = &v1alpha1.TFJob{} + err = c.client.Put(). + Namespace(c.ns). + Resource("tfjobs"). + Name(tFJob.Name). + Body(tFJob). + Do(). + Into(result) + return +} + +// Delete takes name of the tFJob and deletes it. Returns an error if one occurs. +func (c *tFJobs) Delete(name string, options *v1.DeleteOptions) error { + return c.client.Delete(). + Namespace(c.ns). + Resource("tfjobs"). + Name(name). + Body(options). + Do(). 
+ Error() +} + +// DeleteCollection deletes a collection of objects. +func (c *tFJobs) DeleteCollection(options *v1.DeleteOptions, listOptions v1.ListOptions) error { + return c.client.Delete(). + Namespace(c.ns). + Resource("tfjobs"). + VersionedParams(&listOptions, scheme.ParameterCodec). + Body(options). + Do(). + Error() +} + +// Patch applies the patch and returns the patched tFJob. +func (c *tFJobs) Patch(name string, pt types.PatchType, data []byte, subresources ...string) (result *v1alpha1.TFJob, err error) { + result = &v1alpha1.TFJob{} + err = c.client.Patch(pt). + Namespace(c.ns). + Resource("tfjobs"). + SubResource(subresources...). + Name(name). + Body(data). + Do(). + Into(result) + return +} diff --git a/pkg/client/informers/externalversions/generic.go b/pkg/client/informers/externalversions/generic.go index f2671fdceb..a2a165dff4 100644 --- a/pkg/client/informers/externalversions/generic.go +++ b/pkg/client/informers/externalversions/generic.go @@ -18,6 +18,7 @@ package externalversions import ( "fmt" + v1alpha1 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha1" v1alpha2 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha2" schema "k8s.io/apimachinery/pkg/runtime/schema" cache "k8s.io/client-go/tools/cache" @@ -49,7 +50,11 @@ func (f *genericInformer) Lister() cache.GenericLister { // TODO extend this to unknown resources with a client pool func (f *sharedInformerFactory) ForResource(resource schema.GroupVersionResource) (GenericInformer, error) { switch resource { - // Group=Kubeflow, Version=V1alpha2 + // Group=Kubeflow, Version=V1alpha1 + case v1alpha1.SchemeGroupVersion.WithResource("tfjobs"): + return &genericInformer{resource: resource.GroupResource(), informer: f.Kubeflow().V1alpha1().TFJobs().Informer()}, nil + + // Group=Kubeflow, Version=V1alpha2 case v1alpha2.SchemeGroupVersion.WithResource("tfjobs"): return &genericInformer{resource: resource.GroupResource(), informer: f.Kubeflow().V1alpha2().TFJobs().Informer()}, 
nil diff --git a/pkg/client/informers/externalversions/kubeflow/interface.go b/pkg/client/informers/externalversions/kubeflow/interface.go index 1753f9cb56..e79b27801c 100644 --- a/pkg/client/informers/externalversions/kubeflow/interface.go +++ b/pkg/client/informers/externalversions/kubeflow/interface.go @@ -18,11 +18,14 @@ package kubeflow import ( internalinterfaces "github.com/kubeflow/tf-operator/pkg/client/informers/externalversions/internalinterfaces" + v1alpha1 "github.com/kubeflow/tf-operator/pkg/client/informers/externalversions/kubeflow/v1alpha1" v1alpha2 "github.com/kubeflow/tf-operator/pkg/client/informers/externalversions/kubeflow/v1alpha2" ) // Interface provides access to each of this group's versions. type Interface interface { + // V1alpha1 provides access to shared informers for resources in V1alpha1. + V1alpha1() v1alpha1.Interface // V1alpha2 provides access to shared informers for resources in V1alpha2. V1alpha2() v1alpha2.Interface } @@ -36,6 +39,11 @@ func New(f internalinterfaces.SharedInformerFactory) Interface { return &group{f} } +// V1alpha1 returns a new v1alpha1.Interface. +func (g *group) V1alpha1() v1alpha1.Interface { + return v1alpha1.New(g.SharedInformerFactory) +} + // V1alpha2 returns a new v1alpha2.Interface. func (g *group) V1alpha2() v1alpha2.Interface { return v1alpha2.New(g.SharedInformerFactory) diff --git a/pkg/client/informers/externalversions/kubeflow/v1alpha1/interface.go b/pkg/client/informers/externalversions/kubeflow/v1alpha1/interface.go new file mode 100644 index 0000000000..d6d535ee33 --- /dev/null +++ b/pkg/client/informers/externalversions/kubeflow/v1alpha1/interface.go @@ -0,0 +1,41 @@ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// This file was automatically generated by informer-gen + +package v1alpha1 + +import ( + internalinterfaces "github.com/kubeflow/tf-operator/pkg/client/informers/externalversions/internalinterfaces" +) + +// Interface provides access to all the informers in this group version. +type Interface interface { + // TFJobs returns a TFJobInformer. + TFJobs() TFJobInformer +} + +type version struct { + internalinterfaces.SharedInformerFactory +} + +// New returns a new Interface. +func New(f internalinterfaces.SharedInformerFactory) Interface { + return &version{f} +} + +// TFJobs returns a TFJobInformer. +func (v *version) TFJobs() TFJobInformer { + return &tFJobInformer{factory: v.SharedInformerFactory} +} diff --git a/pkg/client/informers/externalversions/kubeflow/v1alpha1/tfjob.go b/pkg/client/informers/externalversions/kubeflow/v1alpha1/tfjob.go new file mode 100644 index 0000000000..719d577179 --- /dev/null +++ b/pkg/client/informers/externalversions/kubeflow/v1alpha1/tfjob.go @@ -0,0 +1,71 @@ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +// This file was automatically generated by informer-gen + +package v1alpha1 + +import ( + tensorflow_v1alpha1 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha1" + versioned "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned" + internalinterfaces "github.com/kubeflow/tf-operator/pkg/client/informers/externalversions/internalinterfaces" + v1alpha1 "github.com/kubeflow/tf-operator/pkg/client/listers/kubeflow/v1alpha1" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + runtime "k8s.io/apimachinery/pkg/runtime" + watch "k8s.io/apimachinery/pkg/watch" + cache "k8s.io/client-go/tools/cache" + time "time" +) + +// TFJobInformer provides access to a shared informer and lister for +// TFJobs. +type TFJobInformer interface { + Informer() cache.SharedIndexInformer + Lister() v1alpha1.TFJobLister +} + +type tFJobInformer struct { + factory internalinterfaces.SharedInformerFactory +} + +// NewTFJobInformer constructs a new informer for TFJob type. +// Always prefer using an informer factory to get a shared informer instead of getting an independent +// one. This reduces memory footprint and number of connections to the server. 
+func NewTFJobInformer(client versioned.Interface, namespace string, resyncPeriod time.Duration, indexers cache.Indexers) cache.SharedIndexInformer { + return cache.NewSharedIndexInformer( + &cache.ListWatch{ + ListFunc: func(options v1.ListOptions) (runtime.Object, error) { + return client.KubeflowV1alpha1().TFJobs(namespace).List(options) + }, + WatchFunc: func(options v1.ListOptions) (watch.Interface, error) { + return client.KubeflowV1alpha1().TFJobs(namespace).Watch(options) + }, + }, + &tensorflow_v1alpha1.TFJob{}, + resyncPeriod, + indexers, + ) +} + +func defaultTFJobInformer(client versioned.Interface, resyncPeriod time.Duration) cache.SharedIndexInformer { + return NewTFJobInformer(client, v1.NamespaceAll, resyncPeriod, cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc}) +} + +func (f *tFJobInformer) Informer() cache.SharedIndexInformer { + return f.factory.InformerFor(&tensorflow_v1alpha1.TFJob{}, defaultTFJobInformer) +} + +func (f *tFJobInformer) Lister() v1alpha1.TFJobLister { + return v1alpha1.NewTFJobLister(f.Informer().GetIndexer()) +} diff --git a/pkg/client/listers/kubeflow/v1alpha1/expansion_generated.go b/pkg/client/listers/kubeflow/v1alpha1/expansion_generated.go new file mode 100644 index 0000000000..13eb4845c0 --- /dev/null +++ b/pkg/client/listers/kubeflow/v1alpha1/expansion_generated.go @@ -0,0 +1,25 @@ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// This file was automatically generated by lister-gen + +package v1alpha1 + +// TFJobListerExpansion allows custom methods to be added to +// TFJobLister. +type TFJobListerExpansion interface{} + +// TFJobNamespaceListerExpansion allows custom methods to be added to +// TFJobNamespaceLister. +type TFJobNamespaceListerExpansion interface{} diff --git a/pkg/client/listers/kubeflow/v1alpha1/tfjob.go b/pkg/client/listers/kubeflow/v1alpha1/tfjob.go new file mode 100644 index 0000000000..82c329c3d9 --- /dev/null +++ b/pkg/client/listers/kubeflow/v1alpha1/tfjob.go @@ -0,0 +1,92 @@ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// This file was automatically generated by lister-gen + +package v1alpha1 + +import ( + v1alpha1 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha1" + "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/client-go/tools/cache" +) + +// TFJobLister helps list TFJobs. +type TFJobLister interface { + // List lists all TFJobs in the indexer. + List(selector labels.Selector) (ret []*v1alpha1.TFJob, err error) + // TFJobs returns an object that can list and get TFJobs. + TFJobs(namespace string) TFJobNamespaceLister + TFJobListerExpansion +} + +// tFJobLister implements the TFJobLister interface. +type tFJobLister struct { + indexer cache.Indexer +} + +// NewTFJobLister returns a new TFJobLister. 
+func NewTFJobLister(indexer cache.Indexer) TFJobLister { + return &tFJobLister{indexer: indexer} +} + +// List lists all TFJobs in the indexer. +func (s *tFJobLister) List(selector labels.Selector) (ret []*v1alpha1.TFJob, err error) { + err = cache.ListAll(s.indexer, selector, func(m interface{}) { + ret = append(ret, m.(*v1alpha1.TFJob)) + }) + return ret, err +} + +// TFJobs returns an object that can list and get TFJobs. +func (s *tFJobLister) TFJobs(namespace string) TFJobNamespaceLister { + return tFJobNamespaceLister{indexer: s.indexer, namespace: namespace} +} + +// TFJobNamespaceLister helps list and get TFJobs. +type TFJobNamespaceLister interface { + // List lists all TFJobs in the indexer for a given namespace. + List(selector labels.Selector) (ret []*v1alpha1.TFJob, err error) + // Get retrieves the TFJob from the indexer for a given namespace and name. + Get(name string) (*v1alpha1.TFJob, error) + TFJobNamespaceListerExpansion +} + +// tFJobNamespaceLister implements the TFJobNamespaceLister +// interface. +type tFJobNamespaceLister struct { + indexer cache.Indexer + namespace string +} + +// List lists all TFJobs in the indexer for a given namespace. +func (s tFJobNamespaceLister) List(selector labels.Selector) (ret []*v1alpha1.TFJob, err error) { + err = cache.ListAllByNamespace(s.indexer, s.namespace, selector, func(m interface{}) { + ret = append(ret, m.(*v1alpha1.TFJob)) + }) + return ret, err +} + +// Get retrieves the TFJob from the indexer for a given namespace and name. 
+func (s tFJobNamespaceLister) Get(name string) (*v1alpha1.TFJob, error) { + obj, exists, err := s.indexer.GetByKey(s.namespace + "/" + name) + if err != nil { + return nil, err + } + if !exists { + return nil, errors.NewNotFound(v1alpha1.Resource("tfjob"), name) + } + return obj.(*v1alpha1.TFJob), nil +} diff --git a/pkg/controller.v2/controller.go b/pkg/controller.v2/controller.go new file mode 100644 index 0000000000..8b5c829771 --- /dev/null +++ b/pkg/controller.v2/controller.go @@ -0,0 +1,484 @@ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package controller provides a Kubernetes controller for a TFJob resource. 
+ +package controller + +import ( + "fmt" + "strings" + "time" + + log "github.com/sirupsen/logrus" + "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/util/runtime" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + "k8s.io/apimachinery/pkg/util/wait" + kubeinformers "k8s.io/client-go/informers" + kubeclientset "k8s.io/client-go/kubernetes" + "k8s.io/client-go/kubernetes/scheme" + typedcorev1 "k8s.io/client-go/kubernetes/typed/core/v1" + corelisters "k8s.io/client-go/listers/core/v1" + "k8s.io/client-go/tools/cache" + "k8s.io/client-go/tools/record" + "k8s.io/client-go/util/workqueue" + + tfv1alpha2 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha2" + tfjobclientset "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned" + tfjobscheme "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned/scheme" + tfjobinformers "github.com/kubeflow/tf-operator/pkg/client/informers/externalversions" + tfjoblisters "github.com/kubeflow/tf-operator/pkg/client/listers/kubeflow/v1alpha2" +) + +const ( + controllerName = "tf-operator" + + // labels for pods and servers. + tfReplicaTypeLabel = "tf-replica-type" + tfReplicaIndexLabel = "tf-replica-index" + + hit = "hit" + noHit = "no-hit" + + defaultPortStr = "2222" +) + +// controllerKind contains the schema.GroupVersionKind for this controller type. +var controllerKind = tfv1alpha2.SchemeGroupVersion.WithKind("TFJob") + +var groupVersionKind = schema.GroupVersionKind{ + Group: tfv1alpha2.GroupName, + Version: tfv1alpha2.GroupVersion, + Kind: tfv1alpha2.TFJobResourceKind, +} + +// TFJobControllerConfiguration contains configuration of tf-operator. +// DefaultTFJobControllerConfiguration is the suggested tf-operator configuration for production.
+type TFJobControllerConfiguration struct { + // ReconcilerSyncLoopPeriod is the amount of time the reconciler sync states loop + // wait between two reconciler sync. + // It is set to 15 sec by default. + // TODO(cph): maybe we can let it grows by multiple in the future + // and up to 5 minutes to reduce idle loop. + // e.g. 15s, 30s, 60s, 120s... + ReconcilerSyncLoopPeriod metav1.Duration +} + +// DefaultTFJobControllerConfiguration is the suggested tf-operator configuration for production. +var DefaultTFJobControllerConfiguration TFJobControllerConfiguration = TFJobControllerConfiguration{ + ReconcilerSyncLoopPeriod: metav1.Duration{Duration: 15 * time.Second}, +} + +type TFJobController struct { + config TFJobControllerConfiguration + + // podControl is used to add or delete pods. + podControl PodControlInterface + + // serviceControl is used to add or delete services. + serviceControl ServiceControlInterface + + // kubeClientSet is a standard kubernetes clientset. + kubeClientSet kubeclientset.Interface + + // tfJobClientSet is a clientset for CRD TFJob. + tfJobClientSet tfjobclientset.Interface + + // To allow injection of syncTFJob for testing. + syncHandler func(tfJobKey string) (bool, error) + + updateStatusHandler func(tfjob *tfv1alpha2.TFJob) error + + // Listers for TFJob, Pod and Service + // tfJobLister can list/get tfjobs from the shared informer's store. + tfJobLister tfjoblisters.TFJobLister + + // podLister can list/get pods from the shared informer's store. + podLister corelisters.PodLister + + // serviceLister can list/get services from the shared informer's store. + serviceLister corelisters.ServiceLister + + // tfJobListerSynced returns true if the tfjob store has been synced at least once. + tfJobListerSynced cache.InformerSynced + + // podListerSynced returns true if the pod store has been synced at least once. + podListerSynced cache.InformerSynced + + // serviceListerSynced returns true if the service store has been synced at least once. 
+ serviceListerSynced cache.InformerSynced + + // A TTLCache of pod/services creates/deletes each tfjob expects to see + // We use TFJob namespace/name + TFReplicaType + pods/services as an expectation key, + // For example, there is a TFJob with namespace "tf-operator" and name "tfjob-abc": + // { + // "PS": { + // "Replicas": 2, + // }, + // "Worker": { + // "Replicas": 4, + // } + // } + // We will create 4 expectations: + // - "tf-operator/tfjob-abc/ps/services", expects 2 adds. + // - "tf-operator/tfjob-abc/ps/pods", expects 2 adds. + // - "tf-operator/tfjob-abc/worker/services", expects 4 adds. + // - "tf-operator/tfjob-abc/worker/pods", expects 4 adds. + expectations ControllerExpectationsInterface + + // workQueue is a rate limited work queue. This is used to queue work to be + // processed instead of performing it as soon as a change happens. This + // means we can ensure we only process a fixed amount of resources at a + // time, and makes it easy to ensure we are never processing the same item + // simultaneously in two different workers. + workQueue workqueue.RateLimitingInterface + + // recorder is an event recorder for recording Event resources to the + // Kubernetes API. + recorder record.EventRecorder +} + +// NewTFJobController returns a new TFJob controller. 
+func NewTFJobController( + kubeClientSet kubeclientset.Interface, + tfJobClientSet tfjobclientset.Interface, + kubeInformerFactory kubeinformers.SharedInformerFactory, + tfJobInformerFactory tfjobinformers.SharedInformerFactory) *TFJobController { + + tfjobscheme.AddToScheme(scheme.Scheme) + + log.Debug("Creating event broadcaster") + eventBroadcaster := record.NewBroadcaster() + eventBroadcaster.StartLogging(log.Infof) + eventBroadcaster.StartRecordingToSink(&typedcorev1.EventSinkImpl{Interface: kubeClientSet.CoreV1().Events("")}) + recorder := eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: controllerName}) + + realPodControl := RealPodControl{ + KubeClient: kubeClientSet, + Recorder: eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "tfjob-controller"}), + } + + realServiceControl := RealServiceControl{ + KubeClient: kubeClientSet, + Recorder: eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "tfjob-controller"}), + } + + // Create new TFJobController. + tc := &TFJobController{ + podControl: realPodControl, + serviceControl: realServiceControl, + kubeClientSet: kubeClientSet, + tfJobClientSet: tfJobClientSet, + expectations: NewControllerExpectations(), + workQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "tfjobs"), + recorder: recorder, + } + + // Set sync handler. + tc.syncHandler = tc.syncTFJob + tc.updateStatusHandler = tc.updateTFJobStatus + + // Create tfjob informer. + tfJobInformer := tfJobInformerFactory.Kubeflow().V1alpha2().TFJobs() + + // Set up an event handler for when tfjob resources change. + tfJobInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ + AddFunc: tc.addTFJob, + UpdateFunc: tc.updateTFJob, + // This will enter the sync loop and no-op, + // because the tfjob has been deleted from the store. 
+ DeleteFunc: tc.enqueueTFJob, + }) + + tc.tfJobLister = tfJobInformer.Lister() + tc.tfJobListerSynced = tfJobInformer.Informer().HasSynced + + // Create pod informer. + podInformer := kubeInformerFactory.Core().V1().Pods() + + // Set up an event handler for when pod resources change + podInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ + AddFunc: tc.addPod, + UpdateFunc: tc.updatePod, + DeleteFunc: tc.deletePod, + }) + + tc.podLister = podInformer.Lister() + tc.podListerSynced = podInformer.Informer().HasSynced + + // Create service informer. + serviceInformer := kubeInformerFactory.Core().V1().Services() + + // Set up an event handler for when service resources change. + serviceInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ + AddFunc: tc.addService, + UpdateFunc: tc.updateService, + DeleteFunc: tc.deleteService, + }) + + tc.serviceLister = serviceInformer.Lister() + tc.serviceListerSynced = serviceInformer.Informer().HasSynced + + return tc +} + +// Run will set up the event handlers for types we are interested in, as well +// as syncing informer caches and starting workers. It will block until stopCh +// is closed, at which point it will shutdown the workqueue and wait for +// workers to finish processing their current work items. +func (tc *TFJobController) Run(threadiness int, stopCh <-chan struct{}) error { + defer runtime.HandleCrash() + defer tc.workQueue.ShutDown() + + // Start the informer factories to begin populating the informer caches. + log.Info("Starting TFJob controller") + + // Wait for the caches to be synced before starting workers. 
+ log.Info("Waiting for informer caches to sync") + if ok := cache.WaitForCacheSync(stopCh, tc.tfJobListerSynced); !ok { + return fmt.Errorf("failed to wait for tfjob caches to sync") + } + + if ok := cache.WaitForCacheSync(stopCh, tc.podListerSynced); !ok { + return fmt.Errorf("failed to wait for pod caches to sync") + } + + if ok := cache.WaitForCacheSync(stopCh, tc.serviceListerSynced); !ok { + return fmt.Errorf("failed to wait for service caches to sync") + } + + log.Infof("Starting %v workers", threadiness) + // Launch workers to process TFJob resources. + for i := 0; i < threadiness; i++ { + go wait.Until(tc.runWorker, time.Second, stopCh) + } + + log.Info("Started workers") + <-stopCh + log.Info("Shutting down workers") + + return nil +} + +// runWorker is a long-running function that will continually call the +// processNextWorkItem function in order to read and process a message on the +// workqueue. +func (tc *TFJobController) runWorker() { + for tc.processNextWorkItem() { + } +} + +// processNextWorkItem will read a single work item off the workqueue and +// attempt to process it, by calling the syncHandler. +func (tc *TFJobController) processNextWorkItem() bool { + key, quit := tc.workQueue.Get() + if quit { + return false + } + defer tc.workQueue.Done(key) + + forget, err := tc.syncHandler(key.(string)) + if err == nil { + if forget { + tc.workQueue.Forget(key) + } + return true + } + + utilruntime.HandleError(fmt.Errorf("Error syncing tfjob: %v", err)) + tc.workQueue.AddRateLimited(key) + + return true +} + +func (tc *TFJobController) enqueueTFJob(tfjob interface{}) { + key, err := KeyFunc(tfjob) + if err != nil { + utilruntime.HandleError(fmt.Errorf("Couldn't get key for tfjob object %#v: %v", tfjob, err)) + return + } + + tc.workQueue.Add(key) +} + +// syncTFJob will sync the tfjob with the given key if it has had its expectations fulfilled, meaning +// it did not expect to see any more of its pods/services created or deleted. 
+// This function is not meant to be invoked concurrently with the same key. +func (tc *TFJobController) syncTFJob(key string) (bool, error) { + startTime := time.Now() + defer func() { + log.Infof("Finished syncing tfjob %q (%v)", key, time.Since(startTime)) + }() + + namespace, name, err := cache.SplitMetaNamespaceKey(key) + if err != nil { + return false, err + } + + sharedtfjob, err := tc.tfJobLister.TFJobs(namespace).Get(name) + if err != nil { + if errors.IsNotFound(err) { + log.Infof("TFJob has been deleted: %v", key) + // jm.expectations.DeleteExpectations(key) + return true, nil + } + return false, err + } + + tfjob := sharedtfjob.DeepCopy() + tfjobNeedsSync := tc.satisfiedExpectations(tfjob) + + var reconcileTFJobsErr error + if tfjobNeedsSync && tfjob.DeletionTimestamp == nil { + reconcileTFJobsErr = tc.reconcileTFJobs(tfjob) + } + + if reconcileTFJobsErr != nil { + return false, reconcileTFJobsErr + } + + return true, err +} + +// reconcileTFJobs checks and updates replicas for each given TFReplicaSpec. +// It will requeue the tfjob in case of an error while creating/deleting pods/services. +func (tc *TFJobController) reconcileTFJobs(tfjob *tfv1alpha2.TFJob) error { + + pods, err := tc.getPodsForTFJob(tfjob) + + if err != nil { + log.Infof("getPodsForTFJob error %v", err) + return err + } + + services, err := tc.getServicesForTFJob(tfjob) + + if err != nil { + log.Infof("getServicesForTFJob error %v", err) + return err + } + + // Diff current active pods/services with replicas. 
+ for rtype, spec := range tfjob.Spec.TFReplicaSpecs { + err = tc.reconcilePods(tfjob, pods, rtype, spec) + if err != nil { + log.Infof("reconcilePods error %v", err) + return err + } + + err = tc.reconcileServices(tfjob, services, rtype, spec) + + if err != nil { + log.Infof("reconcileServices error %v", err) + return err + } + } + + return nil +} + +func genGeneralName(tfjobKey, rtype, index string) string { + n := tfjobKey + "-" + rtype + "-" + index + return strings.Replace(n, "/", "-", -1) +} + +// satisfiedExpectations returns true if the required adds/dels for the given tfjob have been observed. +// Add/del counts are established by the controller at sync time, and updated as controllees are observed by the controller +// manager. +func (tc *TFJobController) satisfiedExpectations(tfjob *tfv1alpha2.TFJob) bool { + satisfied := false + tfjobKey, err := KeyFunc(tfjob) + if err != nil { + utilruntime.HandleError(fmt.Errorf("Couldn't get key for tfjob object %#v: %v", tfjob, err)) + return false + } + + for rtype, _ := range tfjob.Spec.TFReplicaSpecs { + // Check the expectations of the pods. + expectationPodsKey := genExpectationPodsKey(tfjobKey, string(rtype)) + satisfied = satisfied || tc.expectations.SatisfiedExpectations(expectationPodsKey) + + // Check the expectations of the services. + expectationServicesKey := genExpectationServicesKey(tfjobKey, string(rtype)) + satisfied = satisfied || tc.expectations.SatisfiedExpectations(expectationServicesKey) + } + + return satisfied +} + +func genLabels(tfjobKey string) map[string]string { + return map[string]string{ + "group_name": tfv1alpha2.GroupName, + "tf_job_key": strings.Replace(tfjobKey, "/", "-", -1), + } +} + +// When a tfjob is added, set the defaults and enqueue the current tfjob.
+func (tc *TFJobController) addTFJob(obj interface{}) { + tfjob := obj.(*tfv1alpha2.TFJob) + log.Infof("Adding tfjob: %s", tfjob.Name) + scheme.Scheme.Default(tfjob) + tc.enqueueTFJob(obj) +} + +// When a tfjob is updated, enqueue the current tfjob. +func (tc *TFJobController) updateTFJob(old, cur interface{}) { + oldTFJob := old.(*tfv1alpha2.TFJob) + log.Infof("Updating tfjob: %s", oldTFJob.Name) + tc.enqueueTFJob(cur) +} + +func (tc *TFJobController) updateTFJobStatus(tfjob *tfv1alpha2.TFJob) error { + _, err := tc.tfJobClientSet.KubeflowV1alpha2().TFJobs(tfjob.Namespace).Update(tfjob) + return err +} + +// resolveControllerRef returns the tfjob referenced by a ControllerRef, +// or nil if the ControllerRef could not be resolved to a matching tfjob +// of the correct Kind. +func (tc *TFJobController) resolveControllerRef(namespace string, controllerRef *metav1.OwnerReference) *tfv1alpha2.TFJob { + // We can't look up by UID, so look up by Name and then verify UID. + // Don't even try to look up by Name if it's the wrong Kind. + if controllerRef.Kind != controllerKind.Kind { + return nil + } + tfjob, err := tc.tfJobLister.TFJobs(namespace).Get(controllerRef.Name) + if err != nil { + return nil + } + if tfjob.UID != controllerRef.UID { + // The controller we found with this Name is not the same one that the + // ControllerRef points to.
+ return nil + } + return tfjob +} + +func genOwnerReference(tfjob *tfv1alpha2.TFJob) *metav1.OwnerReference { + boolPtr := func(b bool) *bool { return &b } + controllerRef := &metav1.OwnerReference{ + APIVersion: groupVersionKind.GroupVersion().String(), + Kind: groupVersionKind.Kind, + Name: tfjob.Name, + UID: tfjob.UID, + BlockOwnerDeletion: boolPtr(true), + Controller: boolPtr(true), + } + + return controllerRef +} diff --git a/pkg/controller/controller_pod.go b/pkg/controller.v2/controller_pod.go similarity index 100% rename from pkg/controller/controller_pod.go rename to pkg/controller.v2/controller_pod.go diff --git a/pkg/controller/controller_pod_test.go b/pkg/controller.v2/controller_pod_test.go similarity index 100% rename from pkg/controller/controller_pod_test.go rename to pkg/controller.v2/controller_pod_test.go diff --git a/pkg/controller/controller_ref_manager.go b/pkg/controller.v2/controller_ref_manager.go similarity index 100% rename from pkg/controller/controller_ref_manager.go rename to pkg/controller.v2/controller_ref_manager.go diff --git a/pkg/controller/controller_service.go b/pkg/controller.v2/controller_service.go similarity index 100% rename from pkg/controller/controller_service.go rename to pkg/controller.v2/controller_service.go diff --git a/pkg/controller/controller_service_test.go b/pkg/controller.v2/controller_service_test.go similarity index 100% rename from pkg/controller/controller_service_test.go rename to pkg/controller.v2/controller_service_test.go diff --git a/pkg/controller/controller_tensorflow.go b/pkg/controller.v2/controller_tensorflow.go similarity index 100% rename from pkg/controller/controller_tensorflow.go rename to pkg/controller.v2/controller_tensorflow.go diff --git a/pkg/controller/controller_test.go b/pkg/controller.v2/controller_test.go similarity index 100% rename from pkg/controller/controller_test.go rename to pkg/controller.v2/controller_test.go diff --git a/pkg/controller/controller_utils.go 
b/pkg/controller.v2/controller_utils.go similarity index 100% rename from pkg/controller/controller_utils.go rename to pkg/controller.v2/controller_utils.go diff --git a/pkg/controller/controller.go b/pkg/controller/controller.go index 8b5c829771..130e5d3ffa 100644 --- a/pkg/controller/controller.go +++ b/pkg/controller/controller.go @@ -12,260 +12,152 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Package controller provides a Kubernetes controller for a TFJob resource. - +// Package controller provides a Kubernetes controller for a TensorFlow job resource. package controller import ( + "errors" "fmt" - "strings" "time" log "github.com/sirupsen/logrus" "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/errors" + apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/apimachinery/pkg/util/runtime" utilruntime "k8s.io/apimachinery/pkg/util/runtime" "k8s.io/apimachinery/pkg/util/wait" - kubeinformers "k8s.io/client-go/informers" - kubeclientset "k8s.io/client-go/kubernetes" + "k8s.io/client-go/kubernetes" "k8s.io/client-go/kubernetes/scheme" typedcorev1 "k8s.io/client-go/kubernetes/typed/core/v1" - corelisters "k8s.io/client-go/listers/core/v1" "k8s.io/client-go/tools/cache" "k8s.io/client-go/tools/record" "k8s.io/client-go/util/workqueue" - tfv1alpha2 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha2" - tfjobclientset "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned" - tfjobscheme "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned/scheme" - tfjobinformers "github.com/kubeflow/tf-operator/pkg/client/informers/externalversions" - tfjoblisters "github.com/kubeflow/tf-operator/pkg/client/listers/kubeflow/v1alpha2" + tfv1alpha1 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha1" + tfjobclient "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned" + 
kubeflowscheme "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned/scheme" + informers "github.com/kubeflow/tf-operator/pkg/client/informers/externalversions" + listers "github.com/kubeflow/tf-operator/pkg/client/listers/kubeflow/v1alpha1" + "github.com/kubeflow/tf-operator/pkg/trainer" ) const ( - controllerName = "tf-operator" - - // labels for pods and servers. - tfReplicaTypeLabel = "tf-replica-type" - tfReplicaIndexLabel = "tf-replica-index" - - hit = "hit" - noHit = "no-hit" - - defaultPortStr = "2222" + controllerName = "kubeflow" ) -// controllerKind contains the schema.GroupVersionKind for this controller type. -var controllerKind = tfv1alpha2.SchemeGroupVersion.WithKind("TFJob") - -var groupVersionKind = schema.GroupVersionKind{ - Group: tfv1alpha2.GroupName, - Version: tfv1alpha2.GroupVersion, - Kind: tfv1alpha2.TFJobResourceKind, -} - -// TFJobControllerConfiguration contains configuration of tf-operator. -// DefaultTimerConfig is the suggested tf-operator configuration for production. -type TFJobControllerConfiguration struct { - // ReconcilerSyncLoopPeriod is the amount of time the reconciler sync states loop - // wait between two reconciler sync. - // It is set to 15 sec by default. - // TODO(cph): maybe we can let it grows by multiple in the future - // and up to 5 minutes to reduce idle loop. - // e.g. 15s, 30s, 60s, 120s... - ReconcilerSyncLoopPeriod metav1.Duration -} - -// DefaultTFJobControllerConfiguration is the suggested tf-operator configuration for production. -var DefaultTFJobControllerConfiguration TFJobControllerConfiguration = TFJobControllerConfiguration{ - ReconcilerSyncLoopPeriod: metav1.Duration{Duration: 15 * time.Second}, -} - -type TFJobController struct { - config TFJobControllerConfiguration - - // podControl is used to add or delete pods. - podControl PodControlInterface +var ( + ErrVersionOutdated = errors.New("requested version is outdated in apiserver") - // serviceControl is used to add or delete services. 
- serviceControl ServiceControlInterface + // IndexerInformer uses a delta queue, therefore for deletes we have to use this + // key function but it should be just fine for non delete events. + keyFunc = cache.DeletionHandlingMetaNamespaceKeyFunc - // kubeClientSet is a standard kubernetes clientset. - kubeClientSet kubeclientset.Interface - - // tfJobClientSet is a clientset for CRD TFJob. - tfJobClientSet tfjobclientset.Interface - - // To allow injection of syncTFJob for testing. - syncHandler func(tfJobKey string) (bool, error) - - updateStatusHandler func(tfjob *tfv1alpha2.TFJob) error - - // Listers for TFJob, Pod and Service - // tfJobLister can list/get tfjobs from the shared informer's store. - tfJobLister tfjoblisters.TFJobLister - - // podLister can list/get pods from the shared informer's store. - podLister corelisters.PodLister - - // serviceLister can list/get services from the shared informer's store. - serviceLister corelisters.ServiceLister - - // tfJobListerSynced returns true if the tfjob store has been synced at least once. - tfJobListerSynced cache.InformerSynced + // DefaultJobBackOff is the default backoff period, exported for the e2e test + DefaultJobBackOff = 10 * time.Second + // MaxJobBackOff is the max backoff period, exported for the e2e test + MaxJobBackOff = 360 * time.Second +) - // podListerSynced returns true if the pod store has been synced at least once. - podListerSynced cache.InformerSynced +type Controller struct { + KubeClient kubernetes.Interface + TFJobClient tfjobclient.Interface - // serviceListerSynced returns true if the service store has been synced at least once.
- serviceListerSynced cache.InformerSynced + config tfv1alpha1.ControllerConfig + jobs map[string]*trainer.TrainingJob - // A TTLCache of pod/services creates/deletes each tfjob expects to see - // We use TFJob namespace/name + TFReplicaType + pods/services as an expectation key, - // For example, there is a TFJob with namespace "tf-operator" and name "tfjob-abc": - // { - // "PS": { - // "Replicas": 2, - // }, - // "Worker": { - // "Replicas": 4, - // } - // } - // We will create 4 expectations: - // - "tf-operator/tfjob-abc/ps/services", expects 2 adds. - // - "tf-operator/tfjob-abc/ps/pods", expects 2 adds. - // - "tf-operator/tfjob-abc/worker/services", expects 4 adds. - // - "tf-operator/tfjob-abc/worker/pods", expects 4 adds. - expectations ControllerExpectationsInterface + TFJobLister listers.TFJobLister + TFJobSynced cache.InformerSynced - // workQueue is a rate limited work queue. This is used to queue work to be + // WorkQueue is a rate limited work queue. This is used to queue work to be // processed instead of performing it as soon as a change happens. This // means we can ensure we only process a fixed amount of resources at a // time, and makes it easy to ensure we are never processing the same item // simultaneously in two different workers. - workQueue workqueue.RateLimitingInterface + WorkQueue workqueue.RateLimitingInterface // recorder is an event recorder for recording Event resources to the // Kubernetes API. recorder record.EventRecorder -} -// NewTFJobController returns a new TFJob controller. 
-func NewTFJobController( - kubeClientSet kubeclientset.Interface, - tfJobClientSet tfjobclientset.Interface, - kubeInformerFactory kubeinformers.SharedInformerFactory, - tfJobInformerFactory tfjobinformers.SharedInformerFactory) *TFJobController { + syncHandler func(jobKey string) (bool, error) + + enableGangScheduling bool +} - tfjobscheme.AddToScheme(scheme.Scheme) +func New(kubeClient kubernetes.Interface, tfJobClient tfjobclient.Interface, + config tfv1alpha1.ControllerConfig, tfJobInformerFactory informers.SharedInformerFactory, + enableGangScheduling bool) (*Controller, error) { + tfJobInformer := tfJobInformerFactory.Kubeflow().V1alpha1().TFJobs() + kubeflowscheme.AddToScheme(scheme.Scheme) log.Debug("Creating event broadcaster") eventBroadcaster := record.NewBroadcaster() eventBroadcaster.StartLogging(log.Infof) - eventBroadcaster.StartRecordingToSink(&typedcorev1.EventSinkImpl{Interface: kubeClientSet.CoreV1().Events("")}) + eventBroadcaster.StartRecordingToSink(&typedcorev1.EventSinkImpl{Interface: kubeClient.CoreV1().Events("")}) recorder := eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: controllerName}) - realPodControl := RealPodControl{ - KubeClient: kubeClientSet, - Recorder: eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "tfjob-controller"}), + controller := &Controller{ + KubeClient: kubeClient, + TFJobClient: tfJobClient, + WorkQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "TFjobs"), + recorder: recorder, + // TODO(jlewi)): What to do about cluster.Cluster? + jobs: make(map[string]*trainer.TrainingJob), + config: config, + enableGangScheduling: enableGangScheduling, } - realServiceControl := RealServiceControl{ - KubeClient: kubeClientSet, - Recorder: eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "tfjob-controller"}), - } - - // Create new TFJobController. 
- tc := &TFJobController{ - podControl: realPodControl, - serviceControl: realServiceControl, - kubeClientSet: kubeClientSet, - tfJobClientSet: tfJobClientSet, - expectations: NewControllerExpectations(), - workQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "tfjobs"), - recorder: recorder, - } - - // Set sync handler. - tc.syncHandler = tc.syncTFJob - tc.updateStatusHandler = tc.updateTFJobStatus - - // Create tfjob informer. - tfJobInformer := tfJobInformerFactory.Kubeflow().V1alpha2().TFJobs() - - // Set up an event handler for when tfjob resources change. - tfJobInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ - AddFunc: tc.addTFJob, - UpdateFunc: tc.updateTFJob, - // This will enter the sync loop and no-op, - // because the tfjob has been deleted from the store. - DeleteFunc: tc.enqueueTFJob, - }) - - tc.tfJobLister = tfJobInformer.Lister() - tc.tfJobListerSynced = tfJobInformer.Informer().HasSynced - - // Create pod informer. - podInformer := kubeInformerFactory.Core().V1().Pods() - - // Set up an event handler for when pod resources change - podInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ - AddFunc: tc.addPod, - UpdateFunc: tc.updatePod, - DeleteFunc: tc.deletePod, - }) - - tc.podLister = podInformer.Lister() - tc.podListerSynced = podInformer.Informer().HasSynced - - // Create service informer. - serviceInformer := kubeInformerFactory.Core().V1().Services() - - // Set up an event handler for when service resources change. 
- serviceInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ - AddFunc: tc.addService, - UpdateFunc: tc.updateService, - DeleteFunc: tc.deleteService, - }) - - tc.serviceLister = serviceInformer.Lister() - tc.serviceListerSynced = serviceInformer.Informer().HasSynced - - return tc + log.Info("Setting up event handlers") + // Set up an event handler for when Foo resources change + tfJobInformer.Informer().AddEventHandler( + cache.FilteringResourceEventHandler{ + FilterFunc: func(obj interface{}) bool { + switch t := obj.(type) { + case *tfv1alpha1.TFJob: + log.Debugf("filter tfjob name: %v", t.Name) + return true + default: + return false + } + }, + Handler: cache.ResourceEventHandlerFuncs{ + AddFunc: controller.enqueueController, + UpdateFunc: func(oldObj, newObj interface{}) { + controller.enqueueController(newObj) + }, + DeleteFunc: controller.enqueueController, + }, + }) + + controller.TFJobLister = tfJobInformer.Lister() + controller.TFJobSynced = tfJobInformer.Informer().HasSynced + controller.syncHandler = controller.syncTFJob + + return controller, nil } // Run will set up the event handlers for types we are interested in, as well // as syncing informer caches and starting workers. It will block until stopCh // is closed, at which point it will shutdown the workqueue and wait for // workers to finish processing their current work items. -func (tc *TFJobController) Run(threadiness int, stopCh <-chan struct{}) error { +func (c *Controller) Run(threadiness int, stopCh <-chan struct{}) error { defer runtime.HandleCrash() - defer tc.workQueue.ShutDown() + defer c.WorkQueue.ShutDown() - // Start the informer factories to begin populating the informer caches. + // Start the informer factories to begin populating the informer caches log.Info("Starting TFJob controller") - // Wait for the caches to be synced before starting workers. 
+ // Wait for the caches to be synced before starting workers log.Info("Waiting for informer caches to sync") - if ok := cache.WaitForCacheSync(stopCh, tc.tfJobListerSynced); !ok { - return fmt.Errorf("failed to wait for tfjob caches to sync") - } - - if ok := cache.WaitForCacheSync(stopCh, tc.podListerSynced); !ok { - return fmt.Errorf("failed to wait for pod caches to sync") - } - - if ok := cache.WaitForCacheSync(stopCh, tc.serviceListerSynced); !ok { - return fmt.Errorf("failed to wait for service caches to sync") + if ok := cache.WaitForCacheSync(stopCh, c.TFJobSynced); !ok { + return fmt.Errorf("failed to wait for caches to sync") } log.Infof("Starting %v workers", threadiness) - // Launch workers to process TFJob resources. + // Launch workers to process TFJob resources for i := 0; i < threadiness; i++ { - go wait.Until(tc.runWorker, time.Second, stopCh) + go wait.Until(c.runWorker, time.Second, stopCh) } log.Info("Started workers") @@ -278,207 +170,103 @@ func (tc *TFJobController) Run(threadiness int, stopCh <-chan struct{}) error { // runWorker is a long-running function that will continually call the // processNextWorkItem function in order to read and process a message on the // workqueue. -func (tc *TFJobController) runWorker() { - for tc.processNextWorkItem() { +func (c *Controller) runWorker() { + for c.processNextWorkItem() { } } // processNextWorkItem will read a single work item off the workqueue and // attempt to process it, by calling the syncHandler. 
-func (tc *TFJobController) processNextWorkItem() bool { - key, quit := tc.workQueue.Get() +func (c *Controller) processNextWorkItem() bool { + key, quit := c.WorkQueue.Get() if quit { return false } - defer tc.workQueue.Done(key) + defer c.WorkQueue.Done(key) - forget, err := tc.syncHandler(key.(string)) + forget, err := c.syncHandler(key.(string)) if err == nil { if forget { - tc.workQueue.Forget(key) + c.WorkQueue.Forget(key) } return true } - utilruntime.HandleError(fmt.Errorf("Error syncing tfjob: %v", err)) - tc.workQueue.AddRateLimited(key) + utilruntime.HandleError(fmt.Errorf("Error syncing job: %v", err)) + c.WorkQueue.AddRateLimited(key) return true } -func (tc *TFJobController) enqueueTFJob(tfjob interface{}) { - key, err := KeyFunc(tfjob) - if err != nil { - utilruntime.HandleError(fmt.Errorf("Couldn't get key for tfjob object %#v: %v", tfjob, err)) - return - } - - tc.workQueue.Add(key) -} - -// syncTFJob will sync the tfjob with the given key if it has had its expectations fulfilled, meaning -// it did not expect to see any more of its pods/services created or deleted. -// This function is not meant to be invoked concurrently with the same key. -func (tc *TFJobController) syncTFJob(key string) (bool, error) { +// syncTFJob will sync the job with the given. This function is not meant to be invoked +// concurrently with the same key. +// +// When a job is completely processed it will return true indicating that its ok to forget about this job since +// no more processing will occur for it. 
+func (c *Controller) syncTFJob(key string) (bool, error) { startTime := time.Now() defer func() { - log.Infof("Finished syncing tfjob %q (%v)", key, time.Since(startTime)) + log.Debugf("Finished syncing job %q (%v)", key, time.Since(startTime)) }() - namespace, name, err := cache.SplitMetaNamespaceKey(key) + ns, name, err := cache.SplitMetaNamespaceKey(key) if err != nil { return false, err } + if len(ns) == 0 || len(name) == 0 { + return false, fmt.Errorf("invalid job key %q: either namespace or name is missing", key) + } + + tfJob, err := c.TFJobLister.TFJobs(ns).Get(name) - sharedtfjob, err := tc.tfJobLister.TFJobs(namespace).Get(name) if err != nil { - if errors.IsNotFound(err) { - log.Infof("TFJob has been deleted: %v", key) - // jm.expectations.DeleteExpectations(key) + if apierrors.IsNotFound(err) { + log.Debugf("Job has been deleted: %v", key) return true, nil } return false, err } - tfjob := sharedtfjob.DeepCopy() - tfjobNeedsSync := tc.satisfiedExpectations(tfjob) - - var reconcileTFJobsErr error - if tfjobNeedsSync && tfjob.DeletionTimestamp == nil { - reconcileTFJobsErr = tc.reconcileTFJobs(tfjob) - } - - if reconcileTFJobsErr != nil { - return false, reconcileTFJobsErr - } - - return true, err -} - -// reconcileTFJobs checks and updates replicas for each given TFReplicaSpec. -// It will requeue the tfjob in case of an error while creating/deleting pods/services. -func (tc *TFJobController) reconcileTFJobs(tfjob *tfv1alpha2.TFJob) error { - - pods, err := tc.getPodsForTFJob(tfjob) - - if err != nil { - log.Infof("getPodsForTFJob error %v", err) - return err - } - - services, err := tc.getServicesForTFJob(tfjob) - - if err != nil { - log.Infof("getServicesForTFJob error %v", err) - return err - } - - // Diff current active pods/services with replicas. 
- for rtype, spec := range tfjob.Spec.TFReplicaSpecs { - err = tc.reconcilePods(tfjob, pods, rtype, spec) - if err != nil { - log.Infof("reconcilePods error %v", err) - return err - } - - err = tc.reconcileServices(tfjob, services, rtype, spec) + // Create a new TrainingJob if there is no TrainingJob stored for it in the jobs map or if the UID's don't match. + // The UID's won't match in the event we deleted the job and then recreated the job with the same name. + if cJob, ok := c.jobs[key]; !ok || cJob.UID() != tfJob.UID { + nc, err := trainer.NewJob(c.KubeClient, c.TFJobClient, c.recorder, tfJob, &c.config) if err != nil { - log.Infof("reconcileServices error %v", err) - return err + return false, err } + c.jobs[key] = nc } - return nil -} - -func genGeneralName(tfjobKey, rtype, index string) string { - n := tfjobKey + "-" + rtype + "-" + index - return strings.Replace(n, "/", "-", -1) -} + nc := c.jobs[key] -// satisfiedExpectations returns true if the required adds/dels for the given tfjob have been observed. -// Add/del counts are established by the controller at sync time, and updated as controllees are observed by the controller -// manager. -func (tc *TFJobController) satisfiedExpectations(tfjob *tfv1alpha2.TFJob) bool { - satisfied := false - tfjobKey, err := KeyFunc(tfjob) - if err != nil { - utilruntime.HandleError(fmt.Errorf("Couldn't get key for tfjob object %#v: %v", tfjob, err)) - return false + if err := nc.Reconcile(&c.config, c.enableGangScheduling); err != nil { + return false, err } - for rtype, _ := range tfjob.Spec.TFReplicaSpecs { - // Check the expectations of the pods. - expectationPodsKey := genExpectationPodsKey(tfjobKey, string(rtype)) - satisfied = satisfied || tc.expectations.SatisfiedExpectations(expectationPodsKey) + tfJob, err = c.TFJobClient.KubeflowV1alpha1().TFJobs(tfJob.ObjectMeta.Namespace).Get(tfJob.ObjectMeta.Name, metav1.GetOptions{}) - // Check the expectations of the services. 
- expectationServicesKey := genExpectationServicesKey(tfjobKey, string(rtype)) - satisfied = satisfied || tc.expectations.SatisfiedExpectations(expectationServicesKey) + if err != nil { + return false, err } - return satisfied -} - -func genLabels(tfjobKey string) map[string]string { - return map[string]string{ - "group_name": tfv1alpha2.GroupName, - "tf_job_key": strings.Replace(tfjobKey, "/", "-", -1), + // TODO(jlewi): This logic will need to change when/if we get rid of phases and move to conditions. At that + // case we should forget about a job when the appropriate condition is reached. + if tfJob.Status.Phase == tfv1alpha1.TFJobPhaseCleanUp { + return true, nil + } else { + return false, nil } -} - -// When a pod is added, set the defaults and enqueue the current tfjob. -func (tc *TFJobController) addTFJob(obj interface{}) { - tfjob := obj.(*tfv1alpha2.TFJob) - log.Infof("Adding tfjob: %s", tfjob.Name) - scheme.Scheme.Default(tfjob) - tc.enqueueTFJob(obj) -} -// When a pod is updated, enqueue the current tfjob. -func (tc *TFJobController) updateTFJob(old, cur interface{}) { - oldTFJob := old.(*tfv1alpha2.TFJob) - log.Infof("Updating tfjob: %s", oldTFJob.Name) - tc.enqueueTFJob(cur) } -func (tc *TFJobController) updateTFJobStatus(tfjob *tfv1alpha2.TFJob) error { - _, err := tc.tfJobClientSet.KubeflowV1alpha2().TFJobs(tfjob.Namespace).Update(tfjob) - return err -} - -// resolveControllerRef returns the tfjob referenced by a ControllerRef, -// or nil if the ControllerRef could not be resolved to a matching tfjob -// of the correct Kind. -func (tc *TFJobController) resolveControllerRef(namespace string, controllerRef *metav1.OwnerReference) *tfv1alpha2.TFJob { - // We can't look up by UID, so look up by Name and then verify UID. - // Don't even try to look up by Name if it's the wrong Kind. 
- if controllerRef.Kind != controllerKind.Kind { - return nil - } - tfjob, err := tc.tfJobLister.TFJobs(namespace).Get(controllerRef.Name) +// obj could be an *batch.Job, or a DeletionFinalStateUnknown marker item. +func (c *Controller) enqueueController(obj interface{}) { + key, err := keyFunc(obj) if err != nil { - return nil - } - if tfjob.UID != controllerRef.UID { - // The controller we found with this Name is not the same one that the - // ControllerRef points to. - return nil - } - return tfjob -} - -func genOwnerReference(tfjob *tfv1alpha2.TFJob) *metav1.OwnerReference { - boolPtr := func(b bool) *bool { return &b } - controllerRef := &metav1.OwnerReference{ - APIVersion: groupVersionKind.GroupVersion().String(), - Kind: groupVersionKind.Kind, - Name: tfjob.Name, - UID: tfjob.UID, - BlockOwnerDeletion: boolPtr(true), - Controller: boolPtr(true), + utilruntime.HandleError(fmt.Errorf("Couldn't get key for object %+v: %v", obj, err)) + return } - return controllerRef + c.WorkQueue.AddRateLimited(key) } diff --git a/pkg/trainer/labels.go b/pkg/trainer/labels.go new file mode 100644 index 0000000000..1e1a698f32 --- /dev/null +++ b/pkg/trainer/labels.go @@ -0,0 +1,33 @@ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package trainer + +import ( + "fmt" + "strings" +) + +// KubernetesLabels represents a set of labels to apply to a Kubernetes resources. 
+type KubernetesLabels map[string]string + +// ToSelector converts the labels to a selector matching the labels. +func (l KubernetesLabels) ToSelector() (string, error) { + pieces := make([]string, 0, len(l)) + for k, v := range l { + pieces = append(pieces, fmt.Sprintf("%v=%v", k, v)) + } + + return strings.Join(pieces, ","), nil +} diff --git a/pkg/trainer/replicas.go b/pkg/trainer/replicas.go new file mode 100644 index 0000000000..dbb4c95a6d --- /dev/null +++ b/pkg/trainer/replicas.go @@ -0,0 +1,509 @@ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package trainer + +import ( + "encoding/json" + "errors" + "fmt" + "strings" + + log "github.com/golang/glog" + "k8s.io/api/core/v1" + k8s_errors "k8s.io/apimachinery/pkg/api/errors" + meta_v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + k8sErrors "k8s.io/apimachinery/pkg/util/errors" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/tools/record" + + tfv1alpha1 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha1" + "github.com/kubeflow/tf-operator/pkg/util/k8sutil" + // TOOO(jlewi): Rename to apiErrors + "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/helper" + "github.com/kubeflow/tf-operator/pkg/util" +) + +const ( + SuccessfulCreateReason = "SuccessfulCreate" + FailedCreateReason = "FailedCreate" +) + +// TFReplicaSet is a set of TF processes all acting as the same role (e.g. 
worker +type TFReplicaSet struct { + ClientSet kubernetes.Interface + recorder record.EventRecorder + // Job is a pointer to the TrainingJob to which this replica belongs. + Job *TrainingJob + Spec tfv1alpha1.TFReplicaSpec +} + +// TFReplicas is an interface for managing a set of replicas. +type TFReplicaSetInterface interface { + Create() error + Delete() error + GetStatus() (tfv1alpha1.TFReplicaStatus, error) +} + +// TFConfig is a struct representing the TensorFlow config. This struct is turned into an environment +// which is used by TensorFlow processes to configure themselves. +type TFConfig struct { + // Cluster represents a TensorFlow ClusterSpec. + // See: https://www.tensorflow.org/api_docs/python/tf/train/ClusterSpechttps://www.tensorflow.org/api_docs/python/tf/train/ClusterSpec + Cluster ClusterSpec `json:"cluster"` + Task TaskSpec `json:"task"` + // Environment is used by tensorflow.contrib.learn.python.learn in versions <= 1.3 + // TODO(jlewi): I don't think it is used in versions TF >- 1.4. So we can eventually get rid of it. + Environment string `json:"environment"` +} + +func NewTFReplicaSet(clientSet kubernetes.Interface, recorder record.EventRecorder, tfReplicaSpec tfv1alpha1.TFReplicaSpec, job *TrainingJob) (*TFReplicaSet, error) { + if tfReplicaSpec.TFReplicaType == tfv1alpha1.MASTER && *tfReplicaSpec.Replicas != 1 { + return nil, errors.New("The MASTER must have Replicas = 1") + } + + if tfReplicaSpec.TFPort == nil { + return nil, errors.New("tfReplicaSpec.TFPort can't be nil.") + } + + if tfReplicaSpec.Template == nil && tfReplicaSpec.TFReplicaType != tfv1alpha1.PS { + return nil, fmt.Errorf("tfReplicatfv1alpha1.Template can't be nil for replica type %v.", tfReplicaSpec.TFReplicaType) + } + + // Make sure the replica type is valid. 
+ validReplicaTypes := []tfv1alpha1.TFReplicaType{tfv1alpha1.MASTER, tfv1alpha1.PS, tfv1alpha1.WORKER} + + isValidReplicaType := false + for _, t := range validReplicaTypes { + if t == tfReplicaSpec.TFReplicaType { + isValidReplicaType = true + break + } + } + + if !isValidReplicaType { + return nil, fmt.Errorf("tfReplicaSpec.TFReplicaType is %v but must be one of %v", tfReplicaSpec.TFReplicaType, validReplicaTypes) + } + + return &TFReplicaSet{ + ClientSet: clientSet, + recorder: recorder, + Job: job, + Spec: tfReplicaSpec, + }, nil +} + +// Labels returns the labels for this replica set. +func (s *TFReplicaSet) Labels() KubernetesLabels { + return KubernetesLabels(map[string]string{ + "kubeflow.org": "", + "job_type": string(s.Spec.TFReplicaType), + // runtime_id is set by Job.setup, which is called after the TFReplicaSet is created. + // this is why labels aren't a member variable. + "runtime_id": s.Job.job.Spec.RuntimeId, + "tf_job_name": s.Job.job.ObjectMeta.Name}) +} + +// LabelsByIndex returns the labels for a pod in this replica set. +func (s *TFReplicaSet) LabelsByIndex(index int32) KubernetesLabels { + labels := s.Labels() + labels["task_index"] = fmt.Sprintf("%v", index) + return labels +} + +// CreateServiceWithIndex will create a new service with specify index +func (s *TFReplicaSet) CreateServiceWithIndex(index int32) (*v1.Service, error) { + taskLabels := s.LabelsByIndex(index) + + // Create the service. + service := &v1.Service{ + ObjectMeta: meta_v1.ObjectMeta{ + Name: s.genName(index), + Labels: taskLabels, + OwnerReferences: []meta_v1.OwnerReference{ + helper.AsOwner(s.Job.job), + }, + }, + Spec: v1.ServiceSpec{ + Selector: taskLabels, + // We use headless services here, because we don't need load balancing + // since there is a single pod that is the backend for each service. 
+ ClusterIP: "None", + Ports: []v1.ServicePort{ + { + Name: "tf-port", + Port: *s.Spec.TFPort, + }, + }, + }, + } + + log.Infof("Creating service: %v", service.ObjectMeta.Name) + return s.ClientSet.CoreV1().Services(s.Job.job.ObjectMeta.Namespace).Create(service) +} + +// CreatePodWithIndex will create a new pod with specify index +func (s *TFReplicaSet) CreatePodWithIndex(index int32) (*v1.Pod, error) { + taskLabels := s.LabelsByIndex(index) + + pod := &v1.Pod{ + ObjectMeta: meta_v1.ObjectMeta{ + Name: s.genPodName(index), + Labels: taskLabels, + OwnerReferences: []meta_v1.OwnerReference{ + helper.AsOwner(s.Job.job), + }, + }, + Spec: *s.Spec.Template.Spec.DeepCopy(), + } + + pod.Spec.SchedulerName = s.Job.SchedulerName() + + // Configure the TFCONFIG environment variable. + tfConfig := TFConfig{ + Cluster: s.Job.ClusterSpec(), + Task: TaskSpec{ + Type: strings.ToLower(string(s.Spec.TFReplicaType)), + Index: int(index), + }, + // We need to set environment to cloud otherwise it will default to local which isn't what we want. + Environment: "cloud", + } + + tfConfigJson, err := json.Marshal(tfConfig) + if err != nil { + log.Errorf("Job: %v serializing tfConfig: %v return error; %v", s.Job.job.ObjectMeta.Name, util.Pformat(tfConfig), err) + return nil, err + } + + // Add TF_CONFIG environment variable. + for i, _ := range pod.Spec.Containers { + // We can't get c in the loop variable because that would be by value so our modifications + // wouldn't have any effect. 
+ c := &pod.Spec.Containers[i] + if c.Name != tfv1alpha1.DefaultTFContainer { + continue + } + if len(c.Env) == 0 { + c.Env = make([]v1.EnvVar, 0) + } + c.Env = append(c.Env, v1.EnvVar{ + Name: "TF_CONFIG", + Value: string(tfConfigJson), + }) + } + + log.Infof("Creating pod: %v", pod.ObjectMeta.Name) + return s.ClientSet.CoreV1().Pods(s.Job.job.ObjectMeta.Namespace).Create(pod) +} + +// Delete deletes the replicas +func (s *TFReplicaSet) Delete() error { + selector, err := s.Labels().ToSelector() + if err != nil { + return err + } + + failures := false + + options := meta_v1.ListOptions{ + LabelSelector: selector, + } + + log.V(1).Infof("Deleting Jobs namespace=%v selector=%v", s.Job.job.ObjectMeta.Namespace, selector) + err = s.ClientSet.CoreV1().Pods(s.Job.job.ObjectMeta.Namespace).DeleteCollection(&meta_v1.DeleteOptions{}, options) + + if err != nil { + log.Errorf("There was a problem deleting the jobs; %v", err) + failures = true + } + + // We need to delete the completed pods. + log.Infof("Deleting Pods namespace=%v selector=%v", s.Job.job.ObjectMeta.Namespace, selector) + err = s.ClientSet.CoreV1().Pods(s.Job.job.ObjectMeta.Namespace).DeleteCollection(&meta_v1.DeleteOptions{}, options) + + if err != nil { + log.Errorf("There was a problem deleting the pods; %v", err) + failures = true + } + + // Services doesn't support DeleteCollection so we delete them individually. + // TODO(jlewi): We should check if this has changed with K8s 1.8 or other releases. 
+ for index := int32(0); index < *s.Spec.Replicas; index++ { + log.V(1).Infof("Deleting Service %v:%v", s.Job.job.ObjectMeta.Namespace, s.genName((index))) + err = s.ClientSet.CoreV1().Services(s.Job.job.ObjectMeta.Namespace).Delete(s.genName(index), &meta_v1.DeleteOptions{}) + + if err != nil { + log.Errorf("Error deleting service %v; %v", s.genName(index), err) + failures = true + } + } + + // If the ConfigMap for the default parameter server exists, we delete it + log.Infof("Get ConfigMaps %v:%v", s.Job.job.ObjectMeta.Namespace, s.defaultPSConfigMapName()) + _, err = s.ClientSet.CoreV1().ConfigMaps(s.Job.job.ObjectMeta.Namespace).Get(s.defaultPSConfigMapName(), meta_v1.GetOptions{}) + if err != nil { + if !k8sutil.IsKubernetesResourceNotFoundError(err) { + log.Errorf("Error deleting ConfigMap %v; %v", s.defaultPSConfigMapName(), err) + failures = true + } + } else { + log.Infof("Delete ConfigMaps %v:%v", s.Job.job.ObjectMeta.Namespace, s.defaultPSConfigMapName()) + err = s.ClientSet.CoreV1().ConfigMaps(s.Job.job.ObjectMeta.Namespace).Delete(s.defaultPSConfigMapName(), &meta_v1.DeleteOptions{}) + if err != nil { + log.Errorf("There was a problem deleting the ConfigMaps; %v", err) + failures = true + } + } + + if failures { + return errors.New("Some of the replicas resources could not be deleted") + } + return nil +} + +// replicaStatusFromPodList returns a status from a list of pods for a job. +func replicaStatusFromPodList(l v1.PodList, name string) tfv1alpha1.ReplicaState { + var latest *v1.Pod + for _, i := range l.Items { + if latest == nil { + latest = &i + continue + } + if latest.Status.StartTime.Before(i.Status.StartTime) { + latest = &i + } + } + + if latest == nil { + return tfv1alpha1.ReplicaStateRunning + } + + var tfState v1.ContainerState + + for _, i := range latest.Status.ContainerStatuses { + if i.Name != name { + continue + } + + // We need to decide whether to use the current state or the previous termination state. 
+ tfState = i.State + + // If the container previously terminated we will look at the termination to decide whether it is a retryable + // or permanenent error. + if i.LastTerminationState.Terminated != nil { + tfState = i.LastTerminationState + } + } + + if tfState.Running != nil || tfState.Waiting != nil { + return tfv1alpha1.ReplicaStateRunning + } + + if tfState.Terminated != nil { + if tfState.Terminated.ExitCode == 0 { + return tfv1alpha1.ReplicaStateSucceeded + } + + if isRetryableTerminationState(tfState.Terminated) { + // Since its a retryable error just return RUNNING. + // We can just let Kubernetes restart the container to retry. + return tfv1alpha1.ReplicaStateRunning + } + + return tfv1alpha1.ReplicaStateFailed + } + + return tfv1alpha1.ReplicaStateUnknown +} + +func (s *TFReplicaSet) GetSingleReplicaStatus(index int32) tfv1alpha1.ReplicaState { + p, err := s.ClientSet.CoreV1().Pods(s.Job.job.ObjectMeta.Namespace).Get(s.genName(index), meta_v1.GetOptions{}) + + if err != nil { + return tfv1alpha1.ReplicaStateUnknown + } + + if v1.PodSucceeded == p.Status.Phase { + return tfv1alpha1.ReplicaStateSucceeded + } + + labels := s.LabelsByIndex(index) + selector, err := labels.ToSelector() + if err != nil { + log.Errorf("labels.ToSelector() error; %v", err) + return tfv1alpha1.ReplicaStateFailed + } + + // TODO(jlewi): Handle errors. We need to get the pod and looking at recent container exits. + l, err := s.ClientSet.CoreV1().Pods(s.Job.job.ObjectMeta.Namespace).List(meta_v1.ListOptions{ + // TODO(jlewi): Why isn't the label selector working? + LabelSelector: selector, + }) + + if err != nil { + // TODO(jlewi): Are there errors that should be treated as retryable errors? + return tfv1alpha1.ReplicaStateFailed + } + + status := replicaStatusFromPodList(*l, tfv1alpha1.DefaultTFContainer) + return status +} + +// Status returns the status of the replica set. 
+func (s *TFReplicaSet) GetStatus() (tfv1alpha1.TFReplicaStatus, error) { + status := tfv1alpha1.TFReplicaStatus{ + TFReplicaType: s.Spec.TFReplicaType, + State: tfv1alpha1.ReplicaStateUnknown, + ReplicasStates: make(map[tfv1alpha1.ReplicaState]int), + } + + increment := func(state tfv1alpha1.ReplicaState) { + v, ok := status.ReplicasStates[state] + if ok { + status.ReplicasStates[state] = v + 1 + } else { + status.ReplicasStates[state] = 1 + } + } + + for index := int32(0); index < *s.Spec.Replicas; index++ { + increment(s.GetSingleReplicaStatus(index)) + } + + // Determine the overall status for the replica set based on the status of the individual + // replicas. + // If any of the replicas failed mark the set as failed. + if _, ok := status.ReplicasStates[tfv1alpha1.ReplicaStateFailed]; ok { + status.State = tfv1alpha1.ReplicaStateFailed + return status, nil + } + + // If any replicas are RUNNING mark it as RUNNING. + if _, ok := status.ReplicasStates[tfv1alpha1.ReplicaStateRunning]; ok { + status.State = tfv1alpha1.ReplicaStateRunning + return status, nil + } + + // If all of the replicas succeeded consider it success. + if v, ok := status.ReplicasStates[tfv1alpha1.ReplicaStateSucceeded]; ok && int32(v) == *s.Spec.Replicas { + status.State = tfv1alpha1.ReplicaStateSucceeded + return status, nil + } + + return status, nil +} + +// SyncPods will try to check current pods for this TFReplicaSet and try to make it as desired. 
+func (s *TFReplicaSet) SyncPods() error { + for index := int32(0); index < *s.Spec.Replicas; index++ { + + // Label to get all pods of this TFReplicaType + index + labels := s.LabelsByIndex(index) + + labelSelector, err := labels.ToSelector() + if err != nil { + return err + } + + // Filter the unactive pods + fieldSelector := fmt.Sprintf("status.phase!=%s", string(v1.PodFailed)) + + options := meta_v1.ListOptions{ + LabelSelector: labelSelector, + FieldSelector: fieldSelector, + } + + // List to get pods + pl, err := s.ClientSet.CoreV1().Pods(s.Job.job.ObjectMeta.Namespace).List(options) + if err != nil { + return err + } + + if len(pl.Items) == 0 { + log.Infof("No pod found for job %s, creating a new one.", s.Job.name) + // Create the pod + createdPod, err := s.CreatePodWithIndex(index) + + // If the pod already exists do nothing. + if err != nil { + if k8s_errors.IsAlreadyExists(err) { + log.Infof("Pod: %v already exists.", createdPod.ObjectMeta.Name) + continue + } + s.recorder.Eventf(s.Job.job, v1.EventTypeWarning, FailedCreateReason, "Error creating: %v", err) + return k8sErrors.NewAggregate([]error{fmt.Errorf("Creating pod %v returned error.", createdPod.ObjectMeta.Name), err}) + } + + s.recorder.Eventf(s.Job.job, v1.EventTypeNormal, SuccessfulCreateReason, "Created pod: %v", createdPod.Name) + continue + } + + if err != nil { + // TODO: handing this error + continue + } + } + + return nil +} + +// SyncServices will try to check current services for this TFReplicaSet and try to make it as desired. 
+func (s *TFReplicaSet) SyncServices() error { + for index := int32(0); index < *s.Spec.Replicas; index++ { + _, err := s.ClientSet.CoreV1().Services(s.Job.job.ObjectMeta.Namespace).Get(s.genName(index), meta_v1.GetOptions{}) + if err != nil && k8s_errors.IsNotFound(err) { + log.Infof("Service: %v not found, create new one.", s.genName(index)) + // Create the service + createdService, err := s.CreateServiceWithIndex(index) + + // If the service already exists do nothing. + if err != nil { + if k8s_errors.IsAlreadyExists(err) { + log.Infof("Service: %v already exists.", s.genName(index)) + continue + } + s.recorder.Eventf(s.Job.job, v1.EventTypeWarning, FailedCreateReason, "Error creating: %v", err) + return k8sErrors.NewAggregate([]error{fmt.Errorf("Creating Service %v returned error.", createdService.ObjectMeta.Name), err}) + } + + s.recorder.Eventf(s.Job.job, v1.EventTypeNormal, SuccessfulCreateReason, "Created Service: %v", createdService.Name) + continue + } + + if err != nil { + // TODO: handing this error + continue + } + } + + return nil +} + +func (s *TFReplicaSet) genName(index int32) string { + // Truncate tfjob name to 40 characters + // The whole job name should be compliant with the DNS_LABEL spec, up to a max length of 63 characters + // Thus genName(40 chars)-replicaType(6 chars)-runtimeId(4 chars)-index(4 chars), also leaving some spaces + // See https://github.com/kubernetes/community/blob/master/contributors/design-proposals/architecture/identifiers.md + return fmt.Sprintf("%v-%v-%v-%v", fmt.Sprintf("%.40s", s.Job.job.ObjectMeta.Name), strings.ToLower(string(s.Spec.TFReplicaType)), s.Job.job.Spec.RuntimeId, index) +} + +func (s *TFReplicaSet) genPodName(index int32) string { + // Generate a new pod name with random string + return s.genName(index) + "-" + util.RandString(5) +} + +func (s *TFReplicaSet) defaultPSConfigMapName() string { + return fmt.Sprintf("cm-ps-%v", s.Job.job.Spec.RuntimeId) +} diff --git a/pkg/trainer/replicas_test.go 
b/pkg/trainer/replicas_test.go new file mode 100644 index 0000000000..d2638e2f37 --- /dev/null +++ b/pkg/trainer/replicas_test.go @@ -0,0 +1,368 @@ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package trainer + +import ( + "encoding/json" + "fmt" + "reflect" + "strings" + "testing" + "time" + + "github.com/golang/protobuf/proto" + "k8s.io/api/core/v1" + meta_v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/client-go/kubernetes/fake" + "k8s.io/client-go/tools/record" + + tfv1alpha1 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha1" + tfJobFake "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned/fake" + "github.com/kubeflow/tf-operator/pkg/util" +) + +var ( + groupVersionKind = schema.GroupVersionKind{ + Group: tfv1alpha1.GroupName, + Version: tfv1alpha1.GroupVersion, + Kind: tfv1alpha1.TFJobResourceKind, + } +) + +func TestTFReplicaSet(t *testing.T) { + clientSet := fake.NewSimpleClientset() + + testSchedulerName := "test-scheduler" + + jobSpec := &tfv1alpha1.TFJob{ + ObjectMeta: meta_v1.ObjectMeta{ + Name: "some-job", + UID: "some-uid", + }, + Spec: tfv1alpha1.TFJobSpec{ + RuntimeId: "some-runtime", + ReplicaSpecs: []*tfv1alpha1.TFReplicaSpec{ + { + Replicas: proto.Int32(2), + TFPort: proto.Int32(10), + Template: &v1.PodTemplateSpec{ + Spec: v1.PodSpec{ + Containers: []v1.Container{ + { + Name: "tensorflow", + }, + }, + }, + }, + 
TFReplicaType: tfv1alpha1.PS, + }, + }, + SchedulerName: testSchedulerName, + }, + } + + recorder := record.NewFakeRecorder(100) + job, err := initJob(clientSet, &tfJobFake.Clientset{}, recorder, jobSpec) + + if err != nil { + t.Fatalf("initJob failed: %v", err) + } + + replica, err := NewTFReplicaSet(clientSet, recorder, *jobSpec.Spec.ReplicaSpecs[0], job) + + if err != nil { + t.Fatalf("NewTFReplicaSet failed: %v", err) + } + + if err := replica.SyncPods(); err != nil { + t.Fatalf("replica.SyncPods() error; %v", err) + } + + if err := replica.SyncServices(); err != nil { + t.Fatalf("replica.SyncServices() error; %v", err) + } + + trueVal := true + expectedOwnerReference := meta_v1.OwnerReference{ + APIVersion: groupVersionKind.GroupVersion().String(), + Kind: groupVersionKind.Kind, + Name: "some-job", + UID: "some-uid", + Controller: &trueVal, + BlockOwnerDeletion: &trueVal, + } + + for index := 0; index < 2; index++ { + // Expected labels + expectedLabels := map[string]string{ + "kubeflow.org": "", + "task_index": fmt.Sprintf("%v", index), + "job_type": "PS", + "runtime_id": "some-runtime", + "tf_job_name": "some-job", + } + + // Check that a service was created. 
+ sList, err := clientSet.CoreV1().Services(replica.Job.job.ObjectMeta.Namespace).List(meta_v1.ListOptions{}) + if err != nil { + t.Fatalf("List services error; %v", err) + } + + if len(sList.Items) != 2 { + t.Fatalf("Expected 2 services got %v", len(sList.Items)) + } + + s := sList.Items[index] + + if !reflect.DeepEqual(expectedLabels, s.ObjectMeta.Labels) { + t.Fatalf("Service Labels; Got %v Want: %v", s.ObjectMeta.Labels, expectedLabels) + } + + name := fmt.Sprintf("some-job-ps-some-runtime-%v", index) + if s.ObjectMeta.Name != name { + t.Fatalf("Job.ObjectMeta.Name = %v; want %v", s.ObjectMeta.Name, name) + } + + if len(s.ObjectMeta.OwnerReferences) != 1 { + t.Fatalf("Expected 1 owner reference got %v", len(s.ObjectMeta.OwnerReferences)) + } + + if !reflect.DeepEqual(s.ObjectMeta.OwnerReferences[0], expectedOwnerReference) { + t.Fatalf("Service.Metadata.OwnerReferences; Got %v; want %v", util.Pformat(s.ObjectMeta.OwnerReferences[0]), util.Pformat(expectedOwnerReference)) + } + + // Check that a pod was created. 
+ l, err := clientSet.CoreV1().Pods(replica.Job.job.ObjectMeta.Namespace).List(meta_v1.ListOptions{}) + if err != nil { + t.Fatalf("List pods error; %v", err) + } + + if len(l.Items) != 2 { + t.Fatalf("Expected 2 pods got %v", len(l.Items)) + } + + p := l.Items[index] + + if !reflect.DeepEqual(expectedLabels, p.ObjectMeta.Labels) { + t.Fatalf("Pod Labels; Got %v Want: %v", p.ObjectMeta.Labels, expectedLabels) + } + + if len(p.Spec.Containers) != 1 { + t.Fatalf("Expected 1 container got %v", len(p.Spec.Containers)) + } + + if len(p.ObjectMeta.OwnerReferences) != 1 { + t.Fatalf("Expected 1 owner reference got %v", len(p.ObjectMeta.OwnerReferences)) + } + + if !reflect.DeepEqual(p.ObjectMeta.OwnerReferences[0], expectedOwnerReference) { + t.Fatalf("Pod.Metadata.OwnerReferences; Got %v; want %v", util.Pformat(p.ObjectMeta.OwnerReferences[0]), util.Pformat(expectedOwnerReference)) + } + + c := p.Spec.Containers[0] + if len(c.Env) != 1 { + t.Fatalf("Expected 1 environment variable got %v", len(c.Env)) + } + + if strings.Compare(p.Spec.SchedulerName, testSchedulerName) != 0 { + t.Fatalf("p.Spec.Template.Spec.SchedulerName; Got %v; want %v", p.Spec.SchedulerName, testSchedulerName) + } + + actualTFConfig := &TFConfig{} + if err := json.Unmarshal([]byte(c.Env[0].Value), actualTFConfig); err != nil { + t.Fatalf("Could not unmarshal TFConfig %v", err) + } + + expectedTFConfig := &TFConfig{ + Cluster: ClusterSpec{}, + Task: TaskSpec{ + Type: "ps", + Index: index, + }, + Environment: "cloud", + } + + if !reflect.DeepEqual(expectedTFConfig, actualTFConfig) { + t.Fatalf("Got %v, Want %v", actualTFConfig, expectedTFConfig) + } + } + // Delete the job. + // N.B it doesn't look like the Fake clientset is sophisticated enough to delete jobs in response to a + // DeleteCollection request (deleting individual jobs does appear to work with the Fake). So if we were to list + // the jobs after calling Delete we'd still see the job.
So we will rely on E2E tests to verify Delete works + // correctly. + if err := replica.Delete(); err != nil { + t.Fatalf("replica.Delete() error; %v", err) + } +} + +func TestTFReplicaSetStatusFromPodList(t *testing.T) { + type TestCase struct { + PodList v1.PodList + Name string + Expected tfv1alpha1.ReplicaState + } + + cases := []TestCase{ + { + PodList: v1.PodList{ + Items: []v1.Pod{ + { + Status: v1.PodStatus{ + ContainerStatuses: []v1.ContainerStatus{ + { + Name: "master", + State: v1.ContainerState{ + Running: &v1.ContainerStateRunning{}, + }, + }, + }, + }, + }, + }, + }, + Name: "master", + Expected: tfv1alpha1.ReplicaStateRunning, + }, + { + PodList: v1.PodList{ + Items: []v1.Pod{ + { + Status: v1.PodStatus{ + ContainerStatuses: []v1.ContainerStatus{ + { + Name: "master", + State: v1.ContainerState{ + Terminated: &v1.ContainerStateTerminated{ + ExitCode: 0, + }, + }, + }, + }, + }, + }, + }, + }, + Name: "master", + Expected: tfv1alpha1.ReplicaStateSucceeded, + }, + { + // Multiple containers; make sure we match by name. + PodList: v1.PodList{ + Items: []v1.Pod{ + { + Status: v1.PodStatus{ + ContainerStatuses: []v1.ContainerStatus{ + { + Name: "other", + State: v1.ContainerState{ + Running: &v1.ContainerStateRunning{}, + }, + }, + { + Name: "master", + State: v1.ContainerState{ + Terminated: &v1.ContainerStateTerminated{ + ExitCode: 0, + }, + }, + }, + }, + }, + }, + }, + }, + Name: "master", + Expected: tfv1alpha1.ReplicaStateSucceeded, + }, + { + // Container failed with permanent error and then got restarted. 
+ PodList: v1.PodList{ + Items: []v1.Pod{ + { + Status: v1.PodStatus{ + ContainerStatuses: []v1.ContainerStatus{ + { + Name: "master", + State: v1.ContainerState{ + Running: &v1.ContainerStateRunning{}, + }, + LastTerminationState: v1.ContainerState{ + Terminated: &v1.ContainerStateTerminated{ + ExitCode: 100, + Message: "some reason", + }, + }, + }, + }, + }, + }, + }, + }, + Name: "master", + Expected: tfv1alpha1.ReplicaStateFailed, + }, + { + // Multiple Pods; check we get the most recent. + PodList: v1.PodList{ + Items: []v1.Pod{ + { + Status: v1.PodStatus{ + ContainerStatuses: []v1.ContainerStatus{ + { + Name: "master", + State: v1.ContainerState{ + Running: &v1.ContainerStateRunning{}, + }, + }, + }, + StartTime: &meta_v1.Time{ + Time: time.Date(2017, 0, 0, 0, 0, 0, 0, time.UTC), + }, + }, + }, + { + Status: v1.PodStatus{ + ContainerStatuses: []v1.ContainerStatus{ + { + Name: "master", + State: v1.ContainerState{ + Terminated: &v1.ContainerStateTerminated{ + ExitCode: 100, + Message: "some reason", + }, + }, + }, + }, + StartTime: &meta_v1.Time{ + Time: time.Date(2018, 0, 0, 0, 0, 0, 0, time.UTC), + }, + }, + }, + }, + }, + Name: "master", + Expected: tfv1alpha1.ReplicaStateFailed, + }, + } + + for _, c := range cases { + status := replicaStatusFromPodList(c.PodList, c.Name) + if status != c.Expected { + t.Errorf("replicaStatusFromPodList(%+v, %v)=%v ; want %v", c.PodList, c.Name, status, c.Expected) + } + } +} diff --git a/pkg/trainer/training.go b/pkg/trainer/training.go new file mode 100644 index 0000000000..39a365e423 --- /dev/null +++ b/pkg/trainer/training.go @@ -0,0 +1,468 @@ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package trainer is to manage TensorFlow training jobs. +package trainer + +import ( + "fmt" + "reflect" + "strings" + + log "github.com/sirupsen/logrus" + "k8s.io/api/core/v1" + "k8s.io/api/policy/v1beta1" + k8s_errors "k8s.io/apimachinery/pkg/api/errors" + meta_v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/intstr" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/tools/record" + + "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/helper" + tfv1alpha1 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha1" + "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/validation" + tfjobclient "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned" + "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned/scheme" + "github.com/kubeflow/tf-operator/pkg/util" +) + +// TODO(jlewi): We should switch a New pattern and make trainingJob private so we can +// ensure correctness on creation. +type TrainingJob struct { + job *tfv1alpha1.TFJob + + KubeCli kubernetes.Interface + + recorder record.EventRecorder + + Replicas []*TFReplicaSet + + tfJobClient tfjobclient.Interface + + // in memory state of the job. + // status is the source of truth after job struct is materialized. Changes to the status to be persisted + // should be made here. + status tfv1alpha1.TFJobStatus + + memberCounter int + + pdb *v1beta1.PodDisruptionBudget +} + +// ClusterSpec represents a cluster TensorFlow specification. 
+// https://www.tensorflow.org/deploy/distributed#create_a_tftrainclusterspec_to_describe_the_cluster +// It is a map from job names to network addresses. +type ClusterSpec map[string][]string + +type TaskSpec struct { + Type string `json:"type"` + Index int `json:"index"` +} + +func initJob(kubeCli kubernetes.Interface, tfJobClient tfjobclient.Interface, recorder record.EventRecorder, job *tfv1alpha1.TFJob) (*TrainingJob, error) { + j := &TrainingJob{ + KubeCli: kubeCli, + tfJobClient: tfJobClient, + recorder: recorder, + Replicas: make([]*TFReplicaSet, 0), + job: job, + status: *job.Status.DeepCopy(), + } + + return j, nil +} + +func NewJob(kubeCli kubernetes.Interface, tfJobClient tfjobclient.Interface, recorder record.EventRecorder, job *tfv1alpha1.TFJob, config *tfv1alpha1.ControllerConfig) (*TrainingJob, error) { + j, err := initJob(kubeCli, tfJobClient, recorder, job) + if err != nil { + return nil, err + } + + return j, nil +} + +func (j *TrainingJob) UID() types.UID { + return j.job.ObjectMeta.UID +} + +func (j *TrainingJob) ClusterSpec() ClusterSpec { + clusterSpec := make(ClusterSpec) + + for _, p := range j.Replicas { + replicaNames := make([]string, 0, *p.Spec.Replicas) + + for i := int32(0); i < *p.Spec.Replicas; i++ { + replicaNames = append(replicaNames, fmt.Sprintf("%v:%v", p.genName(i), *p.Spec.TFPort)) + } + + clusterSpec[strings.ToLower(string(p.Spec.TFReplicaType))] = replicaNames + } + + return clusterSpec +} + +// deleteResources deletes the replicas if they were created +func (j *TrainingJob) deleteResources() error { + for _, r := range j.Replicas { + if err := r.Delete(); err != nil { + return err + } + } + + return nil +} + +func (j *TrainingJob) GetStatus() (tfv1alpha1.State, []*tfv1alpha1.TFReplicaStatus, error) { + chief := j.job.Spec.TerminationPolicy.Chief + chiefState := tfv1alpha1.ReplicaStateUnknown + + state := tfv1alpha1.StateUnknown + replicaStatuses := make([]*tfv1alpha1.TFReplicaStatus, 0) + + // The state for each replica.
+ // TODO(jlewi): We will need to modify this code if we want to allow multiples of a given type of replica. + replicaSetStates := make(map[tfv1alpha1.TFReplicaType]tfv1alpha1.ReplicaState) + + for _, r := range j.Replicas { + rStatus, err := r.GetStatus() + if err != nil { + log.Errorf("GetStatus() for %v returned error; %v", r.Spec.TFReplicaType, err) + } + + replicaSetStates[r.Spec.TFReplicaType] = rStatus.State + + replicaStatuses = append(replicaStatuses, &rStatus) + + if string(r.Spec.TFReplicaType) == chief.ReplicaName { + chiefState = r.GetSingleReplicaStatus(int32(chief.ReplicaIndex)) + } + } + + if chiefState == tfv1alpha1.ReplicaStateRunning { + state = tfv1alpha1.StateRunning + } else if chiefState == tfv1alpha1.ReplicaStateFailed { + state = tfv1alpha1.StateFailed + } else if chiefState == tfv1alpha1.ReplicaStateSucceeded { + state = tfv1alpha1.StateSucceeded + } + + return state, replicaStatuses, nil +} + +// isRetryableTerminationState returns true if a container terminated in a state +// that we consider retryable. +func isRetryableTerminationState(s *v1.ContainerStateTerminated) bool { + // TODO(jlewi): Need to match logic in + // https://cs.corp.google.com/piper///depot/google3/cloud/ml/beta/job/training_job_state_util.cc?l=88 + if s.Reason == "OOMKilled" { + // If the user's process causes an OOM and Docker kills the container, + // the termination reason of ContainerState will be specified to + // 'OOMKilled'. In this case, we can't assume this to be a retryable error. + // + // This check should happen before checking the termination log, since + // if the container terminated with an OOM, the termination log may not + // be written. + return false + } + + // TODO(jlewi): Should we use the exit code reported in the termination + // log message and not the ExitCode reported by the container. + + if s.ExitCode >= 0 && s.ExitCode <= 127 { + // For the exit_code in [0, 127]: + // 0 means success, + // 1 - 127 corresponds to permanent user errors. 
+ // We don't want to retry for both cases. + // More info about exit status can be found in: + // https://www.gnu.org/software/bash/manual/html_node/Exit-Status.html + return false + } + + // For the remaining cases that exit_code from workers that doesn't + // fall into [0, 127]. They can be: + // 137 corresponds to SIGKILL, + // 143 corresponds to SIGTERM, + // other values that have undefined behavior. + // We treat them as internal errors for now and all the internal errors + // will be retried. + return true +} + +func (j *TrainingJob) masterName() string { + return fmt.Sprintf("master-%v-0", j.job.Spec.RuntimeId) +} + +// setup the training job. +func (j *TrainingJob) setup(config *tfv1alpha1.ControllerConfig) { + err := func() error { + // If the job has already started we shouldn't set it up again. + if j.status.Phase != tfv1alpha1.TFJobPhaseNone { + log.Warningf("Job %v has already been setup.", j.name()) + return nil + } + + // Set defaults. + scheme.Scheme.Default(j.job) + + err := validation.ValidateTFJobSpec(&j.job.Spec) + if err != nil { + return fmt.Errorf("invalid job spec: %v", err) + } + + if err := helper.ConfigureAcceleratorsForTFJobSpec(&j.job.Spec, config.Accelerators); err != nil { + return fmt.Errorf("ConfigureAccelerators(...) error; %v", err) + } + + if j.job.Spec.RuntimeId == "" { + j.job.Spec.RuntimeId = util.RandString(4) + } + return nil + }() + + if err != nil { + j.status.Reason = err.Error() + j.status.Phase = tfv1alpha1.TFJobPhaseFailed + j.status.State = tfv1alpha1.StateFailed + } else { + j.status.Phase = tfv1alpha1.TFJobPhaseCreating + j.status.State = tfv1alpha1.StateRunning + } +} + +// setup Replicas. This creates in memory data structures corresponding to the replicas.
+func (j *TrainingJob) setupReplicas() error { + if len(j.Replicas) != len(j.job.Spec.ReplicaSpecs) { + j.Replicas = make([]*TFReplicaSet, 0, len(j.job.Spec.ReplicaSpecs)) + for _, t := range j.job.Spec.ReplicaSpecs { + r, err := NewTFReplicaSet(j.KubeCli, j.recorder, *t, j) + if err != nil { + return err + } + j.Replicas = append(j.Replicas, r) + } + } + + return nil +} + +func (j *TrainingJob) Delete() { + // TODO(jlewi): Delete is what should cause us to delete the Pods. + // we shouldn't delete the pods when the jobs finish because leaving the pods + // allows us to get the logs from the pods after the job finishes. + // + log.Infof("TFJob %v deleted by the user", j.fullname()) + // TODO(jlewi): This logic is probably insufficient. + if j.job.Status.Phase != tfv1alpha1.TFJobPhaseCleanUp { + j.status.Phase = tfv1alpha1.TFJobPhaseCleanUp + } + + // TODO(jlewi): Does it make sense to explicitly delete the resources? Should + // we just rely on K8s garbage collection to delete the resources before + // deleting TFJob? + if cErr := j.deleteResources(); cErr != nil { + log.Errorf("trainingJob.deleteResources() error; %v", cErr) + } + + if j.pdb != nil { + // if the job has PDB for gang scheduling, delete it + err := j.KubeCli.PolicyV1beta1().PodDisruptionBudgets(j.job.ObjectMeta.Namespace).Delete(j.pdb.ObjectMeta.Name, &meta_v1.DeleteOptions{}) + if err != nil { + log.Errorf("Error deleting PDB %v; %v", j.pdb.ObjectMeta.Name, err) + } + } +} + +// updateCRDStatus updates the job status based on TraingingJob.status. +func (j *TrainingJob) updateCRDStatus() error { + // If the status hasn't changed then there's no reason to update the CRD. 
+ if reflect.DeepEqual(j.job.Status, j.status) { + return nil + } + + newJob := j.job.DeepCopy() + newJob.Status = j.status + newJob, err := j.tfJobClient.KubeflowV1alpha1().TFJobs(j.job.ObjectMeta.Namespace).Update(newJob) + if err != nil { + return err + } + + j.job = newJob + + return nil +} + +// Reconcile tries to get the job into the desired state. +func (j *TrainingJob) Reconcile(config *tfv1alpha1.ControllerConfig, enableGangScheduling bool) error { + if j.job.Status.Phase == tfv1alpha1.TFJobPhaseNone { + // The job hasn't been setup. + j.setup(config) + + if err := j.updateCRDStatus(); err != nil { + log.Warningf("failed to update CRD status: %v", err) + return err + } + } + + // setupReplicas initializes data structures inside TrainingJob representing the replicas. + // These are go-lang structures which aren't preserved in the APIServer. So we always need to call setupReplicas + // unlike setup which only needs to be called once during the lifecycle of the job. + if err := j.setupReplicas(); err != nil { + log.Errorf("failed to create replicas: %v", err) + j.status.Reason = fmt.Sprintf("Could not create in memory datastructures; %v", err) + if uErr := j.updateCRDStatus(); uErr != nil { + log.Warningf("Job %v; failed to update status error: %v", j.job.ObjectMeta.Name, uErr) + } + return err + } + + // sync PDB for gang scheduling + // TODO(mitake): replace PDB with a newer mechanism if it is replaced + if enableGangScheduling { + err := j.syncPdb() + if err != nil { + log.Errorf("SyncPdb error: %v", err) + } + } + + // sync pods + for _, rc := range j.Replicas { + err := rc.SyncPods() + if err != nil { + log.Errorf("SyncPods error: %v", err) + } + } + + // sync services + for _, rc := range j.Replicas { + err := rc.SyncServices() + if err != nil { + log.Errorf("SyncServices error: %v", err) + } + } + + if err := j.updateCRDStatus(); err != nil { + log.Warningf("Job %v; failed to update status error: %v", j.job.ObjectMeta.Name, err) + return err + } + + // Call
GetStatus in each reconcile loop + state, replicaStatuses, err := j.GetStatus() + + j.status.ReplicaStatuses = replicaStatuses + if err != nil { + log.Errorf("GetStatus() for job %v returned error: %v", j.job.ObjectMeta.Name, err) + return err + } + + // TODO(jlewi): We should update the Phase if we detect the job is done. + if state == tfv1alpha1.StateFailed { + log.Errorf("Master failed Job: %v.", j.job.ObjectMeta.Name) + j.status.Phase = tfv1alpha1.TFJobPhaseDone + j.status.State = tfv1alpha1.StateFailed + } else if state == tfv1alpha1.StateSucceeded { + log.Infof("Master succeeded Job: %v.", j.job.ObjectMeta.Name) + j.status.Phase = tfv1alpha1.TFJobPhaseDone + j.status.State = tfv1alpha1.StateSucceeded + } else if state == tfv1alpha1.StateRunning { + log.Infof("Master running Job: %v.", j.job.ObjectMeta.Name) + j.status.Phase = tfv1alpha1.TFJobPhaseRunning + j.status.State = tfv1alpha1.StateRunning + } else { + log.Infof("Job %v status=%v", j.job.ObjectMeta.Name, util.Pformat(j.status)) + } + + // If the phase changed we should update the CRD. + if err := j.updateCRDStatus(); err != nil { + log.Warningf("Job %v, failed to update CRD status error: %v", j.job.ObjectMeta.Name, err) + return err + } + + if j.job.Status.Phase == tfv1alpha1.TFJobPhaseCleanUp { + if cErr := j.deleteResources(); cErr != nil { + log.Errorf("Job %v trainingJob.Delete() error; %v", j.job.ObjectMeta.Name, cErr) + } + // j.status.SetPhase(spec.TFJobPhaseDone) + // Return from run because we want to stop reconciling the object. + return nil + } + + // updateCRDStatus will update the status of the CRD with c.Status if c.Status + // doesn't match c.Cluster.status. So you can change c.Status in order to propagate + // changes to the CRD status. 
+ if err := j.updateCRDStatus(); err != nil { + log.Warningf("Job %v; failed to update CRD status error: %v", j.job.ObjectMeta.Name, err) + return err + } + + return nil +} + +func (j *TrainingJob) name() string { + return j.job.ObjectMeta.GetName() +} + +// fullname returns the namespace and name for the job. +func (j *TrainingJob) fullname() string { + return j.job.ObjectMeta.GetNamespace() + ":" + j.job.ObjectMeta.GetName() +} + +func (j *TrainingJob) SchedulerName() string { + return j.job.Spec.SchedulerName +} + +// SyncPdb will create a PDB for gang scheduling by kube-arbitrator. +func (j *TrainingJob) syncPdb() error { + nrReplicas := int32(0) + for _, r := range j.Replicas { + nrReplicas += *r.Spec.Replicas + } + + if nrReplicas == 1 { + // gang scheduling isn't required by a non distributed training process + return nil + } + + minAvailable := intstr.FromInt(int(nrReplicas)) + pdb := &v1beta1.PodDisruptionBudget{ + ObjectMeta: meta_v1.ObjectMeta{ + GenerateName: "tf-job-pdb-", + }, + Spec: v1beta1.PodDisruptionBudgetSpec{ + MinAvailable: &minAvailable, + Selector: &meta_v1.LabelSelector{ + MatchLabels: map[string]string{ + "runtime_id": j.job.Spec.RuntimeId, + "tf_job_name": j.job.ObjectMeta.Name, + }, + }, + }, + } + + createdPdb, err := j.KubeCli.PolicyV1beta1().PodDisruptionBudgets(j.job.ObjectMeta.Namespace).Create(pdb) + if err != nil { + if k8s_errors.IsAlreadyExists(err) { + log.Infof("PDB: %v already exists.", j.job.ObjectMeta.Name) + return nil + } + + j.recorder.Eventf(j.job, v1.EventTypeWarning, FailedCreateReason, "Error creating: %v", err) + return err + } + + j.pdb = createdPdb + + j.recorder.Eventf(j.job, v1.EventTypeNormal, SuccessfulCreateReason, "Created PDB: %v", createdPdb.Name) + return nil +} diff --git a/pkg/trainer/training_test.go b/pkg/trainer/training_test.go new file mode 100644 index 0000000000..5c9718c613 --- /dev/null +++ b/pkg/trainer/training_test.go @@ -0,0 +1,489 @@ +// Copyright 2018 The Kubeflow Authors +// +// Licensed 
under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package trainer + +import ( + "reflect" + "testing" + + "github.com/gogo/protobuf/proto" + tfv1alpha1 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha1" + tfJobFake "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned/fake" + "k8s.io/api/core/v1" + "k8s.io/api/policy/v1beta1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/intstr" + "k8s.io/client-go/kubernetes/fake" + "k8s.io/client-go/tools/record" +) + +func TestIsRetryableTerminationState(t *testing.T) { + type TestCase struct { + State v1.ContainerStateTerminated + Expected bool + } + + cases := []TestCase{ + { + // Since reason is empty we don't trust the exit code. 
+ State: v1.ContainerStateTerminated{ + ExitCode: 0, + }, + Expected: false, + }, + { + State: v1.ContainerStateTerminated{ + ExitCode: 0, + Message: "some reason", + }, + Expected: false, + }, + { + State: v1.ContainerStateTerminated{ + ExitCode: 1, + Message: "some reason", + }, + Expected: false, + }, + { + State: v1.ContainerStateTerminated{ + ExitCode: 1, + }, + Expected: false, + }, + { + State: v1.ContainerStateTerminated{ + ExitCode: 244, + Message: "some reason", + }, + Expected: true, + }, + { + State: v1.ContainerStateTerminated{ + ExitCode: 244, + Reason: "OOMKilled", + }, + Expected: false, + }, + } + + for _, c := range cases { + actual := isRetryableTerminationState(&c.State) + if actual != c.Expected { + t.Errorf("isRetryableTerminationState(%+v)=%v want %v", c.State, actual, c.Expected) + } + } +} + +func TestClusterSpec(t *testing.T) { + type TestCase struct { + Spec *tfv1alpha1.TFJob + Expected map[string][]string + } + + cases := []TestCase{ + { + Spec: &tfv1alpha1.TFJob{ + ObjectMeta: metav1.ObjectMeta{ + Name: "myjob", + }, + Spec: tfv1alpha1.TFJobSpec{ + RuntimeId: "runtime", + ReplicaSpecs: []*tfv1alpha1.TFReplicaSpec{ + { + Replicas: proto.Int32(2), + TFPort: proto.Int32(22), + Template: &v1.PodTemplateSpec{ + Spec: v1.PodSpec{ + Containers: []v1.Container{ + { + Name: "tensorflow", + }, + }, + }, + }, + TFReplicaType: tfv1alpha1.PS, + }, + { + Replicas: proto.Int32(1), + TFPort: proto.Int32(42), + Template: &v1.PodTemplateSpec{ + Spec: v1.PodSpec{ + Containers: []v1.Container{ + { + Name: "tensorflow", + }, + }, + }, + }, + TFReplicaType: tfv1alpha1.MASTER, + }, + { + Replicas: proto.Int32(3), + TFPort: proto.Int32(40), + Template: &v1.PodTemplateSpec{ + Spec: v1.PodSpec{ + Containers: []v1.Container{ + { + Name: "tensorflow", + }, + }, + }, + }, + TFReplicaType: tfv1alpha1.WORKER, + }, + }, + }, + }, + + Expected: map[string][]string{ + "ps": []string{"myjob-ps-runtime-0:22", "myjob-ps-runtime-1:22"}, + "master": 
[]string{"myjob-master-runtime-0:42"}, + "worker": []string{"myjob-worker-runtime-0:40", "myjob-worker-runtime-1:40", "myjob-worker-runtime-2:40"}, + }, + }, + } + + for _, c := range cases { + + clientSet := fake.NewSimpleClientset() + + recorder := record.NewFakeRecorder(100) + job, err := initJob(clientSet, &tfJobFake.Clientset{}, recorder, c.Spec) + + if err != nil { + t.Fatalf("initJob failed: %v", err) + } + + job.setup(&tfv1alpha1.ControllerConfig{}) + job.setupReplicas() + actual := job.ClusterSpec() + + for k, v := range c.Expected { + actualV, ok := actual[k] + if !ok { + t.Errorf("Actual cluster spec is missing key: %v", k) + continue + } + if !reflect.DeepEqual(actualV, v) { + t.Errorf("Key %v got %v want %v", k, actualV, v) + } + } + } +} + +func TestJobSetup(t *testing.T) { + // Verify the setup will fill in the RuntimeId. + clientSet := fake.NewSimpleClientset() + + type testCase struct { + jobSpec *tfv1alpha1.TFJob + expectMounts int + expectPhase tfv1alpha1.TFJobPhase + expectReason string + expectState tfv1alpha1.State + } + + testCases := []testCase{ + { + jobSpec: &tfv1alpha1.TFJob{ + Spec: tfv1alpha1.TFJobSpec{ + ReplicaSpecs: []*tfv1alpha1.TFReplicaSpec{ + { + Replicas: proto.Int32(1), + TFPort: proto.Int32(10), + Template: &v1.PodTemplateSpec{ + Spec: v1.PodSpec{ + Containers: []v1.Container{ + { + Name: "tensorflow", + }, + }, + }, + }, + TFReplicaType: tfv1alpha1.MASTER, + }, + }, + }, + }, + expectMounts: 0, + expectPhase: tfv1alpha1.TFJobPhaseCreating, + expectState: tfv1alpha1.StateRunning, + }, + { + jobSpec: &tfv1alpha1.TFJob{ + Spec: tfv1alpha1.TFJobSpec{ + ReplicaSpecs: []*tfv1alpha1.TFReplicaSpec{ + { + Replicas: proto.Int32(2), + TFPort: proto.Int32(10), + Template: &v1.PodTemplateSpec{ + Spec: v1.PodSpec{ + Containers: []v1.Container{ + { + Name: "tensorflow", + Resources: v1.ResourceRequirements{ + Requests: map[v1.ResourceName]resource.Quantity{ + "nvidia-gpu": resource.MustParse("1"), + }, + }, + }, + }, + }, + }, + 
TFReplicaType: tfv1alpha1.WORKER, + }, + }, + TerminationPolicy: &tfv1alpha1.TerminationPolicySpec{ + Chief: &tfv1alpha1.ChiefSpec{ + ReplicaName: string(tfv1alpha1.WORKER), + ReplicaIndex: 0, + }, + }, + }, + }, + expectMounts: 1, + expectPhase: tfv1alpha1.TFJobPhaseCreating, + expectState: tfv1alpha1.StateRunning, + }, + { + // The job should fail setup because the spec is invalid. + jobSpec: &tfv1alpha1.TFJob{ + Spec: tfv1alpha1.TFJobSpec{ + ReplicaSpecs: []*tfv1alpha1.TFReplicaSpec{ + { + Replicas: proto.Int32(2), + TFPort: proto.Int32(10), + Template: &v1.PodTemplateSpec{ + Spec: v1.PodSpec{ + Containers: []v1.Container{ + { + Name: "tensorflow", + Resources: v1.ResourceRequirements{ + Requests: map[v1.ResourceName]resource.Quantity{ + "nvidia-gpu": resource.MustParse("1"), + }, + }, + }, + }, + }, + }, + TFReplicaType: tfv1alpha1.WORKER, + }, + }, + }, + }, + expectMounts: 0, + expectPhase: tfv1alpha1.TFJobPhaseFailed, + expectState: tfv1alpha1.StateFailed, + expectReason: "invalid job spec: Missing ReplicaSpec for chief: MASTER", + }, + } + + config := &tfv1alpha1.ControllerConfig{ + Accelerators: map[string]tfv1alpha1.AcceleratorConfig{ + "nvidia-gpu": tfv1alpha1.AcceleratorConfig{ + Volumes: []tfv1alpha1.AcceleratorVolume{ + { + Name: "cuda-lib", + HostPath: "/home/cuda", + MountPath: "/usr/local/cuda", + }, + }, + }, + }, + } + + for _, c := range testCases { + + recorder := record.NewFakeRecorder(100) + job, err := initJob(clientSet, &tfJobFake.Clientset{}, recorder, c.jobSpec) + + job.setup(config) + + if err != nil { + t.Errorf("j.setup error: %v", err) + } + + if job.status.Phase != c.expectPhase { + t.Errorf("job.job.Status.Phase Want: %v Got:%v ", c.expectPhase, job.status.Phase) + } + + if job.status.Reason != c.expectReason { + t.Errorf("job.job.Status.Reason Want: %v Got:%v ", c.expectReason, job.status.Reason) + } + + if job.status.State != c.expectState { + t.Errorf("job.job.Status.State Want: %v Got:%v ", c.expectState, job.status.State) + } + 
+ // Make sure the runtime id is set if the job didn't fail. + if c.expectState != tfv1alpha1.StateFailed && job.job.Spec.RuntimeId == "" { + t.Errorf("RuntimeId should not be empty after calling setup.") + } + + if len(job.job.Spec.ReplicaSpecs[0].Template.Spec.Volumes) != c.expectMounts { + t.Errorf("Expect %v Volumes got %v", c.expectMounts, len(job.job.Spec.ReplicaSpecs[0].Template.Spec.Volumes)) + } + + if len(job.job.Spec.ReplicaSpecs[0].Template.Spec.Containers[0].VolumeMounts) != c.expectMounts { + t.Errorf("Expect %v VolumeMounts got %v", c.expectMounts, len(job.job.Spec.ReplicaSpecs[0].Template.Spec.Containers[0].VolumeMounts)) + } + } +} + +func TestPDBForGangScheduling(t *testing.T) { + clientSet := fake.NewSimpleClientset() + + type testCase struct { + jobSpec *tfv1alpha1.TFJob + expectPdb *v1beta1.PodDisruptionBudget + } + + minAvailable3 := intstr.FromInt(3) + + testCases := []testCase{ + { + jobSpec: &tfv1alpha1.TFJob{ + ObjectMeta: metav1.ObjectMeta{ + Name: "some-meta-name", + }, + Spec: tfv1alpha1.TFJobSpec{ + RuntimeId: "some-runtime-id", + ReplicaSpecs: []*tfv1alpha1.TFReplicaSpec{ + { + Replicas: proto.Int32(1), + TFPort: proto.Int32(10), + Template: &v1.PodTemplateSpec{ + Spec: v1.PodSpec{ + Containers: []v1.Container{ + { + Name: "tensorflow", + }, + }, + }, + }, + TFReplicaType: tfv1alpha1.WORKER, + }, + }, + }, + }, + expectPdb: nil, + }, + + { + jobSpec: &tfv1alpha1.TFJob{ + ObjectMeta: metav1.ObjectMeta{ + Name: "some-meta-name", + }, + Spec: tfv1alpha1.TFJobSpec{ + RuntimeId: "some-runtime-id", + ReplicaSpecs: []*tfv1alpha1.TFReplicaSpec{ + { + Replicas: proto.Int32(1), + TFPort: proto.Int32(10), + Template: &v1.PodTemplateSpec{ + Spec: v1.PodSpec{ + Containers: []v1.Container{ + { + Name: "tensorflow", + }, + }, + }, + }, + TFReplicaType: tfv1alpha1.MASTER, + }, + { + Replicas: proto.Int32(1), + TFPort: proto.Int32(10), + Template: &v1.PodTemplateSpec{ + Spec: v1.PodSpec{ + Containers: []v1.Container{ + { + Name: "tensorflow", + }, + 
}, + }, + }, + TFReplicaType: tfv1alpha1.PS, + }, + { + Replicas: proto.Int32(1), + TFPort: proto.Int32(10), + Template: &v1.PodTemplateSpec{ + Spec: v1.PodSpec{ + Containers: []v1.Container{ + { + Name: "tensorflow", + }, + }, + }, + }, + TFReplicaType: tfv1alpha1.WORKER, + }, + }, + }, + }, + expectPdb: &v1beta1.PodDisruptionBudget{ + Spec: v1beta1.PodDisruptionBudgetSpec{ + MinAvailable: &minAvailable3, + Selector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + "runtime_id": "some-runtime-id", + "tf_job_name": "some-meta-name", + }, + }, + }, + }, + }, + } + + for _, c := range testCases { + recorder := record.NewFakeRecorder(100) + job, err := initJob(clientSet, &tfJobFake.Clientset{}, recorder, c.jobSpec) + if err != nil { + t.Errorf("j.initJob() error: %v", err) + } + + err = job.setupReplicas() + if err != nil { + t.Errorf("j.setupReplicas() error: %v", err) + } + + err = job.syncPdb() + if err != nil { + t.Errorf("j.Reconcile() error: %v", err) + } + + actualPdbList, err := clientSet.PolicyV1beta1().PodDisruptionBudgets(job.job.ObjectMeta.Namespace).List(metav1.ListOptions{}) + if err != nil { + t.Fatalf("Could not get PDB List: %v", err) + } + if len(actualPdbList.Items) != 1 && c.expectPdb != nil { + t.Fatalf("k8s should have one PDB but the length of actually created PDB isn't 1, Got %d", len(actualPdbList.Items)) + } + + if c.expectPdb == nil { + // non distributed training job, shouldn't have PDB + continue + } + + actualPdb := actualPdbList.Items[0] + if !reflect.DeepEqual(c.expectPdb.Spec, actualPdb.Spec) { + t.Fatalf("Got %v, Want %v", actualPdb.Spec, c.expectPdb.Spec) + } + } +} diff --git a/pkg/util/k8sutil/k8sutil.go b/pkg/util/k8sutil/k8sutil.go new file mode 100644 index 0000000000..9421504ca6 --- /dev/null +++ b/pkg/util/k8sutil/k8sutil.go @@ -0,0 +1,120 @@ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the 
License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package k8sutil + +import ( + "net" + "os" + + log "github.com/sirupsen/logrus" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/client-go/kubernetes" + _ "k8s.io/client-go/plugin/pkg/client/auth/gcp" // for gcp auth + "k8s.io/client-go/rest" + "k8s.io/client-go/tools/clientcmd" + + tfv1alpha1 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha1" +) + +const RecommendedConfigPathEnvVar = "KUBECONFIG" + +// TODO(jlewi): I think this function is used to add an owner to a resource. I think we we should use this +// method to ensure all resources created for the TFJob are owned by the TFJob. +func addOwnerRefToObject(o metav1.Object, r metav1.OwnerReference) { + o.SetOwnerReferences(append(o.GetOwnerReferences(), r)) +} + +func MustNewKubeClient() kubernetes.Interface { + cfg, err := GetClusterConfig() + if err != nil { + log.Fatal(err) + } + return kubernetes.NewForConfigOrDie(cfg) +} + +// Obtain the config from the Kube configuration used by kubeconfig, or from k8s cluster. +func GetClusterConfig() (*rest.Config, error) { + if len(os.Getenv(RecommendedConfigPathEnvVar)) > 0 { + // use the current context in kubeconfig + // This is very useful for running locally. 
+ return clientcmd.BuildConfigFromFlags("", os.Getenv(RecommendedConfigPathEnvVar)) + } + + // Work around https://github.com/kubernetes/kubernetes/issues/40973 + // See https://github.com/coreos/etcd-operator/issues/731#issuecomment-283804819 + if len(os.Getenv("KUBERNETES_SERVICE_HOST")) == 0 { + addrs, err := net.LookupHost("kubernetes.default.svc") + if err != nil { + panic(err) + } + if err := os.Setenv("KUBERNETES_SERVICE_HOST", addrs[0]); err != nil { + return nil, err + } + } + if len(os.Getenv("KUBERNETES_SERVICE_PORT")) == 0 { + if err := os.Setenv("KUBERNETES_SERVICE_PORT", "443"); err != nil { + panic(err) + } + } + return rest.InClusterConfig() +} + +func IsKubernetesResourceAlreadyExistError(err error) bool { + return apierrors.IsAlreadyExists(err) +} + +func IsKubernetesResourceNotFoundError(err error) bool { + return apierrors.IsNotFound(err) +} + +// We are using internal api types for cluster related. +func JobListOpt(clusterName string) metav1.ListOptions { + return metav1.ListOptions{ + LabelSelector: labels.SelectorFromSet(LabelsForJob(clusterName)).String(), + } +} + +func LabelsForJob(jobName string) map[string]string { + return map[string]string{ + // TODO(jlewi): Need to set appropriate labels for TF. + "tf_job": jobName, + "app": tfv1alpha1.AppLabel, + } +} + +// TODO(jlewi): CascadeDeletOptions are part of garbage collection policy. +// Do we want to use this? See +// https://kubernetes.io/docs/concepts/workloads/controllers/garbage-collection/ +func CascadeDeleteOptions(gracePeriodSeconds int64) *metav1.DeleteOptions { + return &metav1.DeleteOptions{ + GracePeriodSeconds: func(t int64) *int64 { return &t }(gracePeriodSeconds), + PropagationPolicy: func() *metav1.DeletionPropagation { + foreground := metav1.DeletePropagationForeground + return &foreground + }(), + } +} + +// mergeLabels merges l2 into l1. Conflicting labels will be skipped. 
+func mergeLabels(l1, l2 map[string]string) { + for k, v := range l2 { + if _, ok := l1[k]; ok { + continue + } + l1[k] = v + } +} diff --git a/pkg/util/util.go b/pkg/util/util.go new file mode 100644 index 0000000000..fd623b5a89 --- /dev/null +++ b/pkg/util/util.go @@ -0,0 +1,74 @@ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package util provides various helper routines. +package util + +import ( + "encoding/json" + "fmt" + "math/rand" + "time" + + log "github.com/sirupsen/logrus" +) + +const ( + // Environment variable for namespace when deployed on kubernetes + EnvKubeflowNamespace = "KUBEFLOW_NAMESPACE" +) + +// Pformat returns a pretty format output of any value that can be marshalled to JSON. 
+func Pformat(value interface{}) string { + if s, ok := value.(string); ok { + return s + } + valueJSON, err := json.MarshalIndent(value, "", " ") + if err != nil { + log.Warningf("Couldn't pretty format %v, error: %v", value, err) + return fmt.Sprintf("%v", value) + } + return string(valueJSON) +} + +var src = rand.NewSource(time.Now().UnixNano()) + +const letterBytes = "0123456789abcdefghijklmnopqrstuvwxyz" +const ( + letterIdxBits = 6 // 6 bits to represent a letter index + letterIdxMask = 1<= 0; { + if remain == 0 { + cache, remain = src.Int63(), letterIdxMax + } + if idx := int(cache & letterIdxMask); idx < len(letterBytes) { + b[i] = letterBytes[idx] + i-- + } + cache >>= letterIdxBits + remain-- + } + + return string(b) +} From b317aefd41abe5f0b68b042d7669ed831ba74bb2 Mon Sep 17 00:00:00 2001 From: Ce Gao Date: Thu, 22 Mar 2018 20:22:19 +0800 Subject: [PATCH 22/24] py: Fix style Signed-off-by: Ce Gao --- py/release.py | 2 +- py/release_test.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/py/release.py b/py/release.py index ba36759fcb..7087fa7b6c 100755 --- a/py/release.py +++ b/py/release.py @@ -176,7 +176,7 @@ def build_operator_image(root_dir, "build/images/tf_operator/Dockerfile", "examples/tf_sample/tf_sample/tf_smoke.py", os.path.join(go_path, bin_path, "tf-operator"), - os.path.join(go_path, bin_path,"e2e"), + os.path.join(go_path, bin_path, "e2e"), os.path.join(go_path, bin_path, "backend"), "dashboard/frontend/build" ] diff --git a/py/release_test.py b/py/release_test.py index e61dd173e4..7b0ef714a9 100644 --- a/py/release_test.py +++ b/py/release_test.py @@ -36,11 +36,11 @@ def test_build_postsubmit( # pylint: disable=no-self-use @mock.patch("py.release.util.install_go_deps") @mock.patch("py.release.util.clone_repo") @mock.patch("py.release.build_and_push") - def test_build_pr( + def test_build_pr(# pylint: disable=no-self-use self, mock_build_and_push, mock_clone, - _mock_install, # pylint: disable=no-self-use + 
_mock_install, _mock_os, _mock_makedirs): parser = release.build_parser() From 003fa7dc62bdf79a6c062df7ce7c2812ab5f0288 Mon Sep 17 00:00:00 2001 From: Ce Gao Date: Fri, 23 Mar 2018 15:10:02 +0800 Subject: [PATCH 23/24] examples: Keep v1alpha1 Signed-off-by: Ce Gao --- examples/crd/crd.yaml | 2 +- examples/crd/v1alpha2.yaml | 11 +++++++++++ examples/tf_job.yaml | 32 +++++++++++++++++++------------- examples/tf_job_v1alpha2.yaml | 24 ++++++++++++++++++++++++ 4 files changed, 55 insertions(+), 14 deletions(-) create mode 100644 examples/crd/v1alpha2.yaml create mode 100644 examples/tf_job_v1alpha2.yaml diff --git a/examples/crd/crd.yaml b/examples/crd/crd.yaml index 93fdf1e426..fc2bc45a2a 100644 --- a/examples/crd/crd.yaml +++ b/examples/crd/crd.yaml @@ -4,7 +4,7 @@ metadata: name: tfjobs.kubeflow.org spec: group: kubeflow.org - version: v1alpha2 + version: v1alpha1 names: kind: TFJob singular: tfjob diff --git a/examples/crd/v1alpha2.yaml b/examples/crd/v1alpha2.yaml new file mode 100644 index 0000000000..93fdf1e426 --- /dev/null +++ b/examples/crd/v1alpha2.yaml @@ -0,0 +1,11 @@ +apiVersion: apiextensions.k8s.io/v1beta1 +kind: CustomResourceDefinition +metadata: + name: tfjobs.kubeflow.org +spec: + group: kubeflow.org + version: v1alpha2 + names: + kind: TFJob + singular: tfjob + plural: tfjobs diff --git a/examples/tf_job.yaml b/examples/tf_job.yaml index 7c4017c185..4b274d03df 100644 --- a/examples/tf_job.yaml +++ b/examples/tf_job.yaml @@ -1,24 +1,30 @@ -apiVersion: "kubeflow.org/v1alpha2" +apiVersion: "kubeflow.org/v1alpha1" kind: "TFJob" metadata: - name: "example-job-1" + name: "example-job" spec: - tfReplicaSpecs: - PS: - replicas: 2 + replicaSpecs: + - replicas: 1 + tfReplicaType: MASTER template: spec: containers: - - name: ps-busybox - image: busybox - command: ["sleep", "30000"] + - image: gcr.io/tf-on-k8s-dogfood/tf_sample:dc944ff + name: tensorflow restartPolicy: OnFailure - Worker: - replicas: 4 + - replicas: 1 + tfReplicaType: WORKER template: spec: 
containers: - - name: worker-busybox - image: busybox - command: ["sleep", "30000"] + - image: gcr.io/tf-on-k8s-dogfood/tf_sample:dc944ff + name: tensorflow + restartPolicy: OnFailure + - replicas: 2 + tfReplicaType: PS + template: + spec: + containers: + - image: gcr.io/tf-on-k8s-dogfood/tf_sample:dc944ff + name: tensorflow restartPolicy: OnFailure diff --git a/examples/tf_job_v1alpha2.yaml b/examples/tf_job_v1alpha2.yaml new file mode 100644 index 0000000000..7c4017c185 --- /dev/null +++ b/examples/tf_job_v1alpha2.yaml @@ -0,0 +1,24 @@ +apiVersion: "kubeflow.org/v1alpha2" +kind: "TFJob" +metadata: + name: "example-job-1" +spec: + tfReplicaSpecs: + PS: + replicas: 2 + template: + spec: + containers: + - name: ps-busybox + image: busybox + command: ["sleep", "30000"] + restartPolicy: OnFailure + Worker: + replicas: 4 + template: + spec: + containers: + - name: worker-busybox + image: busybox + command: ["sleep", "30000"] + restartPolicy: OnFailure From 0947506abf7d81ab2639a2d590973dc426cc096e Mon Sep 17 00:00:00 2001 From: Ce Gao Date: Fri, 23 Mar 2018 22:23:03 +0800 Subject: [PATCH 24/24] linter: Fix --- linter_config.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/linter_config.json b/linter_config.json index f21d04949b..4121bf82b6 100644 --- a/linter_config.json +++ b/linter_config.json @@ -28,7 +28,7 @@ "pkg/apis/tensorflow/v1alpha2/zz_generated.deepcopy.go", "pkg/apis/tensorflow/v1alpha2/zz_generated.defaults.go", "pkg/controller/controller_utils.go", - "pkg/controller.v2/controller_utils.go" + "pkg/controller.v2/controller_utils.go", "pkg/apis/tensorflow/v1alpha1/defaults.go", "pkg/apis/tensorflow/v1alpha1/defaults_test.go", "pkg/apis/tensorflow/validation/validation_test.go"