Constants
GkeInferenceQuickstart_FetchModels_FullMethodName, GkeInferenceQuickstart_FetchModelServers_FullMethodName, GkeInferenceQuickstart_FetchModelServerVersions_FullMethodName, GkeInferenceQuickstart_FetchProfiles_FullMethodName, GkeInferenceQuickstart_GenerateOptimizedManifest_FullMethodName, GkeInferenceQuickstart_FetchBenchmarkingData_FullMethodName
const (
GkeInferenceQuickstart_FetchModels_FullMethodName = "/google.cloud.gkerecommender.v1.GkeInferenceQuickstart/FetchModels"
GkeInferenceQuickstart_FetchModelServers_FullMethodName = "/google.cloud.gkerecommender.v1.GkeInferenceQuickstart/FetchModelServers"
GkeInferenceQuickstart_FetchModelServerVersions_FullMethodName = "/google.cloud.gkerecommender.v1.GkeInferenceQuickstart/FetchModelServerVersions"
GkeInferenceQuickstart_FetchProfiles_FullMethodName = "/google.cloud.gkerecommender.v1.GkeInferenceQuickstart/FetchProfiles"
GkeInferenceQuickstart_GenerateOptimizedManifest_FullMethodName = "/google.cloud.gkerecommender.v1.GkeInferenceQuickstart/GenerateOptimizedManifest"
GkeInferenceQuickstart_FetchBenchmarkingData_FullMethodName = "/google.cloud.gkerecommender.v1.GkeInferenceQuickstart/FetchBenchmarkingData"
)

Variables
File_google_cloud_gkerecommender_v1_gkerecommender_proto

var File_google_cloud_gkerecommender_v1_gkerecommender_proto protoreflect.FileDescriptor

GkeInferenceQuickstart_ServiceDesc
var GkeInferenceQuickstart_ServiceDesc = grpc.ServiceDesc{
ServiceName: "google.cloud.gkerecommender.v1.GkeInferenceQuickstart",
HandlerType: (*GkeInferenceQuickstartServer)(nil),
Methods: []grpc.MethodDesc{
{
MethodName: "FetchModels",
Handler: _GkeInferenceQuickstart_FetchModels_Handler,
},
{
MethodName: "FetchModelServers",
Handler: _GkeInferenceQuickstart_FetchModelServers_Handler,
},
{
MethodName: "FetchModelServerVersions",
Handler: _GkeInferenceQuickstart_FetchModelServerVersions_Handler,
},
{
MethodName: "FetchProfiles",
Handler: _GkeInferenceQuickstart_FetchProfiles_Handler,
},
{
MethodName: "GenerateOptimizedManifest",
Handler: _GkeInferenceQuickstart_GenerateOptimizedManifest_Handler,
},
{
MethodName: "FetchBenchmarkingData",
Handler: _GkeInferenceQuickstart_FetchBenchmarkingData_Handler,
},
},
Streams: []grpc.StreamDesc{},
Metadata: "google/cloud/gkerecommender/v1/gkerecommender.proto",
}

GkeInferenceQuickstart_ServiceDesc is the grpc.ServiceDesc for the GkeInferenceQuickstart service. It is only intended for direct use with grpc.RegisterService, and is not to be introspected or modified (even as a copy).
Functions
func RegisterGkeInferenceQuickstartServer
func RegisterGkeInferenceQuickstartServer(s grpc.ServiceRegistrar, srv GkeInferenceQuickstartServer)

Amount
type Amount struct {
// Output only. The whole units of the amount.
// For example if `currencyCode` is `"USD"`, then 1 unit is one US dollar.
Units int64 `protobuf:"varint,1,opt,name=units,proto3" json:"units,omitempty"`
// Output only. Number of nano (10^-9) units of the amount.
// The value must be between -999,999,999 and +999,999,999 inclusive.
// If `units` is positive, `nanos` must be positive or zero.
// If `units` is zero, `nanos` can be positive, zero, or negative.
// If `units` is negative, `nanos` must be negative or zero.
// For example $-1.75 is represented as `units`=-1 and `nanos`=-750,000,000.
Nanos int32 `protobuf:"varint,2,opt,name=nanos,proto3" json:"nanos,omitempty"`
// contains filtered or unexported fields
}

Represents an amount of money in a specific currency.

func (*Amount) Descriptor
Deprecated: Use Amount.ProtoReflect.Descriptor instead.
func (*Amount) GetNanos
func (*Amount) GetUnits
func (*Amount) ProtoMessage
func (*Amount) ProtoMessage()
func (*Amount) ProtoReflect
func (x *Amount) ProtoReflect() protoreflect.Message
func (*Amount) Reset
func (x *Amount) Reset()
func (*Amount) String
Cost
type Cost struct {
// Optional. The cost per million output tokens, calculated as:
// $/output token = GPU $/s / (1/output-to-input-cost-ratio * input tokens/s +
// output tokens/s)
CostPerMillionOutputTokens *Amount `protobuf:"bytes,1,opt,name=cost_per_million_output_tokens,json=costPerMillionOutputTokens,proto3" json:"cost_per_million_output_tokens,omitempty"`
// Optional. The cost per million input tokens. $/input token = ($/output
// token) / output-to-input-cost-ratio.
CostPerMillionInputTokens *Amount `protobuf:"bytes,2,opt,name=cost_per_million_input_tokens,json=costPerMillionInputTokens,proto3" json:"cost_per_million_input_tokens,omitempty"`
// Optional. The pricing model used to calculate the cost. Can be one of:
// `3-years-cud`, `1-year-cud`, `on-demand`, `spot`. If not provided, `spot`
// will be used.
PricingModel string `protobuf:"bytes,3,opt,name=pricing_model,json=pricingModel,proto3" json:"pricing_model,omitempty"`
// Optional. The output-to-input cost ratio. This determines how the total GPU
// cost is split between input and output tokens. If not provided, `4.0` is
// used, assuming a 4:1 output:input cost ratio.
OutputInputCostRatio *float32 `protobuf:"fixed32,4,opt,name=output_input_cost_ratio,json=outputInputCostRatio,proto3,oneof" json:"output_input_cost_ratio,omitempty"`
// contains filtered or unexported fields
}

Cost for running a model deployment on a given instance type. Currently, only the USD currency code is supported.

func (*Cost) Descriptor
Deprecated: Use Cost.ProtoReflect.Descriptor instead.
func (*Cost) GetCostPerMillionInputTokens
func (*Cost) GetCostPerMillionOutputTokens
func (*Cost) GetOutputInputCostRatio
func (*Cost) GetPricingModel
func (*Cost) ProtoMessage
func (*Cost) ProtoMessage()
func (*Cost) ProtoReflect
func (x *Cost) ProtoReflect() protoreflect.Message
func (*Cost) Reset
func (x *Cost) Reset()
func (*Cost) String
FetchBenchmarkingDataRequest
type FetchBenchmarkingDataRequest struct {
// Required. The model server configuration to get benchmarking data for. Use
// [GkeInferenceQuickstart.FetchProfiles][google.cloud.gkerecommender.v1.GkeInferenceQuickstart.FetchProfiles]
// to find valid configurations.
ModelServerInfo *ModelServerInfo `protobuf:"bytes,1,opt,name=model_server_info,json=modelServerInfo,proto3" json:"model_server_info,omitempty"`
// Optional. The instance type to filter benchmarking data. Instance types are
// in the format `a2-highgpu-1g`. If not provided, all instance types for the
// given profile's `model_server_info` will be returned. Use
// [GkeInferenceQuickstart.FetchProfiles][google.cloud.gkerecommender.v1.GkeInferenceQuickstart.FetchProfiles]
// to find available instance types.
InstanceType string `protobuf:"bytes,3,opt,name=instance_type,json=instanceType,proto3" json:"instance_type,omitempty"`
// Optional. The pricing model to use for the benchmarking data. Defaults to
// `spot`.
PricingModel string `protobuf:"bytes,4,opt,name=pricing_model,json=pricingModel,proto3" json:"pricing_model,omitempty"`
// contains filtered or unexported fields
}

Request message for [GkeInferenceQuickstart.FetchBenchmarkingData][google.cloud.gkerecommender.v1.GkeInferenceQuickstart.FetchBenchmarkingData].

func (*FetchBenchmarkingDataRequest) Descriptor
func (*FetchBenchmarkingDataRequest) Descriptor() ([]byte, []int)
Deprecated: Use FetchBenchmarkingDataRequest.ProtoReflect.Descriptor instead.
func (*FetchBenchmarkingDataRequest) GetInstanceType
func (x *FetchBenchmarkingDataRequest) GetInstanceType() string
func (*FetchBenchmarkingDataRequest) GetModelServerInfo
func (x *FetchBenchmarkingDataRequest) GetModelServerInfo() *ModelServerInfo
func (*FetchBenchmarkingDataRequest) GetPricingModel
func (x *FetchBenchmarkingDataRequest) GetPricingModel() string
func (*FetchBenchmarkingDataRequest) ProtoMessage
func (*FetchBenchmarkingDataRequest) ProtoMessage()
func (*FetchBenchmarkingDataRequest) ProtoReflect
func (x *FetchBenchmarkingDataRequest) ProtoReflect() protoreflect.Message
func (*FetchBenchmarkingDataRequest) Reset
func (x *FetchBenchmarkingDataRequest) Reset()
func (*FetchBenchmarkingDataRequest) String
func (x *FetchBenchmarkingDataRequest) String() string

FetchBenchmarkingDataResponse
type FetchBenchmarkingDataResponse struct {
// Output only. List of profiles containing their respective benchmarking
// data.
Profile []*Profile `protobuf:"bytes,1,rep,name=profile,proto3" json:"profile,omitempty"`
// contains filtered or unexported fields
}

Response message for [GkeInferenceQuickstart.FetchBenchmarkingData][google.cloud.gkerecommender.v1.GkeInferenceQuickstart.FetchBenchmarkingData].

func (*FetchBenchmarkingDataResponse) Descriptor
func (*FetchBenchmarkingDataResponse) Descriptor() ([]byte, []int)
Deprecated: Use FetchBenchmarkingDataResponse.ProtoReflect.Descriptor instead.
func (*FetchBenchmarkingDataResponse) GetProfile
func (x *FetchBenchmarkingDataResponse) GetProfile() []*Profile
func (*FetchBenchmarkingDataResponse) ProtoMessage
func (*FetchBenchmarkingDataResponse) ProtoMessage()
func (*FetchBenchmarkingDataResponse) ProtoReflect
func (x *FetchBenchmarkingDataResponse) ProtoReflect() protoreflect.Message
func (*FetchBenchmarkingDataResponse) Reset
func (x *FetchBenchmarkingDataResponse) Reset()
func (*FetchBenchmarkingDataResponse) String
func (x *FetchBenchmarkingDataResponse) String() string

FetchModelServerVersionsRequest
type FetchModelServerVersionsRequest struct {
// Required. The model for which to list model server versions. Open-source
// models follow the Huggingface Hub `owner/model_name` format. Use
// [GkeInferenceQuickstart.FetchModels][google.cloud.gkerecommender.v1.GkeInferenceQuickstart.FetchModels]
// to find available models.
Model string `protobuf:"bytes,1,opt,name=model,proto3" json:"model,omitempty"`
// Required. The model server for which to list versions. Open-source model
// servers use simplified, lowercase names (e.g., `vllm`). Use
// [GkeInferenceQuickstart.FetchModelServers][google.cloud.gkerecommender.v1.GkeInferenceQuickstart.FetchModelServers]
// to find available model servers.
ModelServer string `protobuf:"bytes,2,opt,name=model_server,json=modelServer,proto3" json:"model_server,omitempty"`
// Optional. The target number of results to return in a single response.
// If not specified, a default value will be chosen by the service.
// Note that the response may include a partial list and a caller should
// only rely on the response's
// [next_page_token][google.cloud.gkerecommender.v1.FetchModelServerVersionsResponse.next_page_token]
// to determine if there are more instances left to be queried.
PageSize *int32 `protobuf:"varint,3,opt,name=page_size,json=pageSize,proto3,oneof" json:"page_size,omitempty"`
// Optional. The value of
// [next_page_token][google.cloud.gkerecommender.v1.FetchModelServerVersionsResponse.next_page_token]
// received from a previous `FetchModelServerVersionsRequest` call.
// Provide this to retrieve the subsequent page in a multi-page list of
// results. When paginating, all other parameters provided to
// `FetchModelServerVersionsRequest` must match the call that provided the
// page token.
PageToken *string `protobuf:"bytes,4,opt,name=page_token,json=pageToken,proto3,oneof" json:"page_token,omitempty"`
// contains filtered or unexported fields
}

Request message for [GkeInferenceQuickstart.FetchModelServerVersions][google.cloud.gkerecommender.v1.GkeInferenceQuickstart.FetchModelServerVersions].

func (*FetchModelServerVersionsRequest) Descriptor
func (*FetchModelServerVersionsRequest) Descriptor() ([]byte, []int)
Deprecated: Use FetchModelServerVersionsRequest.ProtoReflect.Descriptor instead.
func (*FetchModelServerVersionsRequest) GetModel
func (x *FetchModelServerVersionsRequest) GetModel() string
func (*FetchModelServerVersionsRequest) GetModelServer
func (x *FetchModelServerVersionsRequest) GetModelServer() string
func (*FetchModelServerVersionsRequest) GetPageSize
func (x *FetchModelServerVersionsRequest) GetPageSize() int32
func (*FetchModelServerVersionsRequest) GetPageToken
func (x *FetchModelServerVersionsRequest) GetPageToken() string
func (*FetchModelServerVersionsRequest) ProtoMessage
func (*FetchModelServerVersionsRequest) ProtoMessage()
func (*FetchModelServerVersionsRequest) ProtoReflect
func (x *FetchModelServerVersionsRequest) ProtoReflect() protoreflect.Message
func (*FetchModelServerVersionsRequest) Reset
func (x *FetchModelServerVersionsRequest) Reset()
func (*FetchModelServerVersionsRequest) String
func (x *FetchModelServerVersionsRequest) String() string

FetchModelServerVersionsResponse
type FetchModelServerVersionsResponse struct {
// Output only. A list of available model server versions.
ModelServerVersions []string `protobuf:"bytes,1,rep,name=model_server_versions,json=modelServerVersions,proto3" json:"model_server_versions,omitempty"`
// Output only. A token which may be sent as
// [page_token][FetchModelServerVersionsResponse.page_token] in a subsequent
// `FetchModelServerVersionsResponse` call to retrieve the next page of
// results. If this field is omitted or empty, then there are no more results
// to return.
NextPageToken string `protobuf:"bytes,2,opt,name=next_page_token,json=nextPageToken,proto3" json:"next_page_token,omitempty"`
// contains filtered or unexported fields
}

Response message for [GkeInferenceQuickstart.FetchModelServerVersions][google.cloud.gkerecommender.v1.GkeInferenceQuickstart.FetchModelServerVersions].

func (*FetchModelServerVersionsResponse) Descriptor
func (*FetchModelServerVersionsResponse) Descriptor() ([]byte, []int)
Deprecated: Use FetchModelServerVersionsResponse.ProtoReflect.Descriptor instead.
func (*FetchModelServerVersionsResponse) GetModelServerVersions
func (x *FetchModelServerVersionsResponse) GetModelServerVersions() []string
func (*FetchModelServerVersionsResponse) GetNextPageToken
func (x *FetchModelServerVersionsResponse) GetNextPageToken() string
func (*FetchModelServerVersionsResponse) ProtoMessage
func (*FetchModelServerVersionsResponse) ProtoMessage()
func (*FetchModelServerVersionsResponse) ProtoReflect
func (x *FetchModelServerVersionsResponse) ProtoReflect() protoreflect.Message
func (*FetchModelServerVersionsResponse) Reset
func (x *FetchModelServerVersionsResponse) Reset()
func (*FetchModelServerVersionsResponse) String
func (x *FetchModelServerVersionsResponse) String() string

FetchModelServersRequest
type FetchModelServersRequest struct {
// Required. The model for which to list model servers. Open-source models
// follow the Huggingface Hub `owner/model_name` format. Use
// [GkeInferenceQuickstart.FetchModels][google.cloud.gkerecommender.v1.GkeInferenceQuickstart.FetchModels]
// to find available models.
Model string `protobuf:"bytes,1,opt,name=model,proto3" json:"model,omitempty"`
// Optional. The target number of results to return in a single response.
// If not specified, a default value will be chosen by the service.
// Note that the response may include a partial list and a caller should
// only rely on the response's
// [next_page_token][google.cloud.gkerecommender.v1.FetchModelServersResponse.next_page_token]
// to determine if there are more instances left to be queried.
PageSize *int32 `protobuf:"varint,2,opt,name=page_size,json=pageSize,proto3,oneof" json:"page_size,omitempty"`
// Optional. The value of
// [next_page_token][google.cloud.gkerecommender.v1.FetchModelServersResponse.next_page_token]
// received from a previous `FetchModelServersRequest` call.
// Provide this to retrieve the subsequent page in a multi-page list of
// results. When paginating, all other parameters provided to
// `FetchModelServersRequest` must match the call that provided the page
// token.
PageToken *string `protobuf:"bytes,3,opt,name=page_token,json=pageToken,proto3,oneof" json:"page_token,omitempty"`
// contains filtered or unexported fields
}

Request message for [GkeInferenceQuickstart.FetchModelServers][google.cloud.gkerecommender.v1.GkeInferenceQuickstart.FetchModelServers].

func (*FetchModelServersRequest) Descriptor
func (*FetchModelServersRequest) Descriptor() ([]byte, []int)
Deprecated: Use FetchModelServersRequest.ProtoReflect.Descriptor instead.
func (*FetchModelServersRequest) GetModel
func (x *FetchModelServersRequest) GetModel() string
func (*FetchModelServersRequest) GetPageSize
func (x *FetchModelServersRequest) GetPageSize() int32
func (*FetchModelServersRequest) GetPageToken
func (x *FetchModelServersRequest) GetPageToken() string
func (*FetchModelServersRequest) ProtoMessage
func (*FetchModelServersRequest) ProtoMessage()
func (*FetchModelServersRequest) ProtoReflect
func (x *FetchModelServersRequest) ProtoReflect() protoreflect.Message
func (*FetchModelServersRequest) Reset
func (x *FetchModelServersRequest) Reset()
func (*FetchModelServersRequest) String
func (x *FetchModelServersRequest) String() string

FetchModelServersResponse
type FetchModelServersResponse struct {
// Output only. List of available model servers. Open-source model servers use
// simplified, lowercase names (e.g., `vllm`).
ModelServers []string `protobuf:"bytes,1,rep,name=model_servers,json=modelServers,proto3" json:"model_servers,omitempty"`
// Output only. A token which may be sent as
// [page_token][FetchModelServersResponse.page_token] in a subsequent
// `FetchModelServersResponse` call to retrieve the next page of results.
// If this field is omitted or empty, then there are no more results to
// return.
NextPageToken string `protobuf:"bytes,2,opt,name=next_page_token,json=nextPageToken,proto3" json:"next_page_token,omitempty"`
// contains filtered or unexported fields
}

Response message for [GkeInferenceQuickstart.FetchModelServers][google.cloud.gkerecommender.v1.GkeInferenceQuickstart.FetchModelServers].

func (*FetchModelServersResponse) Descriptor
func (*FetchModelServersResponse) Descriptor() ([]byte, []int)
Deprecated: Use FetchModelServersResponse.ProtoReflect.Descriptor instead.
func (*FetchModelServersResponse) GetModelServers
func (x *FetchModelServersResponse) GetModelServers() []string
func (*FetchModelServersResponse) GetNextPageToken
func (x *FetchModelServersResponse) GetNextPageToken() string
func (*FetchModelServersResponse) ProtoMessage
func (*FetchModelServersResponse) ProtoMessage()
func (*FetchModelServersResponse) ProtoReflect
func (x *FetchModelServersResponse) ProtoReflect() protoreflect.Message
func (*FetchModelServersResponse) Reset
func (x *FetchModelServersResponse) Reset()
func (*FetchModelServersResponse) String
func (x *FetchModelServersResponse) String() string

FetchModelsRequest
type FetchModelsRequest struct {
// Optional. The target number of results to return in a single response.
// If not specified, a default value will be chosen by the service.
// Note that the response may include a partial list and a caller should
// only rely on the response's
// [next_page_token][google.cloud.gkerecommender.v1.FetchModelsResponse.next_page_token]
// to determine if there are more instances left to be queried.
PageSize *int32 `protobuf:"varint,1,opt,name=page_size,json=pageSize,proto3,oneof" json:"page_size,omitempty"`
// Optional. The value of
// [next_page_token][google.cloud.gkerecommender.v1.FetchModelsResponse.next_page_token]
// received from a previous `FetchModelsRequest` call.
// Provide this to retrieve the subsequent page in a multi-page list of
// results. When paginating, all other parameters provided to
// `FetchModelsRequest` must match the call that provided the page token.
PageToken *string `protobuf:"bytes,2,opt,name=page_token,json=pageToken,proto3,oneof" json:"page_token,omitempty"`
// contains filtered or unexported fields
}

Request message for [GkeInferenceQuickstart.FetchModels][google.cloud.gkerecommender.v1.GkeInferenceQuickstart.FetchModels].

func (*FetchModelsRequest) Descriptor
func (*FetchModelsRequest) Descriptor() ([]byte, []int)
Deprecated: Use FetchModelsRequest.ProtoReflect.Descriptor instead.
func (*FetchModelsRequest) GetPageSize
func (x *FetchModelsRequest) GetPageSize() int32
func (*FetchModelsRequest) GetPageToken
func (x *FetchModelsRequest) GetPageToken() string
func (*FetchModelsRequest) ProtoMessage
func (*FetchModelsRequest) ProtoMessage()
func (*FetchModelsRequest) ProtoReflect
func (x *FetchModelsRequest) ProtoReflect() protoreflect.Message
func (*FetchModelsRequest) Reset
func (x *FetchModelsRequest) Reset()
func (*FetchModelsRequest) String
func (x *FetchModelsRequest) String() string

FetchModelsResponse
type FetchModelsResponse struct {
// Output only. List of available models. Open-source models follow the
// Huggingface Hub `owner/model_name` format.
Models []string `protobuf:"bytes,1,rep,name=models,proto3" json:"models,omitempty"`
// Output only. A token which may be sent as
// [page_token][FetchModelsResponse.page_token] in a subsequent
// `FetchModelsResponse` call to retrieve the next page of results.
// If this field is omitted or empty, then there are no more results to
// return.
NextPageToken string `protobuf:"bytes,2,opt,name=next_page_token,json=nextPageToken,proto3" json:"next_page_token,omitempty"`
// contains filtered or unexported fields
}

Response message for [GkeInferenceQuickstart.FetchModels][google.cloud.gkerecommender.v1.GkeInferenceQuickstart.FetchModels].

func (*FetchModelsResponse) Descriptor
func (*FetchModelsResponse) Descriptor() ([]byte, []int)
Deprecated: Use FetchModelsResponse.ProtoReflect.Descriptor instead.
func (*FetchModelsResponse) GetModels
func (x *FetchModelsResponse) GetModels() []string
func (*FetchModelsResponse) GetNextPageToken
func (x *FetchModelsResponse) GetNextPageToken() string
func (*FetchModelsResponse) ProtoMessage
func (*FetchModelsResponse) ProtoMessage()
func (*FetchModelsResponse) ProtoReflect
func (x *FetchModelsResponse) ProtoReflect() protoreflect.Message
func (*FetchModelsResponse) Reset
func (x *FetchModelsResponse) Reset()
func (*FetchModelsResponse) String
func (x *FetchModelsResponse) String() string

FetchProfilesRequest
type FetchProfilesRequest struct {
// Optional. The model to filter profiles by. Open-source models follow the
// Huggingface Hub `owner/model_name` format. If not provided, all models are
// returned. Use
// [GkeInferenceQuickstart.FetchModels][google.cloud.gkerecommender.v1.GkeInferenceQuickstart.FetchModels]
// to find available models.
Model string `protobuf:"bytes,1,opt,name=model,proto3" json:"model,omitempty"`
// Optional. The model server to filter profiles by. If not provided, all
// model servers are returned. Use
// [GkeInferenceQuickstart.FetchModelServers][google.cloud.gkerecommender.v1.GkeInferenceQuickstart.FetchModelServers]
// to find available model servers for a given model.
ModelServer string `protobuf:"bytes,2,opt,name=model_server,json=modelServer,proto3" json:"model_server,omitempty"`
// Optional. The model server version to filter profiles by. If not provided,
// all model server versions are returned. Use
// [GkeInferenceQuickstart.FetchModelServerVersions][google.cloud.gkerecommender.v1.GkeInferenceQuickstart.FetchModelServerVersions]
// to find available versions for a given model and server.
ModelServerVersion string `protobuf:"bytes,3,opt,name=model_server_version,json=modelServerVersion,proto3" json:"model_server_version,omitempty"`
// Optional. The performance requirements to filter profiles. Profiles that do
// not meet these requirements are filtered out. If not provided, all profiles
// are returned.
PerformanceRequirements *PerformanceRequirements `protobuf:"bytes,4,opt,name=performance_requirements,json=performanceRequirements,proto3" json:"performance_requirements,omitempty"`
// Optional. The target number of results to return in a single response. If
// not specified, a default value will be chosen by the service. Note that the
// response may include a partial list and a caller should only rely on the
// response's
// [next_page_token][google.cloud.gkerecommender.v1.FetchProfilesResponse.next_page_token]
// to determine if there are more instances left to be queried.
PageSize *int32 `protobuf:"varint,5,opt,name=page_size,json=pageSize,proto3,oneof" json:"page_size,omitempty"`
// Optional. The value of
// [next_page_token][google.cloud.gkerecommender.v1.FetchProfilesResponse.next_page_token]
// received from a previous `FetchProfilesRequest` call.
// Provide this to retrieve the subsequent page in a multi-page list of
// results. When paginating, all other parameters provided to
// `FetchProfilesRequest` must match the call that provided the page
// token.
PageToken *string `protobuf:"bytes,6,opt,name=page_token,json=pageToken,proto3,oneof" json:"page_token,omitempty"`
// contains filtered or unexported fields
}

Request message for [GkeInferenceQuickstart.FetchProfiles][google.cloud.gkerecommender.v1.GkeInferenceQuickstart.FetchProfiles].

func (*FetchProfilesRequest) Descriptor
func (*FetchProfilesRequest) Descriptor() ([]byte, []int)
Deprecated: Use FetchProfilesRequest.ProtoReflect.Descriptor instead.
func (*FetchProfilesRequest) GetModel
func (x *FetchProfilesRequest) GetModel() string
func (*FetchProfilesRequest) GetModelServer
func (x *FetchProfilesRequest) GetModelServer() string
func (*FetchProfilesRequest) GetModelServerVersion
func (x *FetchProfilesRequest) GetModelServerVersion() string
func (*FetchProfilesRequest) GetPageSize
func (x *FetchProfilesRequest) GetPageSize() int32
func (*FetchProfilesRequest) GetPageToken
func (x *FetchProfilesRequest) GetPageToken() string
func (*FetchProfilesRequest) GetPerformanceRequirements
func (x *FetchProfilesRequest) GetPerformanceRequirements() *PerformanceRequirements
func (*FetchProfilesRequest) ProtoMessage
func (*FetchProfilesRequest) ProtoMessage()
func (*FetchProfilesRequest) ProtoReflect
func (x *FetchProfilesRequest) ProtoReflect() protoreflect.Message
func (*FetchProfilesRequest) Reset
func (x *FetchProfilesRequest) Reset()
func (*FetchProfilesRequest) String
func (x *FetchProfilesRequest) String() string

FetchProfilesResponse
type FetchProfilesResponse struct {
// Output only. List of profiles that match the given model server info and
// performance requirements (if provided).
Profile []*Profile `protobuf:"bytes,1,rep,name=profile,proto3" json:"profile,omitempty"`
// Output only. The combined range of performance values observed across all
// profiles in this response.
PerformanceRange *PerformanceRange `protobuf:"bytes,2,opt,name=performance_range,json=performanceRange,proto3" json:"performance_range,omitempty"`
// Output only. Additional comments related to the response.
Comments string `protobuf:"bytes,3,opt,name=comments,proto3" json:"comments,omitempty"`
// Output only. A token which may be sent as
// [page_token][FetchProfilesResponse.page_token] in a subsequent
// `FetchProfilesResponse` call to retrieve the next page of results. If this
// field is omitted or empty, then there are no more results to return.
NextPageToken string `protobuf:"bytes,4,opt,name=next_page_token,json=nextPageToken,proto3" json:"next_page_token,omitempty"`
// contains filtered or unexported fields
}

Response message for [GkeInferenceQuickstart.FetchProfiles][google.cloud.gkerecommender.v1.GkeInferenceQuickstart.FetchProfiles].

func (*FetchProfilesResponse) Descriptor
func (*FetchProfilesResponse) Descriptor() ([]byte, []int)
Deprecated: Use FetchProfilesResponse.ProtoReflect.Descriptor instead.
func (*FetchProfilesResponse) GetComments
func (x *FetchProfilesResponse) GetComments() string
func (*FetchProfilesResponse) GetNextPageToken
func (x *FetchProfilesResponse) GetNextPageToken() string
func (*FetchProfilesResponse) GetPerformanceRange
func (x *FetchProfilesResponse) GetPerformanceRange() *PerformanceRange
func (*FetchProfilesResponse) GetProfile
func (x *FetchProfilesResponse) GetProfile() []*Profile
func (*FetchProfilesResponse) ProtoMessage
func (*FetchProfilesResponse) ProtoMessage()
func (*FetchProfilesResponse) ProtoReflect
func (x *FetchProfilesResponse) ProtoReflect() protoreflect.Message
func (*FetchProfilesResponse) Reset
func (x *FetchProfilesResponse) Reset()
func (*FetchProfilesResponse) String
func (x *FetchProfilesResponse) String() string

GenerateOptimizedManifestRequest
type GenerateOptimizedManifestRequest struct {
// Required. The model server configuration to generate the manifest for. Use
// [GkeInferenceQuickstart.FetchProfiles][google.cloud.gkerecommender.v1.GkeInferenceQuickstart.FetchProfiles]
// to find valid configurations.
ModelServerInfo *ModelServerInfo `protobuf:"bytes,1,opt,name=model_server_info,json=modelServerInfo,proto3" json:"model_server_info,omitempty"`
// Required. The accelerator type. Use
// [GkeInferenceQuickstart.FetchProfiles][google.cloud.gkerecommender.v1.GkeInferenceQuickstart.FetchProfiles]
// to find valid accelerators for a given `model_server_info`.
AcceleratorType string `protobuf:"bytes,2,opt,name=accelerator_type,json=acceleratorType,proto3" json:"accelerator_type,omitempty"`
// Optional. The kubernetes namespace to deploy the manifests in.
KubernetesNamespace string `protobuf:"bytes,3,opt,name=kubernetes_namespace,json=kubernetesNamespace,proto3" json:"kubernetes_namespace,omitempty"`
// Optional. The performance requirements to use for generating Horizontal Pod
// Autoscaler (HPA) resources. If provided, the manifest includes HPA
// resources to adjust the model server replica count to maintain the
// specified targets (e.g., NTPOT, TTFT) at a P50 latency. Cost targets are
// not currently supported for HPA generation. If the specified targets are
// not achievable, the HPA manifest will not be generated.
PerformanceRequirements *PerformanceRequirements `protobuf:"bytes,4,opt,name=performance_requirements,json=performanceRequirements,proto3" json:"performance_requirements,omitempty"`
// Optional. The storage configuration for the model. If not provided, the
// model is loaded from Huggingface.
StorageConfig *StorageConfig `protobuf:"bytes,5,opt,name=storage_config,json=storageConfig,proto3" json:"storage_config,omitempty"`
// contains filtered or unexported fields
}

Request message for [GkeInferenceQuickstart.GenerateOptimizedManifest][google.cloud.gkerecommender.v1.GkeInferenceQuickstart.GenerateOptimizedManifest].

func (*GenerateOptimizedManifestRequest) Descriptor
func (*GenerateOptimizedManifestRequest) Descriptor() ([]byte, []int)
Deprecated: Use GenerateOptimizedManifestRequest.ProtoReflect.Descriptor instead.
func (*GenerateOptimizedManifestRequest) GetAcceleratorType
func (x *GenerateOptimizedManifestRequest) GetAcceleratorType() string
func (*GenerateOptimizedManifestRequest) GetKubernetesNamespace
func (x *GenerateOptimizedManifestRequest) GetKubernetesNamespace() string
func (*GenerateOptimizedManifestRequest) GetModelServerInfo
func (x *GenerateOptimizedManifestRequest) GetModelServerInfo() *ModelServerInfo
func (*GenerateOptimizedManifestRequest) GetPerformanceRequirements
func (x *GenerateOptimizedManifestRequest) GetPerformanceRequirements() *PerformanceRequirements
func (*GenerateOptimizedManifestRequest) GetStorageConfig
func (x *GenerateOptimizedManifestRequest) GetStorageConfig() *StorageConfig
func (*GenerateOptimizedManifestRequest) ProtoMessage
func (*GenerateOptimizedManifestRequest) ProtoMessage()
func (*GenerateOptimizedManifestRequest) ProtoReflect
func (x *GenerateOptimizedManifestRequest) ProtoReflect() protoreflect.Message
func (*GenerateOptimizedManifestRequest) Reset
func (x *GenerateOptimizedManifestRequest) Reset()
func (*GenerateOptimizedManifestRequest) String
func (x *GenerateOptimizedManifestRequest) String() string

GenerateOptimizedManifestResponse
type GenerateOptimizedManifestResponse struct {
// Output only. A list of generated Kubernetes manifests.
KubernetesManifests []*KubernetesManifest `protobuf:"bytes,1,rep,name=kubernetes_manifests,json=kubernetesManifests,proto3" json:"kubernetes_manifests,omitempty"`
// Output only. Comments related to deploying the generated manifests.
Comments []string `protobuf:"bytes,2,rep,name=comments,proto3" json:"comments,omitempty"`
// Output only. Additional information about the versioned dependencies used
// to generate the manifests. See [Run best practice inference with GKE
// Inference Quickstart
// recipes](https://cloud.google.com/kubernetes-engine/docs/how-to/machine-learning/inference/inference-quickstart)
// for details.
ManifestVersion string `protobuf:"bytes,3,opt,name=manifest_version,json=manifestVersion,proto3" json:"manifest_version,omitempty"`
// contains filtered or unexported fields
}Response message for [GkeInferenceQuickstart.GenerateOptimizedManifest][google.cloud.gkerecommender.v1.GkeInferenceQuickstart.GenerateOptimizedManifest].
func (*GenerateOptimizedManifestResponse) Descriptor
func (*GenerateOptimizedManifestResponse) Descriptor() ([]byte, []int)Deprecated: Use GenerateOptimizedManifestResponse.ProtoReflect.Descriptor instead.
func (*GenerateOptimizedManifestResponse) GetComments
func (x *GenerateOptimizedManifestResponse) GetComments() []stringfunc (*GenerateOptimizedManifestResponse) GetKubernetesManifests
func (x *GenerateOptimizedManifestResponse) GetKubernetesManifests() []*KubernetesManifestfunc (*GenerateOptimizedManifestResponse) GetManifestVersion
func (x *GenerateOptimizedManifestResponse) GetManifestVersion() stringfunc (*GenerateOptimizedManifestResponse) ProtoMessage
func (*GenerateOptimizedManifestResponse) ProtoMessage()func (*GenerateOptimizedManifestResponse) ProtoReflect
func (x *GenerateOptimizedManifestResponse) ProtoReflect() protoreflect.Messagefunc (*GenerateOptimizedManifestResponse) Reset
func (x *GenerateOptimizedManifestResponse) Reset()func (*GenerateOptimizedManifestResponse) String
func (x *GenerateOptimizedManifestResponse) String() stringGkeInferenceQuickstartClient
type GkeInferenceQuickstartClient interface {
// Fetches available models. Open-source models follow the Huggingface Hub
// `owner/model_name` format.
FetchModels(ctx context.Context, in *FetchModelsRequest, opts ...grpc.CallOption) (*FetchModelsResponse, error)
// Fetches available model servers. Open-source model servers use simplified,
// lowercase names (e.g., `vllm`).
FetchModelServers(ctx context.Context, in *FetchModelServersRequest, opts ...grpc.CallOption) (*FetchModelServersResponse, error)
// Fetches available model server versions. Open-source servers use their own
// versioning schemas (e.g., `vllm` uses semver like `v1.0.0`).
//
// Some model servers have different versioning schemas depending on the
// accelerator. For example, `vllm` uses semver on GPUs, but returns nightly
// build tags on TPUs. All available versions will be returned when different
// schemas are present.
FetchModelServerVersions(ctx context.Context, in *FetchModelServerVersionsRequest, opts ...grpc.CallOption) (*FetchModelServerVersionsResponse, error)
// Fetches available profiles. A profile contains performance metrics and
// cost information for a specific model server setup. Profiles can be
// filtered by parameters. If no filters are provided, all profiles are
// returned.
//
// Profiles display a single value per performance metric based on the
// provided performance requirements. If no requirements are given, the
// metrics represent the inflection point. See [Run best practice inference
// with GKE Inference Quickstart
// recipes](https://cloud.google.com/kubernetes-engine/docs/how-to/machine-learning/inference/inference-quickstart#how)
// for details.
FetchProfiles(ctx context.Context, in *FetchProfilesRequest, opts ...grpc.CallOption) (*FetchProfilesResponse, error)
// Generates an optimized deployment manifest for a given model and model
// server, based on the specified accelerator, performance targets, and
// configurations. See [Run best practice inference with GKE Inference
// Quickstart
// recipes](https://cloud.google.com/kubernetes-engine/docs/how-to/machine-learning/inference/inference-quickstart)
// for deployment details.
GenerateOptimizedManifest(ctx context.Context, in *GenerateOptimizedManifestRequest, opts ...grpc.CallOption) (*GenerateOptimizedManifestResponse, error)
// Fetches all of the benchmarking data available for a profile. Benchmarking
// data returns all of the performance metrics available for a given model
// server setup on a given instance type.
FetchBenchmarkingData(ctx context.Context, in *FetchBenchmarkingDataRequest, opts ...grpc.CallOption) (*FetchBenchmarkingDataResponse, error)
}GkeInferenceQuickstartClient is the client API for GkeInferenceQuickstart service.
For semantics around ctx use and closing/ending streaming RPCs, please refer to https://pkg.go.dev/google.golang.org/grpc/?tab=doc#ClientConn.NewStream.
func NewGkeInferenceQuickstartClient
func NewGkeInferenceQuickstartClient(cc grpc.ClientConnInterface) GkeInferenceQuickstartClientGkeInferenceQuickstartServer
type GkeInferenceQuickstartServer interface {
// Fetches available models. Open-source models follow the Huggingface Hub
// `owner/model_name` format.
FetchModels(context.Context, *FetchModelsRequest) (*FetchModelsResponse, error)
// Fetches available model servers. Open-source model servers use simplified,
// lowercase names (e.g., `vllm`).
FetchModelServers(context.Context, *FetchModelServersRequest) (*FetchModelServersResponse, error)
// Fetches available model server versions. Open-source servers use their own
// versioning schemas (e.g., `vllm` uses semver like `v1.0.0`).
//
// Some model servers have different versioning schemas depending on the
// accelerator. For example, `vllm` uses semver on GPUs, but returns nightly
// build tags on TPUs. All available versions will be returned when different
// schemas are present.
FetchModelServerVersions(context.Context, *FetchModelServerVersionsRequest) (*FetchModelServerVersionsResponse, error)
// Fetches available profiles. A profile contains performance metrics and
// cost information for a specific model server setup. Profiles can be
// filtered by parameters. If no filters are provided, all profiles are
// returned.
//
// Profiles display a single value per performance metric based on the
// provided performance requirements. If no requirements are given, the
// metrics represent the inflection point. See [Run best practice inference
// with GKE Inference Quickstart
// recipes](https://cloud.google.com/kubernetes-engine/docs/how-to/machine-learning/inference/inference-quickstart#how)
// for details.
FetchProfiles(context.Context, *FetchProfilesRequest) (*FetchProfilesResponse, error)
// Generates an optimized deployment manifest for a given model and model
// server, based on the specified accelerator, performance targets, and
// configurations. See [Run best practice inference with GKE Inference
// Quickstart
// recipes](https://cloud.google.com/kubernetes-engine/docs/how-to/machine-learning/inference/inference-quickstart)
// for deployment details.
GenerateOptimizedManifest(context.Context, *GenerateOptimizedManifestRequest) (*GenerateOptimizedManifestResponse, error)
// Fetches all of the benchmarking data available for a profile. Benchmarking
// data returns all of the performance metrics available for a given model
// server setup on a given instance type.
FetchBenchmarkingData(context.Context, *FetchBenchmarkingDataRequest) (*FetchBenchmarkingDataResponse, error)
}GkeInferenceQuickstartServer is the server API for GkeInferenceQuickstart service. All implementations should embed UnimplementedGkeInferenceQuickstartServer for forward compatibility.
KubernetesManifest
type KubernetesManifest struct {
// Output only. Kubernetes resource kind.
Kind string `protobuf:"bytes,1,opt,name=kind,proto3" json:"kind,omitempty"`
// Output only. Kubernetes API version.
ApiVersion string `protobuf:"bytes,2,opt,name=api_version,json=apiVersion,proto3" json:"api_version,omitempty"`
// Output only. YAML content.
Content string `protobuf:"bytes,3,opt,name=content,proto3" json:"content,omitempty"`
// contains filtered or unexported fields
}A Kubernetes manifest.
func (*KubernetesManifest) Descriptor
func (*KubernetesManifest) Descriptor() ([]byte, []int)Deprecated: Use KubernetesManifest.ProtoReflect.Descriptor instead.
func (*KubernetesManifest) GetApiVersion
func (x *KubernetesManifest) GetApiVersion() stringfunc (*KubernetesManifest) GetContent
func (x *KubernetesManifest) GetContent() stringfunc (*KubernetesManifest) GetKind
func (x *KubernetesManifest) GetKind() stringfunc (*KubernetesManifest) ProtoMessage
func (*KubernetesManifest) ProtoMessage()func (*KubernetesManifest) ProtoReflect
func (x *KubernetesManifest) ProtoReflect() protoreflect.Messagefunc (*KubernetesManifest) Reset
func (x *KubernetesManifest) Reset()func (*KubernetesManifest) String
func (x *KubernetesManifest) String() stringMillisecondRange
type MillisecondRange struct {
// Output only. The minimum value of the range.
Min int32 `protobuf:"varint,1,opt,name=min,proto3" json:"min,omitempty"`
// Output only. The maximum value of the range.
Max int32 `protobuf:"varint,2,opt,name=max,proto3" json:"max,omitempty"`
// contains filtered or unexported fields
}Represents a range of latency values in milliseconds.
func (*MillisecondRange) Descriptor
func (*MillisecondRange) Descriptor() ([]byte, []int)Deprecated: Use MillisecondRange.ProtoReflect.Descriptor instead.
func (*MillisecondRange) GetMax
func (x *MillisecondRange) GetMax() int32func (*MillisecondRange) GetMin
func (x *MillisecondRange) GetMin() int32func (*MillisecondRange) ProtoMessage
func (*MillisecondRange) ProtoMessage()func (*MillisecondRange) ProtoReflect
func (x *MillisecondRange) ProtoReflect() protoreflect.Messagefunc (*MillisecondRange) Reset
func (x *MillisecondRange) Reset()func (*MillisecondRange) String
func (x *MillisecondRange) String() stringModelServerInfo
type ModelServerInfo struct {
// Required. The model. Open-source models follow the Huggingface Hub
// `owner/model_name` format. Use
// [GkeInferenceQuickstart.FetchModels][google.cloud.gkerecommender.v1.GkeInferenceQuickstart.FetchModels]
// to find available models.
Model string `protobuf:"bytes,1,opt,name=model,proto3" json:"model,omitempty"`
// Required. The model server. Open-source model servers use simplified,
// lowercase names (e.g., `vllm`). Use
// [GkeInferenceQuickstart.FetchModelServers][google.cloud.gkerecommender.v1.GkeInferenceQuickstart.FetchModelServers]
// to find available servers.
ModelServer string `protobuf:"bytes,2,opt,name=model_server,json=modelServer,proto3" json:"model_server,omitempty"`
// Optional. The model server version. Use
// [GkeInferenceQuickstart.FetchModelServerVersions][google.cloud.gkerecommender.v1.GkeInferenceQuickstart.FetchModelServerVersions]
// to find available versions. If not provided, the latest available version
// is used.
ModelServerVersion string `protobuf:"bytes,3,opt,name=model_server_version,json=modelServerVersion,proto3" json:"model_server_version,omitempty"`
// contains filtered or unexported fields
}Model server information. Valid model server info combinations can be found using [GkeInferenceQuickstart.FetchProfiles][google.cloud.gkerecommender.v1.GkeInferenceQuickstart.FetchProfiles].
func (*ModelServerInfo) Descriptor
func (*ModelServerInfo) Descriptor() ([]byte, []int)Deprecated: Use ModelServerInfo.ProtoReflect.Descriptor instead.
func (*ModelServerInfo) GetModel
func (x *ModelServerInfo) GetModel() stringfunc (*ModelServerInfo) GetModelServer
func (x *ModelServerInfo) GetModelServer() stringfunc (*ModelServerInfo) GetModelServerVersion
func (x *ModelServerInfo) GetModelServerVersion() stringfunc (*ModelServerInfo) ProtoMessage
func (*ModelServerInfo) ProtoMessage()func (*ModelServerInfo) ProtoReflect
func (x *ModelServerInfo) ProtoReflect() protoreflect.Messagefunc (*ModelServerInfo) Reset
func (x *ModelServerInfo) Reset()func (*ModelServerInfo) String
func (x *ModelServerInfo) String() stringPerformanceRange
type PerformanceRange struct {
// Output only. The range of throughput in output tokens per second. This is
// measured as total_output_tokens_generated_by_server /
// elapsed_time_in_seconds.
ThroughputOutputRange *TokensPerSecondRange `protobuf:"bytes,1,opt,name=throughput_output_range,json=throughputOutputRange,proto3" json:"throughput_output_range,omitempty"`
// Output only. The range of TTFT (Time To First Token) in milliseconds. TTFT
// is the time it takes to generate the first token for a request.
TtftRange *MillisecondRange `protobuf:"bytes,2,opt,name=ttft_range,json=ttftRange,proto3" json:"ttft_range,omitempty"`
// Output only. The range of NTPOT (Normalized Time Per Output Token) in
// milliseconds. NTPOT is the request latency normalized by the number of
// output tokens, measured as request_latency / total_output_tokens.
NtpotRange *MillisecondRange `protobuf:"bytes,3,opt,name=ntpot_range,json=ntpotRange,proto3" json:"ntpot_range,omitempty"`
// contains filtered or unexported fields
}Performance range for a model deployment.
func (*PerformanceRange) Descriptor
func (*PerformanceRange) Descriptor() ([]byte, []int)Deprecated: Use PerformanceRange.ProtoReflect.Descriptor instead.
func (*PerformanceRange) GetNtpotRange
func (x *PerformanceRange) GetNtpotRange() *MillisecondRangefunc (*PerformanceRange) GetThroughputOutputRange
func (x *PerformanceRange) GetThroughputOutputRange() *TokensPerSecondRangefunc (*PerformanceRange) GetTtftRange
func (x *PerformanceRange) GetTtftRange() *MillisecondRangefunc (*PerformanceRange) ProtoMessage
func (*PerformanceRange) ProtoMessage()func (*PerformanceRange) ProtoReflect
func (x *PerformanceRange) ProtoReflect() protoreflect.Messagefunc (*PerformanceRange) Reset
func (x *PerformanceRange) Reset()func (*PerformanceRange) String
func (x *PerformanceRange) String() stringPerformanceRequirements
type PerformanceRequirements struct {
// Optional. The target Normalized Time Per Output Token (NTPOT) in
// milliseconds. NTPOT is calculated as `request_latency /
// total_output_tokens`. If not provided, this target will not be enforced.
TargetNtpotMilliseconds *int32 `protobuf:"varint,1,opt,name=target_ntpot_milliseconds,json=targetNtpotMilliseconds,proto3,oneof" json:"target_ntpot_milliseconds,omitempty"`
// Optional. The target Time To First Token (TTFT) in milliseconds. TTFT is
// the time it takes to generate the first token for a request. If not
// provided, this target will not be enforced.
TargetTtftMilliseconds *int32 `protobuf:"varint,2,opt,name=target_ttft_milliseconds,json=targetTtftMilliseconds,proto3,oneof" json:"target_ttft_milliseconds,omitempty"`
// Optional. The target cost for running a profile's model server. If not
// provided, this requirement will not be enforced.
TargetCost *Cost `protobuf:"bytes,3,opt,name=target_cost,json=targetCost,proto3" json:"target_cost,omitempty"`
// contains filtered or unexported fields
}Performance requirements for a profile and/or model deployment.
func (*PerformanceRequirements) Descriptor
func (*PerformanceRequirements) Descriptor() ([]byte, []int)Deprecated: Use PerformanceRequirements.ProtoReflect.Descriptor instead.
func (*PerformanceRequirements) GetTargetCost
func (x *PerformanceRequirements) GetTargetCost() *Costfunc (*PerformanceRequirements) GetTargetNtpotMilliseconds
func (x *PerformanceRequirements) GetTargetNtpotMilliseconds() int32func (*PerformanceRequirements) GetTargetTtftMilliseconds
func (x *PerformanceRequirements) GetTargetTtftMilliseconds() int32func (*PerformanceRequirements) ProtoMessage
func (*PerformanceRequirements) ProtoMessage()func (*PerformanceRequirements) ProtoReflect
func (x *PerformanceRequirements) ProtoReflect() protoreflect.Messagefunc (*PerformanceRequirements) Reset
func (x *PerformanceRequirements) Reset()func (*PerformanceRequirements) String
func (x *PerformanceRequirements) String() stringPerformanceStats
type PerformanceStats struct {
// Output only. The number of queries per second.
// Note: This metric can vary widely based on context length and may not be a
// reliable measure of LLM throughput.
QueriesPerSecond float32 `protobuf:"fixed32,1,opt,name=queries_per_second,json=queriesPerSecond,proto3" json:"queries_per_second,omitempty"`
// Output only. The number of output tokens per second. This is the throughput
// measured as total_output_tokens_generated_by_server /
// elapsed_time_in_seconds.
OutputTokensPerSecond int32 `protobuf:"varint,2,opt,name=output_tokens_per_second,json=outputTokensPerSecond,proto3" json:"output_tokens_per_second,omitempty"`
// Output only. The Normalized Time Per Output Token (NTPOT) in milliseconds.
// This is the request latency normalized by the number of output tokens,
// measured as request_latency / total_output_tokens.
NtpotMilliseconds int32 `protobuf:"varint,3,opt,name=ntpot_milliseconds,json=ntpotMilliseconds,proto3" json:"ntpot_milliseconds,omitempty"`
// Output only. The Time To First Token (TTFT) in milliseconds. This is the
// time it takes to generate the first token for a request.
TtftMilliseconds int32 `protobuf:"varint,4,opt,name=ttft_milliseconds,json=ttftMilliseconds,proto3" json:"ttft_milliseconds,omitempty"`
// Output only. The cost of running the model deployment.
Cost []*Cost `protobuf:"bytes,5,rep,name=cost,proto3" json:"cost,omitempty"`
// contains filtered or unexported fields
}Performance statistics for a model deployment.
func (*PerformanceStats) Descriptor
func (*PerformanceStats) Descriptor() ([]byte, []int)Deprecated: Use PerformanceStats.ProtoReflect.Descriptor instead.
func (*PerformanceStats) GetCost
func (x *PerformanceStats) GetCost() []*Costfunc (*PerformanceStats) GetNtpotMilliseconds
func (x *PerformanceStats) GetNtpotMilliseconds() int32func (*PerformanceStats) GetOutputTokensPerSecond
func (x *PerformanceStats) GetOutputTokensPerSecond() int32func (*PerformanceStats) GetQueriesPerSecond
func (x *PerformanceStats) GetQueriesPerSecond() float32func (*PerformanceStats) GetTtftMilliseconds
func (x *PerformanceStats) GetTtftMilliseconds() int32func (*PerformanceStats) ProtoMessage
func (*PerformanceStats) ProtoMessage()func (*PerformanceStats) ProtoReflect
func (x *PerformanceStats) ProtoReflect() protoreflect.Messagefunc (*PerformanceStats) Reset
func (x *PerformanceStats) Reset()func (*PerformanceStats) String
func (x *PerformanceStats) String() stringProfile
type Profile struct {
// Output only. The model server configuration. Use
// [GkeInferenceQuickstart.FetchProfiles][google.cloud.gkerecommender.v1.GkeInferenceQuickstart.FetchProfiles]
// to find valid configurations.
ModelServerInfo *ModelServerInfo `protobuf:"bytes,1,opt,name=model_server_info,json=modelServerInfo,proto3" json:"model_server_info,omitempty"`
// Output only. The accelerator type. Expected format: `nvidia-h100-80gb`.
AcceleratorType string `protobuf:"bytes,2,opt,name=accelerator_type,json=acceleratorType,proto3" json:"accelerator_type,omitempty"`
// Output only. The TPU topology (if applicable).
TpuTopology string `protobuf:"bytes,3,opt,name=tpu_topology,json=tpuTopology,proto3" json:"tpu_topology,omitempty"`
// Output only. The instance type. Expected format: `a2-highgpu-1g`.
InstanceType string `protobuf:"bytes,4,opt,name=instance_type,json=instanceType,proto3" json:"instance_type,omitempty"`
// Output only. The resources used by the model deployment.
ResourcesUsed *ResourcesUsed `protobuf:"bytes,5,opt,name=resources_used,json=resourcesUsed,proto3" json:"resources_used,omitempty"`
// Output only. The performance statistics for this profile.
PerformanceStats []*PerformanceStats `protobuf:"bytes,6,rep,name=performance_stats,json=performanceStats,proto3" json:"performance_stats,omitempty"`
// contains filtered or unexported fields
}A profile containing information about a model deployment.
func (*Profile) Descriptor
Deprecated: Use Profile.ProtoReflect.Descriptor instead.
func (*Profile) GetAcceleratorType
func (*Profile) GetInstanceType
func (*Profile) GetModelServerInfo
func (x *Profile) GetModelServerInfo() *ModelServerInfofunc (*Profile) GetPerformanceStats
func (x *Profile) GetPerformanceStats() []*PerformanceStatsfunc (*Profile) GetResourcesUsed
func (x *Profile) GetResourcesUsed() *ResourcesUsedfunc (*Profile) GetTpuTopology
func (*Profile) ProtoMessage
func (*Profile) ProtoMessage()func (*Profile) ProtoReflect
func (x *Profile) ProtoReflect() protoreflect.Messagefunc (*Profile) Reset
func (x *Profile) Reset()func (*Profile) String
ResourcesUsed
type ResourcesUsed struct {
// Output only. The number of accelerators (e.g., GPUs or TPUs) used by the
// model deployment on the Kubernetes node.
AcceleratorCount int32 `protobuf:"varint,1,opt,name=accelerator_count,json=acceleratorCount,proto3" json:"accelerator_count,omitempty"`
// contains filtered or unexported fields
}Resources used by a model deployment.
func (*ResourcesUsed) Descriptor
func (*ResourcesUsed) Descriptor() ([]byte, []int)Deprecated: Use ResourcesUsed.ProtoReflect.Descriptor instead.
func (*ResourcesUsed) GetAcceleratorCount
func (x *ResourcesUsed) GetAcceleratorCount() int32func (*ResourcesUsed) ProtoMessage
func (*ResourcesUsed) ProtoMessage()func (*ResourcesUsed) ProtoReflect
func (x *ResourcesUsed) ProtoReflect() protoreflect.Messagefunc (*ResourcesUsed) Reset
func (x *ResourcesUsed) Reset()func (*ResourcesUsed) String
func (x *ResourcesUsed) String() stringStorageConfig
type StorageConfig struct {
// Optional. The Google Cloud Storage bucket URI to load the model from. This
// URI must point to the directory containing the model's config file
// (`config.json`) and model weights. A tuned GCSFuse setup can improve
// LLM Pod startup time by more than 7x. Expected format:
// `gs://bucket-name/path-to-model`.
ModelBucketUri string `protobuf:"bytes,1,opt,name=model_bucket_uri,json=modelBucketUri,proto3" json:"model_bucket_uri,omitempty"`
// Optional. The Google Cloud Storage bucket URI for the XLA compilation
// cache (if applicable).
XlaCacheBucketUri string `protobuf:"bytes,2,opt,name=xla_cache_bucket_uri,json=xlaCacheBucketUri,proto3" json:"xla_cache_bucket_uri,omitempty"`
// contains filtered or unexported fields
}Storage configuration for a model deployment.
func (*StorageConfig) Descriptor
func (*StorageConfig) Descriptor() ([]byte, []int)Deprecated: Use StorageConfig.ProtoReflect.Descriptor instead.
func (*StorageConfig) GetModelBucketUri
func (x *StorageConfig) GetModelBucketUri() stringfunc (*StorageConfig) GetXlaCacheBucketUri
func (x *StorageConfig) GetXlaCacheBucketUri() stringfunc (*StorageConfig) ProtoMessage
func (*StorageConfig) ProtoMessage()func (*StorageConfig) ProtoReflect
func (x *StorageConfig) ProtoReflect() protoreflect.Messagefunc (*StorageConfig) Reset
func (x *StorageConfig) Reset()func (*StorageConfig) String
func (x *StorageConfig) String() stringTokensPerSecondRange
type TokensPerSecondRange struct {
// Output only. The minimum value of the range.
Min int32 `protobuf:"varint,1,opt,name=min,proto3" json:"min,omitempty"`
// Output only. The maximum value of the range.
Max int32 `protobuf:"varint,2,opt,name=max,proto3" json:"max,omitempty"`
// contains filtered or unexported fields
}Represents a range of throughput values in tokens per second.
func (*TokensPerSecondRange) Descriptor
func (*TokensPerSecondRange) Descriptor() ([]byte, []int)Deprecated: Use TokensPerSecondRange.ProtoReflect.Descriptor instead.
func (*TokensPerSecondRange) GetMax
func (x *TokensPerSecondRange) GetMax() int32func (*TokensPerSecondRange) GetMin
func (x *TokensPerSecondRange) GetMin() int32func (*TokensPerSecondRange) ProtoMessage
func (*TokensPerSecondRange) ProtoMessage()func (*TokensPerSecondRange) ProtoReflect
func (x *TokensPerSecondRange) ProtoReflect() protoreflect.Messagefunc (*TokensPerSecondRange) Reset
func (x *TokensPerSecondRange) Reset()func (*TokensPerSecondRange) String
func (x *TokensPerSecondRange) String() stringUnimplementedGkeInferenceQuickstartServer
type UnimplementedGkeInferenceQuickstartServer struct {
}UnimplementedGkeInferenceQuickstartServer should be embedded to have forward compatible implementations.
func (UnimplementedGkeInferenceQuickstartServer) FetchBenchmarkingData
func (UnimplementedGkeInferenceQuickstartServer) FetchBenchmarkingData(context.Context, *FetchBenchmarkingDataRequest) (*FetchBenchmarkingDataResponse, error)func (UnimplementedGkeInferenceQuickstartServer) FetchModelServerVersions
func (UnimplementedGkeInferenceQuickstartServer) FetchModelServerVersions(context.Context, *FetchModelServerVersionsRequest) (*FetchModelServerVersionsResponse, error)func (UnimplementedGkeInferenceQuickstartServer) FetchModelServers
func (UnimplementedGkeInferenceQuickstartServer) FetchModelServers(context.Context, *FetchModelServersRequest) (*FetchModelServersResponse, error)func (UnimplementedGkeInferenceQuickstartServer) FetchModels
func (UnimplementedGkeInferenceQuickstartServer) FetchModels(context.Context, *FetchModelsRequest) (*FetchModelsResponse, error)func (UnimplementedGkeInferenceQuickstartServer) FetchProfiles
func (UnimplementedGkeInferenceQuickstartServer) FetchProfiles(context.Context, *FetchProfilesRequest) (*FetchProfilesResponse, error)func (UnimplementedGkeInferenceQuickstartServer) GenerateOptimizedManifest
func (UnimplementedGkeInferenceQuickstartServer) GenerateOptimizedManifest(context.Context, *GenerateOptimizedManifestRequest) (*GenerateOptimizedManifestResponse, error)UnsafeGkeInferenceQuickstartServer
type UnsafeGkeInferenceQuickstartServer interface {
// contains filtered or unexported methods
}UnsafeGkeInferenceQuickstartServer may be embedded to opt out of forward compatibility for this service. Use of this interface is not recommended, as added methods to GkeInferenceQuickstartServer will result in compilation errors.