Skip to content

Commit 18280f7

Browse files
petermattisclaude
andcommitted
roachprod/vm: fix JSON serialization of VM.Errors field
The VM.Errors field was typed as []error, but Go's encoding/json cannot unmarshal into an error interface type because it doesn't know what concrete type to create. This caused roachprod list to fail when loading cached cluster info containing VMs with errors: json: cannot unmarshal object into Go struct field VM.vms.errors of type error This change introduces a VMError wrapper type that: - Implements the error interface - Implements json.Marshaler to serialize as a JSON string (error message) - Implements json.Unmarshaler to handle both the new format (strings) and legacy format (empty objects from broken serialization) - Implements Unwrap() so errors.Is/As still work All cloud providers (AWS, Azure, GCE, IBM) are updated to use the new VMError type via NewVMError(). The BadInstanceErrors() function now returns map[string]vm.List (grouped by error message string) instead of map[error]vm.List since serialization loses pointer identity. Release note: None Epic: None Co-authored-by: Claude <[email protected]>
1 parent 81e306b commit 18280f7

File tree

8 files changed

+117
-49
lines changed

8 files changed

+117
-49
lines changed

pkg/cmd/roachprod/cli/BUILD.bazel

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,6 @@ go_library(
3030
"//pkg/roachprod/logger",
3131
"//pkg/roachprod/roachprodutil",
3232
"//pkg/roachprod/ssh",
33-
"//pkg/roachprod/ui",
3433
"//pkg/roachprod/vm",
3534
"//pkg/roachprod/vm/gce",
3635
"//pkg/util/envutil",

pkg/cmd/roachprod/cli/commands.go

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,6 @@ import (
2626
"github.com/cockroachdb/cockroach/pkg/roachprod/config"
2727
"github.com/cockroachdb/cockroach/pkg/roachprod/install"
2828
"github.com/cockroachdb/cockroach/pkg/roachprod/roachprodutil"
29-
"github.com/cockroachdb/cockroach/pkg/roachprod/ui"
3029
"github.com/cockroachdb/cockroach/pkg/roachprod/vm"
3130
"github.com/cockroachdb/cockroach/pkg/roachprod/vm/gce"
3231
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
@@ -547,15 +546,15 @@ hosts file.
547546
if listDetails {
548547
collated := filteredCloud.BadInstanceErrors()
549548

550-
// Sort by Error() value for stable output
551-
var errors ui.ErrorsByError
552-
for err := range collated {
553-
errors = append(errors, err)
549+
// Sort by error message for stable output
550+
var errMsgs []string
551+
for msg := range collated {
552+
errMsgs = append(errMsgs, msg)
554553
}
555-
sort.Sort(errors)
554+
sort.Strings(errMsgs)
556555

557-
for _, e := range errors {
558-
fmt.Printf("%s: %s\n", e, collated[e].Names())
556+
for _, msg := range errMsgs {
557+
fmt.Printf("%s: %s\n", msg, collated[msg].Names())
559558
}
560559
}
561560
}

pkg/roachprod/cloud/cluster_cloud.go

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -60,14 +60,17 @@ type Cloud struct {
6060
BadInstances vm.List `json:"bad_instances"`
6161
}
6262

63-
// BadInstanceErrors returns all bad VM instances, grouped by error.
64-
func (c *Cloud) BadInstanceErrors() map[error]vm.List {
65-
ret := map[error]vm.List{}
63+
// BadInstanceErrors returns all bad VM instances, grouped by error message.
64+
// Note: errors are grouped by their string representation to handle cases
65+
// where errors have been serialized/deserialized and lost pointer identity.
66+
func (c *Cloud) BadInstanceErrors() map[string]vm.List {
67+
ret := map[string]vm.List{}
6668

6769
// Expand instances and errors
68-
for _, vm := range c.BadInstances {
69-
for _, err := range vm.Errors {
70-
ret[err] = append(ret[err], vm)
70+
for _, v := range c.BadInstances {
71+
for _, err := range v.Errors {
72+
msg := err.Error()
73+
ret[msg] = append(ret[msg], v)
7174
}
7275
}
7376

@@ -263,11 +266,11 @@ func ListCloud(l *logger.Logger, options vm.ListOptions) (*Cloud, error) {
263266
// Parse cluster/user from VM name, but only for non-local VMs
264267
userName, err := v.UserName()
265268
if err != nil {
266-
v.Errors = append(v.Errors, vm.ErrInvalidUserName)
269+
v.Errors = append(v.Errors, vm.NewVMError(vm.ErrInvalidUserName))
267270
}
268271
clusterName, err := v.ClusterName()
269272
if err != nil {
270-
v.Errors = append(v.Errors, vm.ErrInvalidClusterName)
273+
v.Errors = append(v.Errors, vm.NewVMError(vm.ErrInvalidClusterName))
271274
}
272275

273276
// Anything with an error gets tossed into the BadInstances slice, and we'll correct

pkg/roachprod/vm/aws/aws.go

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1141,20 +1141,20 @@ func (in *DescribeInstancesOutputInstance) toVM(
11411141
// Convert the tag map into a more useful representation
11421142
tagMap := in.Tags.MakeMap()
11431143

1144-
var errs []error
1144+
var errs []vm.VMError
11451145
createdAt, err := time.Parse(time.RFC3339, in.LaunchTime)
11461146
if err != nil {
1147-
errs = append(errs, vm.ErrNoExpiration)
1147+
errs = append(errs, vm.NewVMError(vm.ErrNoExpiration))
11481148
}
11491149

11501150
var lifetime time.Duration
11511151
if lifeText, ok := tagMap[vm.TagLifetime]; ok {
11521152
lifetime, err = time.ParseDuration(lifeText)
11531153
if err != nil {
1154-
errs = append(errs, err)
1154+
errs = append(errs, vm.NewVMError(err))
11551155
}
11561156
} else {
1157-
errs = append(errs, vm.ErrNoExpiration)
1157+
errs = append(errs, vm.NewVMError(vm.ErrNoExpiration))
11581158
}
11591159

11601160
var nonBootableVolumes []vm.Volume
@@ -1165,11 +1165,11 @@ func (in *DescribeInstancesOutputInstance) toVM(
11651165
if vol, ok := volumes[bdm.Disk.VolumeID]; ok {
11661166
nonBootableVolumes = append(nonBootableVolumes, vol)
11671167
} else {
1168-
errs = append(errs, errors.Newf(
1168+
errs = append(errs, vm.NewVMError(errors.Newf(
11691169
"Attempted to add volume %s however it is not in the attached volumes for instance %s",
11701170
bdm.Disk.VolumeID,
11711171
in.InstanceID,
1172-
))
1172+
)))
11731173
}
11741174
}
11751175
}

pkg/roachprod/vm/azure/azure.go

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -604,19 +604,19 @@ func (p *Provider) computeVirtualMachineToVM(cvm compute.VirtualMachine) (*vm.VM
604604
}
605605

606606
if createdPtr := cvm.Tags[vm.TagCreated]; createdPtr == nil {
607-
m.Errors = append(m.Errors, vm.ErrNoExpiration)
607+
m.Errors = append(m.Errors, vm.NewVMError(vm.ErrNoExpiration))
608608
} else if parsed, err := time.Parse(time.RFC3339, *createdPtr); err == nil {
609609
m.CreatedAt = parsed
610610
} else {
611-
m.Errors = append(m.Errors, vm.ErrNoExpiration)
611+
m.Errors = append(m.Errors, vm.NewVMError(vm.ErrNoExpiration))
612612
}
613613

614614
if lifetimePtr := cvm.Tags[vm.TagLifetime]; lifetimePtr == nil {
615-
m.Errors = append(m.Errors, vm.ErrNoExpiration)
615+
m.Errors = append(m.Errors, vm.NewVMError(vm.ErrNoExpiration))
616616
} else if parsed, err := time.ParseDuration(*lifetimePtr); err == nil {
617617
m.Lifetime = parsed
618618
} else {
619-
m.Errors = append(m.Errors, vm.ErrNoExpiration)
619+
m.Errors = append(m.Errors, vm.NewVMError(vm.ErrNoExpiration))
620620
}
621621

622622
// The network info needs a separate request.
@@ -625,7 +625,7 @@ func (p *Provider) computeVirtualMachineToVM(cvm compute.VirtualMachine) (*vm.VM
625625
return nil, err
626626
}
627627
if err := p.fillNetworkDetails(context.Background(), m, nicID); errors.Is(err, vm.ErrBadNetwork) {
628-
m.Errors = append(m.Errors, err)
628+
m.Errors = append(m.Errors, vm.NewVMError(err))
629629
} else if err != nil {
630630
return nil, err
631631
}

pkg/roachprod/vm/gce/gcloud.go

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -209,27 +209,27 @@ type jsonVM struct {
209209

210210
// Convert the JSON VM data into our common VM type.
211211
func (jsonVM *jsonVM) toVM(project string, dnsDomain string) (ret *vm.VM) {
212-
var vmErrors []error
212+
var vmErrors []vm.VMError
213213
var err error
214214

215215
// Check "lifetime" label.
216216
var lifetime time.Duration
217217
if lifetimeStr, ok := jsonVM.Labels[vm.TagLifetime]; ok {
218218
if lifetime, err = time.ParseDuration(lifetimeStr); err != nil {
219-
vmErrors = append(vmErrors, vm.ErrNoExpiration)
219+
vmErrors = append(vmErrors, vm.NewVMError(vm.ErrNoExpiration))
220220
}
221221
} else {
222-
vmErrors = append(vmErrors, vm.ErrNoExpiration)
222+
vmErrors = append(vmErrors, vm.NewVMError(vm.ErrNoExpiration))
223223
}
224224

225225
// Extract network information
226226
var publicIP, privateIP, vpc string
227227
if len(jsonVM.NetworkInterfaces) == 0 {
228-
vmErrors = append(vmErrors, vm.ErrBadNetwork)
228+
vmErrors = append(vmErrors, vm.NewVMError(vm.ErrBadNetwork))
229229
} else {
230230
privateIP = jsonVM.NetworkInterfaces[0].NetworkIP
231231
if len(jsonVM.NetworkInterfaces[0].AccessConfigs) == 0 {
232-
vmErrors = append(vmErrors, vm.ErrBadNetwork)
232+
vmErrors = append(vmErrors, vm.NewVMError(vm.ErrBadNetwork))
233233
} else {
234234
_ = jsonVM.NetworkInterfaces[0].AccessConfigs[0].Name // silence unused warning
235235
publicIP = jsonVM.NetworkInterfaces[0].AccessConfigs[0].NatIP
@@ -238,7 +238,7 @@ func (jsonVM *jsonVM) toVM(project string, dnsDomain string) (ret *vm.VM) {
238238
}
239239
if jsonVM.Scheduling.OnHostMaintenance == "" {
240240
// N.B. 'onHostMaintenance' is always non-empty, hence its absense implies a parsing error
241-
vmErrors = append(vmErrors, vm.ErrBadScheduling)
241+
vmErrors = append(vmErrors, vm.NewVMError(vm.ErrBadScheduling))
242242
}
243243

244244
machineType := lastComponent(jsonVM.MachineType)
@@ -261,7 +261,7 @@ func (jsonVM *jsonVM) toVM(project string, dnsDomain string) (ret *vm.VM) {
261261

262262
vol, volType, err := jsonVMDisk.toVolume()
263263
if err != nil {
264-
vmErrors = append(vmErrors, err)
264+
vmErrors = append(vmErrors, vm.NewVMError(err))
265265
continue
266266
}
267267

@@ -3115,7 +3115,7 @@ func (j *managedInstanceGroupInstance) toVM(
31153115
) *vm.VM {
31163116

31173117
var err error
3118-
var vmErrors []error
3118+
var vmErrors []vm.VMError
31193119

31203120
remoteUser := config.SharedUser
31213121
if !config.UseSharedUser {
@@ -3130,10 +3130,10 @@ func (j *managedInstanceGroupInstance) toVM(
31303130
var lifetime time.Duration
31313131
if lifetimeStr, ok := instanceTemplate.Properties.Labels[vm.TagLifetime]; ok {
31323132
if lifetime, err = time.ParseDuration(lifetimeStr); err != nil {
3133-
vmErrors = append(vmErrors, vm.ErrNoExpiration)
3133+
vmErrors = append(vmErrors, vm.NewVMError(vm.ErrNoExpiration))
31343134
}
31353135
} else {
3136-
vmErrors = append(vmErrors, vm.ErrNoExpiration)
3136+
vmErrors = append(vmErrors, vm.NewVMError(vm.ErrNoExpiration))
31373137
}
31383138

31393139
var arch vm.CPUArch
@@ -3148,7 +3148,7 @@ func (j *managedInstanceGroupInstance) toVM(
31483148
for _, disk := range instanceTemplate.Properties.Disks {
31493149
vol, volType, err := disk.toVolume(j.Name, zone)
31503150
if err != nil {
3151-
vmErrors = append(vmErrors, err)
3151+
vmErrors = append(vmErrors, vm.NewVMError(err))
31523152
continue
31533153
}
31543154

pkg/roachprod/vm/ibm/ibm_extended_types.go

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -548,45 +548,45 @@ func (i *instance) toVM() vm.VM {
548548

549549
if i == nil || i.instance == nil {
550550
return vm.VM{
551-
Errors: []error{errors.New("instance is nil")},
551+
Errors: []vm.VMError{vm.NewVMError(errors.New("instance is nil"))},
552552
}
553553
}
554554

555555
if !i.initialized {
556556
err := i.load()
557557
if err != nil {
558558
return vm.VM{
559-
Errors: []error{errors.Wrap(err, "failed to load instance")},
559+
Errors: []vm.VMError{vm.NewVMError(errors.Wrap(err, "failed to load instance"))},
560560
}
561561
}
562562
}
563563

564-
var vmErrors []error
564+
var vmErrors []vm.VMError
565565

566566
vpcID := ""
567567
region, err := i.provider.zoneToRegion(*i.instance.Zone.Name)
568568
if err != nil {
569-
vmErrors = append(vmErrors, errors.Wrap(err, "unable to get region"))
569+
vmErrors = append(vmErrors, vm.NewVMError(errors.Wrap(err, "unable to get region")))
570570
} else {
571571
vpcID = i.provider.config.regions[region].vpcID
572572
}
573573

574574
// Check if the instance is in a valid state.
575575
if core.StringNilMapper(i.instance.Status) == "failed" {
576-
vmErrors = append(vmErrors, errors.New("instance is in failed state"))
576+
vmErrors = append(vmErrors, vm.NewVMError(errors.New("instance is in failed state")))
577577
}
578578

579579
// Gather tags
580580
tags, err := i.getTagsAsMap()
581581
if err != nil {
582-
vmErrors = append(vmErrors, errors.Wrap(err, "unable to get tags"))
582+
vmErrors = append(vmErrors, vm.NewVMError(errors.Wrap(err, "unable to get tags")))
583583
}
584584

585585
var lifetime time.Duration
586586
if lifeText, ok := tags[vm.TagLifetime]; ok {
587587
lifetime, err = time.ParseDuration(lifeText)
588588
if err != nil {
589-
vmErrors = append(vmErrors, errors.Wrap(err, "unable to compute lifetime"))
589+
vmErrors = append(vmErrors, vm.NewVMError(errors.Wrap(err, "unable to compute lifetime")))
590590
}
591591
} else {
592592
// Missing lifetime tag, use the default lifetime.
@@ -598,7 +598,7 @@ func (i *instance) toVM() vm.VM {
598598
privateIP := i.getPrivateIPAddress()
599599
publicIP, err := i.getPublicIPAddress()
600600
if err != nil {
601-
vmErrors = append(vmErrors, errors.Wrap(err, "unable to get public IP"))
601+
vmErrors = append(vmErrors, vm.NewVMError(errors.Wrap(err, "unable to get public IP")))
602602
}
603603

604604
nonBootAttachedVolumes := []vm.Volume{}

pkg/roachprod/vm/vm.go

Lines changed: 69 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
package vm
77

88
import (
9+
"encoding/json"
910
"fmt"
1011
"regexp"
1112
"sort"
@@ -105,7 +106,7 @@ type VM struct {
105106
CreatedAt time.Time `json:"created_at"`
106107
// If non-empty, indicates that some or all of the data in the VM instance
107108
// is not present or otherwise invalid.
108-
Errors []error `json:"errors"`
109+
Errors []VMError `json:"errors"`
109110
Lifetime time.Duration `json:"lifetime"`
110111
Preemptible bool `json:"preemptible"`
111112
Labels map[string]string `json:"labels"`
@@ -171,7 +172,7 @@ func Name(cluster string, idx int) string {
171172
return fmt.Sprintf("%s-%0.4d", cluster, idx)
172173
}
173174

174-
// Error values for VM.Error
175+
// Error values for VM.Errors
175176
var (
176177
ErrBadNetwork = errors.New("could not determine network information")
177178
ErrBadScheduling = errors.New("could not determine scheduling information")
@@ -180,6 +181,72 @@ var (
180181
ErrNoExpiration = errors.New("could not determine expiration")
181182
)
182183

184+
// VMError wraps an error and implements json.Marshaler/Unmarshaler so that
185+
// errors can be properly serialized to and from JSON. This is necessary because
186+
// Go's encoding/json cannot unmarshal into an error interface type.
187+
type VMError struct {
188+
err error
189+
}
190+
191+
// NewVMError creates a new VMError from an error.
192+
func NewVMError(err error) VMError {
193+
return VMError{err: err}
194+
}
195+
196+
// Error implements the error interface.
197+
func (e VMError) Error() string {
198+
if e.err == nil {
199+
return ""
200+
}
201+
return e.err.Error()
202+
}
203+
204+
// Unwrap returns the underlying error for use with errors.Is/As.
205+
func (e VMError) Unwrap() error {
206+
return e.err
207+
}
208+
209+
// MarshalJSON implements json.Marshaler.
210+
func (e VMError) MarshalJSON() ([]byte, error) {
211+
if e.err == nil {
212+
return []byte(`""`), nil
213+
}
214+
return json.Marshal(e.err.Error())
215+
}
216+
217+
// UnmarshalJSON implements json.Unmarshaler.
218+
// It handles both the new format (JSON string) and the legacy format (JSON object)
219+
// that was produced when []error was serialized directly. The legacy format
220+
// typically produces empty objects `{}` since error types don't have exported fields.
221+
func (e *VMError) UnmarshalJSON(data []byte) error {
222+
// Try to unmarshal as a string first (new format)
223+
var msg string
224+
if err := json.Unmarshal(data, &msg); err == nil {
225+
if msg == "" {
226+
e.err = nil
227+
} else {
228+
// Use Newf with %s format to satisfy the fmtsafe linter which requires
229+
// constant format strings.
230+
e.err = errors.Newf("%s", msg)
231+
}
232+
return nil
233+
}
234+
235+
// If that fails, try to unmarshal as an object (legacy format).
236+
// The legacy format serialized error interfaces as objects, but since
237+
// errors.errorString has no exported fields, they serialize as empty `{}`.
238+
// We treat any object as a nil/unknown error to maintain backwards compatibility.
239+
var obj map[string]any
240+
if err := json.Unmarshal(data, &obj); err == nil {
241+
// Legacy format - treat as nil error (the error message was lost in serialization)
242+
e.err = nil
243+
return nil
244+
}
245+
246+
// If neither worked, return an error
247+
return errors.Newf("cannot unmarshal VMError from: %s", string(data))
248+
}
249+
183250
var regionRE = regexp.MustCompile(`(.*[^-])-?[0-9a-z]$`)
184251

185252
// IsLocal returns true if the VM represents the local host.

0 commit comments

Comments
 (0)