# support-bundle-controller.yaml
# Troubleshoot SupportBundle spec named "controller".
#
# Sections:
#   collectors     - in-cluster data (cluster info, selected namespaces, node metrics)
#   hostCollectors - data gathered on the host: system info, k0s PKI certificates
#                    under /var/lib/k0s/pki, disk usage, and ad-hoc `run` commands
#   hostAnalyzers  - checks evaluated against host collector output
#   analyzers      - regex checks against the text captured by `run` host collectors
apiVersion: troubleshoot.sh/v1beta2
kind: SupportBundle
metadata:
  name: controller
spec:
  uri: # TODO: not set yet (a bare key parses as null)
  collectors:
    - clusterInfo: {}
    - clusterResources:
        namespaces:
          - kube-system
          - k0s-autopilot
          - kube-node-lease
          - default  # so we get kubernetes svc endpoints
    - nodeMetrics: {}
  hostCollectors:
    # System Info Collectors
    - cpu: {}
    - hostOS: {}
    - hostServices: {}
    - ipv4Interfaces: {}
    - memory: {}
    - time: {}
    # Certificate Info for ETCD and K8s API
    - certificate:
        collectorName: k8s-api-keypair
        certificatePath: /var/lib/k0s/pki/server.crt
        keyPath: /var/lib/k0s/pki/server.key
    - certificate:
        collectorName: etcd-keypair
        certificatePath: /var/lib/k0s/pki/etcd/server.crt
        keyPath: /var/lib/k0s/pki/etcd/server.key
    # Disk usage for commonly used directories.
    # Each collectorName here is referenced by a diskUsage host analyzer below.
    - diskUsage:
        collectorName: root
        path: /
    - diskUsage:
        collectorName: tmp
        path: /tmp
    # Run collectors for system information
    - run:
        collectorName: k8s-api-healthz-6443
        command: "curl"
        # -k: skip server certificate verification; authenticate with the admin client cert
        args:
          - "-k"
          - "--cert"
          - "/var/lib/k0s/pki/admin.crt"
          - "--key"
          - "/var/lib/k0s/pki/admin.key"
          - "https://localhost:6443/healthz?verbose"
    - run:
        collectorName: curl-etcd-health-2379
        command: "curl"
        # -k: skip server certificate verification; -i: include response headers in the output
        args:
          - "-ki"
          - "https://localhost:2379/health"
          - "--cert"
          - "/var/lib/k0s/pki/apiserver-etcd-client.crt"
          - "--key"
          - "/var/lib/k0s/pki/apiserver-etcd-client.key"
    - run:
        collectorName: etcd-members
        command: "k0s"
        args: ["etcd", "member-list"]
    - run:
        collectorName: "free"
        command: "free"
        args: ["-m"]
    - run:
        collectorName: "top"
        command: "top"
        args: ["-b", "-n", "1"]
    - run:
        collectorName: "uptime"
        command: "uptime"
        args: []
    - run:
        collectorName: "uname"
        command: "uname"
        args: ["-a"]
    - run:
        collectorName: "df"
        command: "df"
        args: ["-h"]
    - run:
        collectorName: "iostat"
        command: "iostat"
        args: ["-x"]
    # Systemctl service statuses
    - run:
        collectorName: "systemctl-firewalld-status"
        command: "systemctl"
        args: ["status", "firewalld"]
    - run:
        collectorName: "systemctl-ufw-status"
        command: "systemctl"
        args: ["status", "ufw"]
    - run:
        collectorName: "systemctl-k0s-status"
        command: "systemctl"
        args: ["status", "k0s*"]
    # systemd Service Configurations
    - run:
        collectorName: "systemctl-cat-journald"
        command: "systemctl"
        args: ["cat", "systemd-journald"]
    - run:
        collectorName: "systemctl-cat-k0s"
        command: "systemctl"
        args: ["cat", "k0s*"]
    # TODO Add same checks for rc-service
    # Logs for k0s
    - run:
        collectorName: "journalctl-k0s"
        command: "journalctl"
        args: ["-u", "k0s*", "--no-pager", "-S", "7 days ago"]
    - run:
        collectorName: "journalctl-dmesg"
        command: "journalctl"
        args: ["--dmesg", "--no-pager", "-S", "7 days ago"]
    # k0s status
    - run:
        collectorName: k0s-status
        command: "k0s"
        args: ["status", "-o", "yaml"]
    # Gather hostname info to help troubleshoot scenarios where the hostnames mismatch
    - run:
        collectorName: "hostnames"
        command: "sh"
        args:
          - -c
          - |
            echo "hostname = $(hostname)"
            echo "/proc/sys/kernel/hostname = $(cat /proc/sys/kernel/hostname)"
            echo "uname -n = $(uname -n)"
    # System Info Collectors
    - run:
        collectorName: "vmstat"
        command: "vmstat"
        args: ["-w"]
    - run:
        collectorName: "ps-high-load"
        command: "sh"
        # Counts processes whose state column starts with R or D, grouped by user and command.
        # The grep pattern is quoted so the shell cannot glob-expand `^[RD]`.
        args: ["-c", "ps -eo s,user,cmd | grep '^[RD]' | sort | uniq -c | sort -nbr | head -20"]
    - filesystemPerformance:
        collectorName: filesystem-latency-two-minute-benchmark
        timeout: 2m
        directory: /var/lib/k0s/etcd
        fileSize: 22Mi
        operationSizeBytes: 2300
        datasync: true
        enableBackgroundIOPS: true
        backgroundIOPSWarmupSeconds: 10
        backgroundWriteIOPS: 300
        backgroundWriteIOPSJobs: 6
        backgroundReadIOPS: 50
        backgroundReadIOPSJobs: 1
        # Excluded here; the matching host analyzer below is excluded as well.
        exclude: true
    - run:
        collectorName: "localhost-ips"
        command: "sh"
        args: ["-c", "host localhost"]
  hostAnalyzers:
    - time:
        checkName: "ntp-status"
        outcomes:
          - fail:
              when: "ntp == unsynchronized+inactive"
              message: "System clock is not synchronized"
          - warn:
              when: "ntp == unsynchronized+active"
              message: System clock not yet synchronized
          - pass:
              when: "ntp == synchronized+active"
              message: "System clock is synchronized"
          - warn:
              when: "timezone != UTC"
              message: "Non UTC timezone can interfere with system function"
          - pass:
              when: "timezone == UTC"
              message: "Timezone is set to UTC"
    - diskUsage:
        checkName: "root"
        collectorName: "root"
        outcomes:
          - fail:
              when: "total < 40Gi"
              message: The disk containing directory / has less than 40Gi of total space
          - warn:
              when: "used/total > 80%"
              message: The disk containing directory / is more than 80% full
          - warn:
              when: "available < 10Gi"
              message: The disk containing directory / has less than 10Gi of disk space available
          - pass:
              message: The disk containing directory / has sufficient space
    - diskUsage:
        checkName: "tmp"
        collectorName: "tmp"
        outcomes:
          - warn:
              when: "total < 8Gi"
              message: The disk containing directory /tmp has less than 8Gi of total space
          - warn:
              when: "used/total > 80%"
              message: The disk containing directory /tmp is more than 80% full
          - warn:
              when: "available < 2Gi"
              message: The disk containing directory /tmp has less than 2Gi of disk space available
          - pass:
              message: The disk containing directory /tmp has sufficient space
    - filesystemPerformance:
        collectorName: filesystem-latency-two-minute-benchmark
        outcomes:
          - pass:
              when: "p99 < 10ms"
              message: "Write latency is ok (p99 target < 10ms)"
          - warn:
              message: "Write latency is high (p99 >= 10ms, target < 10ms)"
        exclude: true
  analyzers:
    # Each textAnalyze check runs `regex` against the output file of a `run`
    # host collector; the "true"/"false" outcomes are keyed on whether it matched.
    - textAnalyze:
        checkName: Kubernetes API health check
        fileName: host-collectors/run-host/k8s-api-healthz-6443.txt
        regex: 'healthz check passed'
        outcomes:
          - fail:
              when: "false"
              message: "Kubernetes API health check did not pass. One or more components are not working."
          - pass:
              when: "true"
              message: "Kubernetes API health check passed"
    - textAnalyze:
        checkName: ETCD API Health
        fileName: host-collectors/run-host/curl-etcd-health-2379.txt
        regex: '"health":"true"'
        outcomes:
          - fail:
              when: "false"
              message: "ETCD status returned: unhealthy"
          - pass:
              when: "true"
              message: "ETCD status returned: healthy"
    - textAnalyze:
        checkName: Check if localhost resolves to 127.0.0.1
        fileName: host-collectors/run-host/localhost-ips.txt
        # Dots are escaped and the address ends at a word boundary, so
        # addresses such as 127.0.0.10 do not count as a match.
        regex: 'localhost has address 127\.0\.0\.1\b'
        outcomes:
          - fail:
              when: "false"
              message: "'localhost' does not resolve to 127.0.0.1 ip address"
          - pass:
              when: "true"
              message: "'localhost' resolves to 127.0.0.1 ip address"