I have a container that keeps crashing in my k8s cluster for unknown reasons. The container's process is an nginx server. The container appears to be receiving a SIGQUIT signal.
Dockerfile
# build environment
FROM node:16-alpine as build
WORKDIR /app
ENV PATH /app/node_modules/.bin:$PATH
COPY package.json ./
COPY package-lock.json ./
RUN npm ci --silent
RUN npm install react-scripts@3.4.1 -g --silent
COPY . ./
RUN npm run build
# production environment
FROM nginx:stable-alpine
COPY --from=build /app/build /usr/share/nginx/html
EXPOSE 80
CMD ["nginx", "-g", "daemon off;"]
container logs
/docker-entrypoint.sh: /docker-entrypoint.d/ is not empty, will attempt to perform configuration
/docker-entrypoint.sh: Looking for shell scripts in /docker-entrypoint.d/
/docker-entrypoint.sh: Launching /docker-entrypoint.d/10-listen-on-ipv6-by-default.sh
10-listen-on-ipv6-by-default.sh: info: Getting the checksum of /etc/nginx/conf.d/default.conf
10-listen-on-ipv6-by-default.sh: info: Enabled listen on IPv6 in /etc/nginx/conf.d/default.conf
/docker-entrypoint.sh: Launching /docker-entrypoint.d/20-envsubst-on-templates.sh
/docker-entrypoint.sh: Launching /docker-entrypoint.d/30-tune-worker-processes.sh
/docker-entrypoint.sh: Configuration complete; ready for start up
2021/11/11 06:40:37 [notice] 1#1: using the "epoll" event method
2021/11/11 06:40:37 [notice] 1#1: nginx/1.20.1
2021/11/11 06:40:37 [notice] 1#1: built by gcc 10.2.1 20201203 (Alpine 10.2.1_pre1)
2021/11/11 06:40:37 [notice] 1#1: OS: Linux 5.4.120+
2021/11/11 06:40:37 [notice] 1#1: getrlimit(RLIMIT_NOFILE): 1048576:1048576
2021/11/11 06:40:37 [notice] 1#1: start worker processes
2021/11/11 06:40:37 [notice] 1#1: start worker process 32
2021/11/11 06:40:37 [notice] 1#1: start worker process 33
10.15.128.65 - - [11/Nov/2021:06:40:41 +0000] "\x16\x03\x01\x01\x00\x01\x00\x00\xFC\x03\x03>\x85O#\xCC\xB9\xA5j\xAB\x8D\xC1PpZ\x18$\xE5ah\xDF7\xB1\xFF\xAD\x22\x050\xC3.+\xB6+ \x0F}S)\xC9\x1F\x0BY\x15_\x10\xC6\xAAF\xAA\x9F\x9E_#dG\x01\xF5vzt\xB50&;\x1E\x15\x00&\xC0/\xC00\xC0+\xC0,\xCC\xA8\xCC\xA9\xC0\x13\xC0\x09\xC0\x14\xC0" 400 157 "-" "-" "-"
10.15.128.65 - - [11/Nov/2021:06:40:44 +0000] "\x16\x03\x01\x01\x00\x01\x00\x00\xFC\x03\x03\xD8['\xE75x'\xC3}+v\xC9\x83\x84\x96EKn\xC5\xB6}\xEE\xBE\xD9Gp\xE9\x1BX<n\xB2 \xD9n\xD1\xC5\xFC\xF2\x8D\x92\xAC\xC0\xA8mdF\x17B\xA3y9\xDD\x98b\x0E\x996\xB6\xA5\xAB\xEB\xD4\xDA" 400 157 "-" "-" "-"
10.15.128.65 - - [11/Nov/2021:06:40:47 +0000] "\x16\x03\x01\x01\x00\x01\x00\x00\xFC\x03\x03Fy\x03N\x0E\x11\x89k\x7F\xC5\x00\x90w}\xEB{\x7F\xB1=\xF0" 400 157 "-" "-" "-"
2021/11/11 06:40:47 [notice] 1#1: signal 3 (SIGQUIT) received, shutting down
2021/11/11 06:40:47 [notice] 32#32: gracefully shutting down
2021/11/11 06:40:47 [notice] 32#32: exiting
2021/11/11 06:40:47 [notice] 33#33: gracefully shutting down
2021/11/11 06:40:47 [notice] 32#32: exit
2021/11/11 06:40:47 [notice] 33#33: exiting
2021/11/11 06:40:47 [notice] 33#33: exit
2021/11/11 06:40:47 [notice] 1#1: signal 17 (SIGCHLD) received from 33
2021/11/11 06:40:47 [notice] 1#1: worker process 33 exited with code 0
2021/11/11 06:40:47 [notice] 1#1: signal 29 (SIGIO) received
2021/11/11 06:40:47 [notice] 1#1: signal 17 (SIGCHLD) received from 32
2021/11/11 06:40:47 [notice] 1#1: worker process 32 exited with code 0
2021/11/11 06:40:47 [notice] 1#1: exit
kubectl get pod PODNAME --output=yaml
apiVersion: v1
kind: Pod
metadata:
annotations:
seccomp.security.alpha.kubernetes.io/pod: runtime/default
creationTimestamp: "2021-11-11T06:40:30Z"
generateName: sgb-web-master-fb9f995fb-
labels:
app: sgb-web-master
pod-template-hash: fb9f995fb
name: sgb-web-master-fb9f995fb-zwhgl
namespace: default
ownerReferences:
- apiVersion: apps/v1
blockOwnerDeletion: true
controller: true
kind: ReplicaSet
name: sgb-web-master-fb9f995fb
uid: 96ebf43d-e2e6-4632-a536-764bcab8daeb
resourceVersion: "66168456"
uid: ed80b0d0-6681-4c2a-8edd-16c8ef6bee86
spec:
containers:
- env:
- name: PORT
value: "80"
image: cflynnus/saigonbros-web:master-d70f3001d130bf986da236a08e1fded4b64e8097
imagePullPolicy: Always
livenessProbe:
failureThreshold: 3
httpGet:
path: /
port: 80
scheme: HTTPS
initialDelaySeconds: 3
periodSeconds: 3
successThreshold: 1
timeoutSeconds: 1
name: saigonbros-web
ports:
- containerPort: 80
name: sgb-web-port
protocol: TCP
resources:
limits:
cpu: 500m
ephemeral-storage: 1Gi
memory: 2Gi
requests:
cpu: 500m
ephemeral-storage: 1Gi
memory: 2Gi
securityContext:
capabilities:
drop:
- NET_RAW
terminationMessagePath: /dev/termination-log
terminationMessagePolicy: File
volumeMounts:
- mountPath: /var/run/secrets/kubernetes.io/serviceaccount
name: kube-api-access-rkwb2
readOnly: true
dnsPolicy: ClusterFirst
enableServiceLinks: true
nodeName: gk3-autopilot-cluster-1-default-pool-43dd48b9-tf0n
preemptionPolicy: PreemptLowerPriority
priority: 0
readinessGates:
- conditionType: cloud.google.com/load-balancer-neg-ready
restartPolicy: Always
schedulerName: gke.io/optimize-utilization-scheduler
securityContext:
seccompProfile:
type: RuntimeDefault
serviceAccount: default
serviceAccountName: default
terminationGracePeriodSeconds: 30
tolerations:
- effect: NoExecute
key: node.kubernetes.io/not-ready
operator: Exists
tolerationSeconds: 300
- effect: NoExecute
key: node.kubernetes.io/unreachable
operator: Exists
tolerationSeconds: 300
volumes:
- name: kube-api-access-rkwb2
projected:
defaultMode: 420
sources:
- serviceAccountToken:
expirationSeconds: 3607
path: token
- configMap:
items:
- key: ca.crt
path: ca.crt
name: kube-root-ca.crt
- downwardAPI:
items:
- fieldRef:
apiVersion: v1
fieldPath: metadata.namespace
path: namespace
status:
conditions:
- lastProbeTime: null
lastTransitionTime: null
message: 'Pod is in NEG "Key{\"k8s1-301c19bd-default-sgb-web-master-80-48ae70f6\",
zone: \"asia-southeast1-a\"}". NEG is not attached to any BackendService with
health checking. Marking condition "cloud.google.com/load-balancer-neg-ready"
to True.'
reason: LoadBalancerNegWithoutHealthCheck
status: "True"
type: cloud.google.com/load-balancer-neg-ready
- lastProbeTime: null
lastTransitionTime: "2021-11-11T06:40:33Z"
status: "True"
type: Initialized
- lastProbeTime: null
lastTransitionTime: "2021-11-11T06:44:42Z"
message: 'containers with unready status: [saigonbros-web]'
reason: ContainersNotReady
status: "False"
type: Ready
- lastProbeTime: null
lastTransitionTime: "2021-11-11T06:44:42Z"
message: 'containers with unready status: [saigonbros-web]'
reason: ContainersNotReady
status: "False"
type: ContainersReady
- lastProbeTime: null
lastTransitionTime: "2021-11-11T06:40:33Z"
status: "True"
type: PodScheduled
containerStatuses:
- containerID: containerd://dfc32581c1edda1a221dc00cede918cfb93225e51e505ea7a9f935fc9ab893d5
image: docker.io/cflynnus/saigonbros-web:master-d70f3001d130bf986da236a08e1fded4b64e8097
imageID: docker.io/cflynnus/saigonbros-web@sha256:ff8d6d42511ed6520967007714dfbd46817fca06bb65ae984bc04a8b90346222
lastState:
terminated:
containerID: containerd://dfc32581c1edda1a221dc00cede918cfb93225e51e505ea7a9f935fc9ab893d5
exitCode: 0
finishedAt: "2021-11-11T06:44:41Z"
reason: Completed
startedAt: "2021-11-11T06:44:30Z"
name: saigonbros-web
ready: false
restartCount: 6
started: false
state:
waiting:
message: back-off 2m40s restarting failed container=saigonbros-web pod=sgb-web-master-fb9f995fb-zwhgl_default(ed80b0d0-6681-4c2a-8edd-16c8ef6bee86)
reason: CrashLoopBackOff
hostIP: 10.148.15.200
phase: Running
podIP: 10.15.128.103
podIPs:
- ip: 10.15.128.103
qosClass: Guaranteed
startTime: "2021-11-11T06:40:33Z"
Your liveness probe is configured as HTTPS on port 80. Just change it to HTTP. Look at the key spec.containers.livenessProbe.httpGet.scheme.
Because the liveness probe keeps failing, Kubernetes concludes that your container isn't alive and stops it, which is what causes the SIGQUIT.
Normally this behavior helps you: when a container is no longer alive, Kubernetes restarts it for you.
Edit
You can also identify that behavior in the logs of your nginx:
10.15.128.65 - - [11/Nov/2021:06:40:41 +0000] "\x16\x03\x01\x01\x00\x01\x00\x00\xFC\x03\x03>\x85O#\xCC\xB9\xA5j\xAB\x8D\xC1PpZ\x18$\xE5ah\xDF7\xB1\xFF\xAD\x22\x050\xC3.+\xB6+ \x0F}S)\xC9\x1F\x0BY\x15_\x10\xC6\xAAF\xAA\x9F\x9E_#dG\x01\xF5vzt\xB50&;\x1E\x15\x00&\xC0/\xC00\xC0+\xC0,\xCC\xA8\xCC\xA9\xC0\x13\xC0\x09\xC0\x14\xC0" 400 157 "-" "-" "-"
10.15.128.65 - - [11/Nov/2021:06:40:44 +0000] "\x16\x03\x01\x01\x00\x01\x00\x00\xFC\x03\x03\xD8['\xE75x'\xC3}+v\xC9\x83\x84\x96EKn\xC5\xB6}\xEE\xBE\xD9Gp\xE9\x1BX<n\xB2 \xD9n\xD1\xC5\xFC\xF2\x8D\x92\xAC\xC0\xA8mdF\x17B\xA3y9\xDD\x98b\x0E\x996\xB6\xA5\xAB\xEB\xD4\xDA" 400 157 "-" "-" "-"
10.15.128.65 - - [11/Nov/2021:06:40:47 +0000] "\x16\x03\x01\x01\x00\x01\x00\x00\xFC\x03\x03Fy\x03N\x0E\x11\x89k\x7F\xC5\x00\x90w}\xEB{\x7F\xB1=\xF0" 400 157 "-" "-" "-"
2021/11/11 06:40:47 [notice] 1#1: signal 3 (SIGQUIT) received, shutting down
These are the three liveness probe requests, sent three seconds apart (periodSeconds: 3, failureThreshold: 3). They are unreadable because Kubernetes is sending TLS handshake packets, which are binary data and not human-readable when nginx logs them as a plain-text request line.
Immediately after that, there is the shutdown.
The other way is to read the description of your pod. There you can see that the probe is configured with scheme HTTPS on port 80. HTTPS normally runs on port 443, and port 80 here serves plain HTTP, so this must be a configuration error.
Related
I am having an issue where a custom built rabbitmq image works in docker, but continuously restarts within kubernetes.
Dockerfile:
# syntax=docker/dockerfile:1
FROM rabbitmq:management-alpine
ADD rabbitmq.conf /etc/rabbitmq/
ADD definitions.json /etc/rabbitmq/
ENTRYPOINT ["docker-entrypoint.sh"]
EXPOSE 4369 5671 5672 15691 15692 25672
CMD ["rabbitmq-server"]
When run with a simple docker run <IMAGE>, I get logs indicating success, and clearly the service is running in the container:
...
2022-11-25 16:37:41.392367+00:00 [info] <0.229.0> Importing concurrently 7 exchanges...
2022-11-25 16:37:41.394591+00:00 [info] <0.229.0> Importing sequentially 1 global runtime parameters...
2022-11-25 16:37:41.395691+00:00 [info] <0.229.0> Importing concurrently 7 queues...
2022-11-25 16:37:41.400586+00:00 [info] <0.229.0> Importing concurrently 7 bindings...
2022-11-25 16:37:41.403519+00:00 [info] <0.787.0> Resetting node maintenance status
2022-11-25 16:37:41.414900+00:00 [info] <0.846.0> Management plugin: HTTP (non-TLS) listener started on port 15672
2022-11-25 16:37:41.414963+00:00 [info] <0.874.0> Statistics database started.
2022-11-25 16:37:41.415003+00:00 [info] <0.873.0> Starting worker pool 'management_worker_pool' with 3 processes in it
2022-11-25 16:37:41.423652+00:00 [info] <0.888.0> Prometheus metrics: HTTP (non-TLS) listener started on port 15692
2022-11-25 16:37:41.423704+00:00 [info] <0.787.0> Ready to start client connection listeners
2022-11-25 16:37:41.424455+00:00 [info] <0.932.0> started TCP listener on [::]:5672
completed with 4 plugins.
2022-11-25 16:37:41.448054+00:00 [info] <0.787.0> Server startup complete; 4 plugins started.
2022-11-25 16:37:41.448054+00:00 [info] <0.787.0> * rabbitmq_prometheus
2022-11-25 16:37:41.448054+00:00 [info] <0.787.0> * rabbitmq_management
2022-11-25 16:37:41.448054+00:00 [info] <0.787.0> * rabbitmq_web_dispatch
2022-11-25 16:37:41.448054+00:00 [info] <0.787.0> * rabbitmq_management_agent
However, if I take this container, and deploy it within my kubernetes cluster, the pod seems to start, and then exit into a "CrashLoopBackoff" state.
kubectl logs <POD> returns:
Segmentation fault (core dumped)
and kubectl describe pod <POD> returns:
Name: rabbitmq-0
Namespace: *****
Priority: 0
Service Account: *****
Node: minikube/*****
Start Time: Thu, 24 Nov 2022 00:35:28 -0500
Labels: app=rabbitmq
controller-revision-hash=rabbitmq-75d6d74c5d
statefulset.kubernetes.io/pod-name=rabbitmq-0
Annotations: <none>
Status: Running
IP: *****
IPs:
IP: *****
Controlled By: StatefulSet/rabbitmq
Containers:
rabbitmq-deployment:
Container ID: docker://32930809a10ced998083d8adacec209da7081b7c7bfda605f7ac87f78cf23fda
Image: *****/<POD>:latest
Image ID: *****
Ports: 5672/TCP, 15672/TCP, 15692/TCP, 4369/TCP
Host Ports: 0/TCP, 0/TCP, 0/TCP, 0/TCP
State: Waiting
Reason: CrashLoopBackOff
Last State: Terminated
Reason: Completed
Exit Code: 0
Started: Thu, 24 Nov 2022 00:41:26 -0500
Finished: Thu, 24 Nov 2022 00:41:27 -0500
Ready: False
Restart Count: 6
Liveness: exec [rabbitmq-diagnostics status] delay=60s timeout=15s period=60s #success=1 #failure=3
Readiness: exec [rabbitmq-diagnostics ping] delay=20s timeout=10s period=60s #success=1 #failure=3
Environment: <none>
Mounts:
/var/run/secrets/kubernetes.io/serviceaccount from kube-api-access-sst9x (ro)
Conditions:
Type Status
Initialized True
Ready False
ContainersReady False
PodScheduled True
Volumes:
kube-api-access-sst9x:
Type: Projected (a volume that contains injected data from multiple sources)
TokenExpirationSeconds: 3607
ConfigMapName: kube-root-ca.crt
ConfigMapOptional: <nil>
DownwardAPI: true
QoS Class: BestEffort
Node-Selectors: <none>
Tolerations: node.kubernetes.io/not-ready:NoExecute op=Exists for 300s
node.kubernetes.io/unreachable:NoExecute op=Exists for 300s
Events:
Type Reason Age From Message
---- ------ ---- ---- -------
Normal Scheduled 35h default-scheduler Successfully assigned mpa/rabbitmq-0 to minikube
Normal Pulled 35h kubelet Successfully pulled image "*****" in 622.632929ms
Normal Pulled 35h kubelet Successfully pulled image "*****" in 233.765678ms
Normal Pulled 35h kubelet Successfully pulled image "*****" in 203.932962ms
Normal Pulling 35h (x4 over 35h) kubelet Pulling image "*****"
Normal Created 35h (x4 over 35h) kubelet Created container rabbitmq-deployment
Normal Started 35h (x4 over 35h) kubelet Started container rabbitmq-deployment
Normal Pulled 35h kubelet Successfully pulled image "*****" in 212.459802ms
Warning BackOff 35h (x52 over 35h) kubelet Back-off restarting failed container
The section of that describe command that states:
State: Waiting
Reason: CrashLoopBackOff
Last State: Terminated
Reason: Completed
makes me wonder if the process isn't properly being left running. It's almost as if rabbitmq is starting, and then exiting once initialized.
Is there something I am missing here? Thank you.
EDIT:
kubectl get all gives:
NAME READY STATUS RESTARTS AGE
pod/auth-deployment-9cfd4c64f-c5v99 1/1 Running 0 19m
pod/config-deployment-d4f4c959c-dnspd 1/1 Running 0 20m
pod/rabbitmq-0 0/1 CrashLoopBackOff 8 (4m45s ago) 20m
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
service/auth-service ClusterIP 10.101.181.223 <none> 8080/TCP 19m
service/config-service ClusterIP 10.98.208.163 <none> 8080/TCP 20m
NAME READY UP-TO-DATE AVAILABLE AGE
deployment.apps/auth-deployment 1/1 1 1 19m
deployment.apps/config-deployment 1/1 1 1 20m
NAME DESIRED CURRENT READY AGE
replicaset.apps/auth-deployment-9cfd4c64f 1 1 1 19m
replicaset.apps/config-deployment-d4f4c959c 1 1 1 20m
NAME READY AGE
statefulset.apps/rabbitmq 0/1 20m
When running a Go HTTPS server locally with self signed certificates, things are fine
When deploying the same server in a Docker container (via skaffold, or on Google GKE), ListenAndServeTLS appears to hang and the container keeps getting recreated.
The certificate was created via:
openssl genrsa -out https-server.key 2048
openssl ecparam -genkey -name secp384r1 -out https-server.key
openssl req -new -x509 -sha256 -key https-server.key -out https-server.crt -days 3650
main.go contains:
if IsSSL {
err := http.ListenAndServeTLS(addr+":"+srvPort, os.Getenv("CERT_FILE"), os.Getenv("KEY_FILE"), handler)
if err != nil {
log.Fatal(err)
}
} else {
log.Fatal(http.ListenAndServe(addr+":"+srvPort, handler))
}
The crt and key files are passed via K8s secrets and my yaml file contains the following:
apiVersion: apps/v1
kind: Deployment
metadata:
name: frontend
spec:
selector:
matchLabels:
app: frontend
template:
metadata:
labels:
app: frontend
annotations:
sidecar.istio.io/rewriteAppHTTPProbers: "true"
spec:
volumes:
- name: google-cloud-key
secret:
secretName: ecomm-key
- name: ssl-cert
secret:
secretName: ecomm-cert-server
- name: ssl-key
secret:
secretName: ecomm-cert-key
containers:
- name: frontend
image: gcr.io/sca-ecommerce-291313/frontend:latest
ports:
- containerPort: 8080
readinessProbe:
initialDelaySeconds: 10
httpGet:
path: "/_healthz"
port: 8080
httpHeaders:
- name: "Cookie"
value: "shop_session-id=x-readiness-probe"
livenessProbe:
initialDelaySeconds: 10
httpGet:
path: "/_healthz"
port: 8080
httpHeaders:
- name: "Cookie"
value: "shop_session-id=x-liveness-probe"
volumeMounts:
- name: ssl-cert
mountPath: /var/secrets/ssl-cert
- name: ssl-key
mountPath: /var/secrets/ssl-key
env:
- name: USE_SSL
value: "true"
- name: CERT_FILE
value: "/var/secrets/ssl-cert/cert-server.pem"
- name: KEY_FILE
value: "/var/secrets/ssl-key/cert-key.pem"
- name: PORT
value: "8080"
I have the same behaviour when referencing the file directly in the code like:
err := http.ListenAndServeTLS(addr+":"+srvPort, "https-server.crt", "https-server.key", handler)
The strange and unhelpful thing is that ListenAndServeTLS does not produce any log output (when using kubectl logs) explaining why it is hanging, or any hint about the problem.
Looking at the kubectl describe pod output:
Name: frontend-85f4d9cb8c-9bjh4
Namespace: ecomm-ns
Priority: 0
Start Time: Fri, 01 Jan 2021 17:04:29 +0100
Labels: app=frontend
app.kubernetes.io/managed-by=skaffold
pod-template-hash=85f4d9cb8c
skaffold.dev/run-id=44518449-c1c1-4b6c-8cc1-406ac6d6b91f
Annotations: sidecar.istio.io/rewriteAppHTTPProbers: true
Status: Running
IP: 192.168.10.7
IPs:
IP: 192.168.10.7
Controlled By: ReplicaSet/frontend-85f4d9cb8c
Containers:
frontend:
Container ID: docker://f867ea7a2f99edf891b571f80ae18f10e261375e073b9d2007bbff1600d272c7
Image: gcr.io/sca-ecommerce-291313/frontend:5110aa8a87655b07cc71ffb2c46fd8739e3c25c222a637b2f5a7a1af1bfccc22
Image ID: docker://sha256:5110aa8a87655b07cc71ffb2c46fd8739e3c25c222a637b2f5a7a1af1bfccc22
Port: 8080/TCP
Host Port: 0/TCP
State: Running
Started: Fri, 01 Jan 2021 17:05:08 +0100
Last State: Terminated
Reason: Error
Exit Code: 2
Started: Fri, 01 Jan 2021 17:04:37 +0100
Finished: Fri, 01 Jan 2021 17:05:07 +0100
Ready: False
Restart Count: 1
Limits:
cpu: 200m
memory: 128Mi
Requests:
cpu: 100m
memory: 64Mi
Liveness: http-get http://:8080/_healthz delay=10s timeout=1s period=10s #success=1 #failure=3
Readiness: http-get http://:8080/_healthz delay=10s timeout=1s period=10s #success=1 #failure=3
Environment:
GOOGLE_APPLICATION_CREDENTIALS: /var/secrets/google/key.json
CERT_FILE: /var/secrets/ssl-cert/cert-server.crt
KEY_FILE: /var/secrets/ssl-key/cert-server.key
PORT: 8080
USE_SSL: true
ONLINE_PRODUCT_CATALOG_SERVICE_ADDR: onlineproductcatalogservice:4040
ENV_PLATFORM: gcp
DISABLE_TRACING: 1
DISABLE_PROFILER: 1
Mounts:
/var/run/secrets/kubernetes.io/serviceaccount from default-token-tm62d (ro)
/var/secrets/google from google-cloud-key (rw)
/var/secrets/ssl-cert from ssl-cert (rw)
/var/secrets/ssl-key from ssl-key (rw)
Conditions:
Type Status
Initialized True
Ready False
ContainersReady False
PodScheduled True
Volumes:
google-cloud-key:
Type: Secret (a volume populated by a Secret)
SecretName: ecomm-key
Optional: false
ssl-cert:
Type: Secret (a volume populated by a Secret)
SecretName: https-cert-server
Optional: false
ssl-key:
Type: Secret (a volume populated by a Secret)
SecretName: https-cert-key
Optional: false
default-token-tm62d:
Type: Secret (a volume populated by a Secret)
SecretName: default-token-tm62d
Optional: false
QoS Class: Burstable
Node-Selectors: <none>
Tolerations: node.kubernetes.io/not-ready:NoExecute op=Exists for 300s
node.kubernetes.io/unreachable:NoExecute op=Exists for 300s
Events:
Type Reason Age From Message
---- ------ ---- ---- -------
Normal Scheduled 46s default-scheduler Successfully assigned ecomm-ns/frontend-85f4d9cb8c-9bjh4
Warning Unhealthy 17s (x2 over 27s) kubelet Readiness probe failed: HTTP probe failed with statuscode: 400
Normal Pulled 8s (x2 over 41s) kubelet Container image "gcr.io/frontend:5110aa8a87655b07cc71ffb2c46fd8739e3c25c222a637b2f5a7a1af1bfccc22" already present on machine
Normal Created 8s (x2 over 39s) kubelet Created container frontend
Warning Unhealthy 8s (x3 over 28s) kubelet Liveness probe failed: HTTP probe failed with statuscode: 400
Normal Killing 8s kubelet Container frontend failed liveness probe, will be restarted
Normal Started 7s (x2 over 38s) kubelet Started container frontend
The liveness and readiness probes are both getting a 400 response.
I've accidentally drained/uncordoned all nodes in Kubernetes (even the master), and now I'm trying to bring the cluster back by connecting to etcd and manually changing some keys in there. I successfully opened a shell in the etcd container:
$ docker ps
CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES
8fbcb67da963 quay.io/coreos/etcd:v3.3.10 "/usr/local/bin/etcd" 17 hours ago Up 17 hours etcd1
a0d6426df02a cd48205a40f0 "kube-controller-man…" 17 hours ago Up 17 hours k8s_kube-controller-manager_kube-controller-manager-node1_kube-system_0441d7804a7366fd957f8b402008efe5_16
5fa8e47441a0 6bed756ced73 "kube-scheduler --au…" 17 hours ago Up 17 hours k8s_kube-scheduler_kube-scheduler-node1_kube-system_6f33d7866b72ca1b13c79edd42fa8dc6_14
2c8e07cf499f gcr.io/google_containers/pause-amd64:3.1 "/pause" 17 hours ago Up 17 hours k8s_POD_kube-scheduler-node1_kube-system_6f33d7866b72ca1b13c79edd42fa8dc6_3
2ca43282ea1c gcr.io/google_containers/pause-amd64:3.1 "/pause" 17 hours ago Up 17 hours k8s_POD_kube-controller-manager-node1_kube-system_0441d7804a7366fd957f8b402008efe5_3
9473644a3333 gcr.io/google_containers/pause-amd64:3.1 "/pause" 17 hours ago Up 17 hours k8s_POD_kube-apiserver-node1_kube-system_93ff1a9840f77f8b2b924a85815e17fe_3
and then I run:
docker exec -it 8fbcb67da963 /bin/sh
and then I try to run the following:
ETCDCTL_API=3 etcdctl --endpoints https://172.16.16.111:2379 --cacert /etc/ssl/etcd/ssl/ca.pem --key /etc/ssl/etcd/ssl/member-node1-key.pem --cert /etc/ssl/etcd/ssl/member-node1.pem get / --prefix=true -w json --debug
and here is the result I get:
ETCDCTL_CACERT=/etc/ssl/etcd/ssl/ca.pem
ETCDCTL_CERT=/etc/ssl/etcd/ssl/member-node1.pem
ETCDCTL_COMMAND_TIMEOUT=5s
ETCDCTL_DEBUG=true
ETCDCTL_DIAL_TIMEOUT=2s
ETCDCTL_DISCOVERY_SRV=
ETCDCTL_ENDPOINTS=[https://172.16.16.111:2379]
ETCDCTL_HEX=false
ETCDCTL_INSECURE_DISCOVERY=true
ETCDCTL_INSECURE_SKIP_TLS_VERIFY=false
ETCDCTL_INSECURE_TRANSPORT=true
ETCDCTL_KEEPALIVE_TIME=2s
ETCDCTL_KEEPALIVE_TIMEOUT=6s
ETCDCTL_KEY=/etc/ssl/etcd/ssl/member-node1-key.pem
ETCDCTL_USER=
ETCDCTL_WRITE_OUT=json
INFO: 2020/06/24 15:44:07 ccBalancerWrapper: updating state and picker called by balancer: IDLE, 0xc420246c00
INFO: 2020/06/24 15:44:07 dialing to target with scheme: ""
INFO: 2020/06/24 15:44:07 could not get resolver for scheme: ""
INFO: 2020/06/24 15:44:07 balancerWrapper: is pickfirst: false
INFO: 2020/06/24 15:44:07 balancerWrapper: got update addr from Notify: [{172.16.16.111:2379 <nil>}]
INFO: 2020/06/24 15:44:07 ccBalancerWrapper: new subconn: [{172.16.16.111:2379 0 <nil>}]
INFO: 2020/06/24 15:44:07 balancerWrapper: handle subconn state change: 0xc4201708d0, CONNECTING
INFO: 2020/06/24 15:44:07 ccBalancerWrapper: updating state and picker called by balancer: CONNECTING, 0xc420246c00
Error: context deadline exceeded
Here is my etcd.env:
# Environment file for etcd v3.3.10
ETCD_DATA_DIR=/var/lib/etcd
ETCD_ADVERTISE_CLIENT_URLS=https://172.16.16.111:2379
ETCD_INITIAL_ADVERTISE_PEER_URLS=https://172.16.16.111:2380
ETCD_INITIAL_CLUSTER_STATE=existing
ETCD_METRICS=basic
ETCD_LISTEN_CLIENT_URLS=https://172.16.16.111:2379,https://127.0.0.1:2379
ETCD_ELECTION_TIMEOUT=5000
ETCD_HEARTBEAT_INTERVAL=250
ETCD_INITIAL_CLUSTER_TOKEN=k8s_etcd
ETCD_LISTEN_PEER_URLS=https://172.16.16.111:2380
ETCD_NAME=etcd1
ETCD_PROXY=off
ETCD_INITIAL_CLUSTER=etcd1=https://172.16.16.111:2380,etcd2=https://172.16.16.112:2380,etcd3=https://172.16.16.113:2380
ETCD_AUTO_COMPACTION_RETENTION=8
ETCD_SNAPSHOT_COUNT=10000
# TLS settings
ETCD_TRUSTED_CA_FILE=/etc/ssl/etcd/ssl/ca.pem
ETCD_CERT_FILE=/etc/ssl/etcd/ssl/member-node1.pem
ETCD_KEY_FILE=/etc/ssl/etcd/ssl/member-node1-key.pem
ETCD_CLIENT_CERT_AUTH=true
ETCD_PEER_TRUSTED_CA_FILE=/etc/ssl/etcd/ssl/ca.pem
ETCD_PEER_CERT_FILE=/etc/ssl/etcd/ssl/member-node1.pem
ETCD_PEER_KEY_FILE=/etc/ssl/etcd/ssl/member-node1-key.pem
ETCD_PEER_CLIENT_CERT_AUTH=True
Update 1:
Here is my kubeadm-config.yaml:
apiVersion: kubeadm.k8s.io/v1beta2
kind: InitConfiguration
localAPIEndpoint:
advertiseAddress: 172.16.16.111
bindPort: 6443
certificateKey: d73faece88f86e447eea3ca38f7b07e0a1f0bbb886567fee3b8cf8848b1bf8dd
nodeRegistration:
name: node1
taints: []
criSocket: /var/run/dockershim.sock
---
apiVersion: kubeadm.k8s.io/v1beta2
kind: ClusterConfiguration
clusterName: cluster.local
etcd:
external:
endpoints:
- https://172.16.16.111:2379
- https://172.16.16.112:2379
- https://172.16.16.113:2379
caFile: /etc/ssl/etcd/ssl/ca.pem
certFile: /etc/ssl/etcd/ssl/node-node1.pem
keyFile: /etc/ssl/etcd/ssl/node-node1-key.pem
dns:
type: CoreDNS
imageRepository: docker.io/coredns
imageTag: 1.6.0
networking:
dnsDomain: cluster.local
serviceSubnet: 10.233.0.0/18
podSubnet: 10.233.64.0/18
kubernetesVersion: v1.16.6
controlPlaneEndpoint: 172.16.16.111:6443
certificatesDir: /etc/kubernetes/ssl
imageRepository: gcr.io/google-containers
apiServer:
extraArgs:
anonymous-auth: "True"
authorization-mode: Node,RBAC
bind-address: 0.0.0.0
insecure-port: "0"
apiserver-count: "1"
endpoint-reconciler-type: lease
service-node-port-range: 30000-32767
kubelet-preferred-address-types: "InternalDNS,InternalIP,Hostname,ExternalDNS,ExternalIP"
profiling: "False"
request-timeout: "1m0s"
enable-aggregator-routing: "False"
storage-backend: etcd3
runtime-config:
allow-privileged: "true"
extraVolumes:
- name: usr-share-ca-certificates
hostPath: /usr/share/ca-certificates
mountPath: /usr/share/ca-certificates
readOnly: true
certSANs:
- kubernetes
- kubernetes.default
- kubernetes.default.svc
- kubernetes.default.svc.cluster.local
- 10.233.0.1
- localhost
- 127.0.0.1
- node1
- lb-apiserver.kubernetes.local
- 172.16.16.111
- node1.cluster.local
timeoutForControlPlane: 5m0s
controllerManager:
extraArgs:
node-monitor-grace-period: 40s
node-monitor-period: 5s
pod-eviction-timeout: 5m0s
node-cidr-mask-size: "24"
profiling: "False"
terminated-pod-gc-threshold: "12500"
bind-address: 0.0.0.0
configure-cloud-routes: "false"
scheduler:
extraArgs:
bind-address: 0.0.0.0
extraVolumes:
---
apiVersion: kubeproxy.config.k8s.io/v1alpha1
kind: KubeProxyConfiguration
bindAddress: 0.0.0.0
clientConnection:
acceptContentTypes:
burst: 10
contentType: application/vnd.kubernetes.protobuf
kubeconfig:
qps: 5
clusterCIDR: 10.233.64.0/18
configSyncPeriod: 15m0s
conntrack:
maxPerCore: 32768
min: 131072
tcpCloseWaitTimeout: 1h0m0s
tcpEstablishedTimeout: 24h0m0s
enableProfiling: False
healthzBindAddress: 0.0.0.0:10256
hostnameOverride: node1
iptables:
masqueradeAll: False
masqueradeBit: 14
minSyncPeriod: 0s
syncPeriod: 30s
ipvs:
excludeCIDRs: []
minSyncPeriod: 0s
scheduler: rr
syncPeriod: 30s
strictARP: False
metricsBindAddress: 127.0.0.1:10249
mode: ipvs
nodePortAddresses: []
oomScoreAdj: -999
portRange:
udpIdleTimeout: 250ms
---
apiVersion: kubelet.config.k8s.io/v1beta1
kind: KubeletConfiguration
clusterDNS:
- 169.254.25.10
Update 2:
Contents of /etc/kubernetes/manifests/kube-apiserver.yaml:
apiVersion: v1
kind: Pod
metadata:
creationTimestamp: null
labels:
component: kube-apiserver
tier: control-plane
name: kube-apiserver
namespace: kube-system
spec:
containers:
- command:
- kube-apiserver
- --advertise-address=172.16.16.111
- --allow-privileged=true
- --anonymous-auth=True
- --apiserver-count=1
- --authorization-mode=Node,RBAC
- --bind-address=0.0.0.0
- --client-ca-file=/etc/kubernetes/ssl/ca.crt
- --enable-admission-plugins=NodeRestriction
- --enable-aggregator-routing=False
- --enable-bootstrap-token-auth=true
- --endpoint-reconciler-type=lease
- --etcd-cafile=/etc/ssl/etcd/ssl/ca.pem
- --etcd-certfile=/etc/ssl/etcd/ssl/node-node1.pem
- --etcd-keyfile=/etc/ssl/etcd/ssl/node-node1-key.pem
- --etcd-servers=https://172.16.16.111:2379,https://172.16.16.112:2379,https://172.16.16.113:2379
- --insecure-port=0
- --kubelet-client-certificate=/etc/kubernetes/ssl/apiserver-kubelet-client.crt
- --kubelet-client-key=/etc/kubernetes/ssl/apiserver-kubelet-client.key
- --kubelet-preferred-address-types=InternalDNS,InternalIP,Hostname,ExternalDNS,ExternalIP
- --profiling=False
- --proxy-client-cert-file=/etc/kubernetes/ssl/front-proxy-client.crt
- --proxy-client-key-file=/etc/kubernetes/ssl/front-proxy-client.key
- --request-timeout=1m0s
- --requestheader-allowed-names=front-proxy-client
- --requestheader-client-ca-file=/etc/kubernetes/ssl/front-proxy-ca.crt
- --requestheader-extra-headers-prefix=X-Remote-Extra-
- --requestheader-group-headers=X-Remote-Group
- --requestheader-username-headers=X-Remote-User
- --runtime-config=
- --secure-port=6443
- --service-account-key-file=/etc/kubernetes/ssl/sa.pub
- --service-cluster-ip-range=10.233.0.0/18
- --service-node-port-range=30000-32767
- --storage-backend=etcd3
- --tls-cert-file=/etc/kubernetes/ssl/apiserver.crt
- --tls-private-key-file=/etc/kubernetes/ssl/apiserver.key
image: gcr.io/google-containers/kube-apiserver:v1.16.6
imagePullPolicy: IfNotPresent
livenessProbe:
failureThreshold: 8
httpGet:
host: 172.16.16.111
path: /healthz
port: 6443
scheme: HTTPS
initialDelaySeconds: 15
timeoutSeconds: 15
name: kube-apiserver
resources:
requests:
cpu: 250m
volumeMounts:
- mountPath: /etc/ssl/certs
name: ca-certs
readOnly: true
- mountPath: /etc/ca-certificates
name: etc-ca-certificates
readOnly: true
- mountPath: /etc/ssl/etcd/ssl
name: etcd-certs-0
readOnly: true
- mountPath: /etc/kubernetes/ssl
name: k8s-certs
readOnly: true
- mountPath: /usr/local/share/ca-certificates
name: usr-local-share-ca-certificates
readOnly: true
- mountPath: /usr/share/ca-certificates
name: usr-share-ca-certificates
readOnly: true
hostNetwork: true
priorityClassName: system-cluster-critical
volumes:
- hostPath:
path: /etc/ssl/certs
type: DirectoryOrCreate
name: ca-certs
- hostPath:
path: /etc/ca-certificates
type: DirectoryOrCreate
name: etc-ca-certificates
- hostPath:
path: /etc/ssl/etcd/ssl
type: DirectoryOrCreate
name: etcd-certs-0
- hostPath:
path: /etc/kubernetes/ssl
type: DirectoryOrCreate
name: k8s-certs
- hostPath:
path: /usr/local/share/ca-certificates
type: DirectoryOrCreate
name: usr-local-share-ca-certificates
- hostPath:
path: /usr/share/ca-certificates
type: ""
name: usr-share-ca-certificates
status: {}
I used kubespray to install the cluster.
How can I connect to the etcd? Any help would be appreciated.
This "context deadline exceeded" error generally happens for one of two reasons:
1. You are using the wrong certificates. You could be using peer certificates instead of client certificates. Check the Kubernetes API server parameters, which will tell you where the client certificates are located, because the Kubernetes API server is itself a client of etcd. Then you can use those same certificates in the etcdctl command from the node.
2. The etcd cluster is no longer operational because peer members are down.
I've dockerized a Flask app, using gunicorn to serve it. The last line of my Dockerfile is:
CMD source activate my_env && gunicorn --timeout 333 --bind 0.0.0.0:5000 app:app
When running the app locally – either straight in my console, without docker, or with
docker run -dit \
--name my-app \
--publish 5000:5000 \
my-app:latest
It boots up fine. I get a log like:
[2018-12-04 19:32:30 +0000] [8] [INFO] Starting gunicorn 19.7.1
[2018-12-04 19:32:30 +0000] [8] [INFO] Listening at: http://0.0.0.0:5000 (8)
[2018-12-04 19:32:30 +0000] [8] [INFO] Using worker: sync
[2018-12-04 19:32:30 +0000] [16] [INFO] Booting worker with pid: 16
<my app's output>
When running the same image in k8s I get
[2018-12-10 21:09:42 +0000] [5] [INFO] Starting gunicorn 19.7.1
[2018-12-10 21:09:42 +0000] [5] [INFO] Listening at: http://0.0.0.0:5000 (5)
[2018-12-10 21:09:42 +0000] [5] [INFO] Using worker: sync
[2018-12-10 21:09:42 +0000] [13] [INFO] Booting worker with pid: 13
[2018-12-10 21:10:52 +0000] [16] [INFO] Booting worker with pid: 16
[2018-12-10 21:10:53 +0000] [19] [INFO] Booting worker with pid: 19
[2018-12-10 21:14:40 +0000] [22] [INFO] Booting worker with pid: 22
[2018-12-10 21:16:14 +0000] [25] [INFO] Booting worker with pid: 25
[2018-12-10 21:16:25 +0000] [28] [INFO] Booting worker with pid: 28
<etc>
My k8s deployment yaml looks like
# Deployment with a single replica whose pod runs two containers:
# my-frontend (port 80) and my-backend (port 5000).
apiVersion: extensions/v1beta1
kind: Deployment
metadata:
  name: my-deployment
spec:
  replicas: 1
  selector:
    matchLabels:
      app: my-app
  template:
    metadata:
      labels:
        app: my-app
    spec:
      imagePullSecrets:
        - name: regcred
      containers:
        - name: my-frontend
          image: my-registry/my-frontend:latest
          ports:
            - containerPort: 80
        - name: my-backend
          image: my-registry/my-backend:latest
          ports:
            - containerPort: 5000
Here, the container in question is my-backend. Any ideas why this is happening?
Update: As I wrote this, the events list that is printed with kubectl describe pods was updated with the following:
Warning FailedMount 9m55s kubelet, minikube MountVolume.SetUp failed for volume "default-token-k2shm" : Get https://localhost:8443/api/v1/namespaces/default/secrets/default-token-k2shm: net/http: TLS handshake timeout
Warning FailedMount 9m53s (x2 over 9m54s) kubelet, minikube MountVolume.SetUp failed for volume "default-token-k2shm" : secrets "default-token-k2shm" is forbidden: User "system:node:minikube" cannot get secrets in the namespace "default": no path found to object
Normal SuccessfulMountVolume 9m50s kubelet, minikube MountVolume.SetUp succeeded for volume "default-token-k2shm"
Not sure if it's relevant to my issue
I solved this by adding resources under the container - mine needed more memory.
# Resource requests and limits; this fragment goes under the container entry.
resources:
  requests:
    memory: "512Mi"
    cpu: 0.1
  limits:
    memory: "1024Mi"
    cpu: 1.0
Hope that helps.
I have a problem when I set the kubelet parameter cluster-dns.
My OS is CentOS Linux release 7.0.1406 (Core)
Kernel:Linux master 3.10.0-693.el7.x86_64 #1 SMP Tue Aug 22 21:09:27 UTC 2017 x86_64 x86_64 x86_64 GNU/Linux
kubelet config file:
# Kubelet environment file. Each variable holds one or more kubelet
# command-line flags that the kubelet unit's ExecStart line expands.
KUBELET_HOSTNAME="--hostname-override=master"
#KUBELET_API_SERVER="--api-servers=http://master:8080
KUBECONFIG="--kubeconfig=/root/.kube/config-demo"
# NOTE(review): the value below starts with an en dash (U+2013) followed by
# "-", not two ASCII hyphens. The kubelet log quoted later in this post
# reports --cluster-dns as empty.
KUBELET_DNS="–-cluster-dns=10.254.0.10"
KUBELET_DOMAIN="--cluster-domain=cluster.local"
# Add your own!
KUBELET_ARGS="--cgroup-driver=systemd --fail-swap-on=false --pod_infra_container_image=177.1.1.35/library/pause:latest"
config file:
# Shared Kubernetes environment file. KUBE_LOGTOSTDERR, KUBE_LOG_LEVEL and
# KUBE_ALLOW_PRIV are expanded on the kubelet unit's ExecStart line.
KUBE_LOGTOSTDERR="--logtostderr=true"
KUBE_LOG_LEVEL="--v=4"
KUBE_ALLOW_PRIV="--allow-privileged=false"
# KUBE_MASTER is not referenced by the kubelet ExecStart shown in this post.
KUBE_MASTER="--master=http://master:8080"
kubelet.service file:
# systemd unit that runs the kubelet after, and dependent on, docker.service.
[Unit]
Description=Kubernetes Kubelet Server
Documentation=https://github.com/GoogleCloudPlatform/kubernetes
After=docker.service
Requires=docker.service
[Service]
WorkingDirectory=/var/lib/kubelet
# The leading "-" makes systemd ignore the file if it does not exist.
EnvironmentFile=-/etc/kubernetes/config
EnvironmentFile=-/etc/kubernetes/kubelet
# Each $VAR below is taken from the two environment files above.
# KUBELET_ADDRESS and KUBELET_PORT are not set in the files shown in this
# post, and KUBELET_API_SERVER is commented out, so they expand to nothing.
ExecStart=/usr/bin/kubelet \
$KUBE_LOGTOSTDERR \
$KUBE_LOG_LEVEL \
$KUBELET_API_SERVER \
$KUBELET_DNS \
$KUBELET_DOMAIN \
$KUBELET_ADDRESS \
$KUBELET_PORT \
$KUBELET_HOSTNAME \
$KUBE_ALLOW_PRIV \
$KUBELET_ARGS \
$KUBECONFIG
Restart=on-failure
KillMode=process
[Install]
WantedBy=multi-user.target
When I start the kubelet service, I can see that the "--cluster-dns=10.254.0.10" parameter is set correctly:
root 29705 1 1 13:24 ? 00:00:16 /usr/bin/kubelet --logtostderr=true --v=4 –-cluster-dns=10.254.0.10 --cluster-domain=cluster.local --hostname-override=master --allow-privileged=false --cgroup-driver=systemd --fail-swap-on=false --pod_infra_container_image=177.1.1.35/library/pause:latest --kubeconfig=/root/.kube/config-demo
But when I check the service with systemctl status kubelet, the cluster-dns parameter has only one "-":
systemctl status kubelet -l
● kubelet.service - Kubernetes Kubelet Server
Loaded: loaded (/usr/lib/systemd/system/kubelet.service; enabled; vendor preset: disabled)
Active: active (running) since Fri 2018-07-13 13:24:07 CST; 5s ago
Docs: https://github.com/GoogleCloudPlatform/kubernetes
Main PID: 29705 (kubelet)
Memory: 30.6M
CGroup: /system.slice/kubelet.service
└─29705 /usr/bin/kubelet --logtostderr=true --v=4 -cluster-dns=10.254.0.10 --cluster-domain=cluster.local --hostname-override=master --allow-privileged=false --cgroup-driver=systemd --fail-swap-on=false --pod_infra_container_image=177.1.1.35/library/pause:latest --kubeconfig=/root/.kube/config-demo
The logs show that nothing is set for the cluster-dns flag:
Jul 13 13:24:07 master kubelet: I0713 13:24:07.680625 29705 flags.go:27] FLAG: --cluster-dns="[]"
Jul 13 13:24:07 master kubelet: I0713 13:24:07.680636 29705 flags.go:27] FLAG: --cluster-domain="cluster.local"
The Pods with errors:
pod: "java-deploy-69c84746b9-b2d7j_default(ce02d183-864f-11e8-9bdb-525400c4f6bf)". kubelet does not have ClusterDNS IP configured and cannot create Pod using "ClusterFirst" policy. Falling back to "Default" policy.
My kube-dns config file:
# Service exposing kube-dns on a fixed cluster IP, port 53 over UDP and TCP.
apiVersion: v1
kind: Service
metadata:
  name: kube-dns
  namespace: kube-system
  labels:
    k8s-app: kube-dns
    kubernetes.io/cluster-service: "true"
    addonmanager.kubernetes.io/mode: Reconcile
    kubernetes.io/name: "KubeDNS"
spec:
  selector:
    k8s-app: kube-dns
  clusterIP: 10.254.0.10
  ports:
    - name: dns
      port: 53
      protocol: UDP
    - name: dns-tcp
      port: 53
      protocol: TCP
---
#apiVersion: v1
#kind: ServiceAccount
#metadata:
# name: kube-dns
# namespace: kube-system
# labels:
# kubernetes.io/cluster-service: "true"
# addonmanager.kubernetes.io/mode: Reconcile
---
# ConfigMap (no data keys defined here) that the kube-dns Deployment mounts
# as the optional "kube-dns-config" volume.
apiVersion: v1
kind: ConfigMap
metadata:
  name: kube-dns
  namespace: kube-system
  labels:
    addonmanager.kubernetes.io/mode: EnsureExists
---
# kube-dns Deployment: one pod template with three containers
# (kubedns, dnsmasq, sidecar).
apiVersion: extensions/v1beta1
kind: Deployment
metadata:
  name: kube-dns
  namespace: kube-system
  labels:
    k8s-app: kube-dns
    kubernetes.io/cluster-service: "true"
    addonmanager.kubernetes.io/mode: Reconcile
spec:
  # replicas: not specified here:
  # 1. In order to make Addon Manager do not reconcile this replicas parameter.
  # 2. Default is 1.
  # 3. Will be tuned in real time if DNS horizontal auto-scaling is turned on.
  strategy:
    rollingUpdate:
      maxSurge: 10%
      maxUnavailable: 0
  selector:
    matchLabels:
      k8s-app: kube-dns
  template:
    metadata:
      labels:
        k8s-app: kube-dns
      annotations:
        scheduler.alpha.kubernetes.io/critical-pod: ''
    spec:
      tolerations:
        - key: "CriticalAddonsOnly"
          operator: "Exists"
      volumes:
        - name: kube-dns-config
          configMap:
            name: kube-dns
            optional: true
      containers:
        - name: kubedns
          image: 177.1.1.35/library/kube-dns:1.14.8
          resources:
            # TODO: Set memory limits when we've profiled the container for large
            # clusters, then set request = limit to keep this container in
            # guaranteed class. Currently, this container falls into the
            # "burstable" category so the kubelet doesn't backoff from restarting it.
            limits:
              memory: 170Mi
            requests:
              cpu: 100m
              memory: 70Mi
          livenessProbe:
            httpGet:
              path: /healthcheck/kubedns
              port: 10054
              scheme: HTTP
            initialDelaySeconds: 60
            timeoutSeconds: 5
            successThreshold: 1
            failureThreshold: 5
          readinessProbe:
            httpGet:
              path: /readiness
              port: 8081
              scheme: HTTP
            # we poll on pod startup for the Kubernetes master service and
            # only setup the /readiness HTTP server once that's available.
            initialDelaySeconds: 3
            timeoutSeconds: 5
          args:
            - --domain=cluster.local.
            - --dns-port=10053
            - --config-dir=/kube-dns-config
            - --kube-master-url=http://177.1.1.40:8080
            - --v=2
          env:
            - name: PROMETHEUS_PORT
              value: "10055"
          ports:
            - containerPort: 10053
              name: dns-local
              protocol: UDP
            - containerPort: 10053
              name: dns-tcp-local
              protocol: TCP
            - containerPort: 10055
              name: metrics
              protocol: TCP
          volumeMounts:
            - name: kube-dns-config
              mountPath: /kube-dns-config
        - name: dnsmasq
          image: 177.1.1.35/library/dnsmasq:1.14.8
          livenessProbe:
            httpGet:
              path: /healthcheck/dnsmasq
              port: 10054
              scheme: HTTP
            initialDelaySeconds: 60
            timeoutSeconds: 5
            successThreshold: 1
            failureThreshold: 5
          args:
            - -v=2
            - -logtostderr
            - -configDir=/etc/k8s/dns/dnsmasq-nanny
            - -restartDnsmasq=true
            - --
            - -k
            - --cache-size=1000
            - --no-negcache
            - --log-facility=-
            - --server=/cluster.local/127.0.0.1#10053
            - --server=/in-addr.arpa/127.0.0.1#10053
            - --server=/ip6.arpa/127.0.0.1#10053
          ports:
            - containerPort: 53
              name: dns
              protocol: UDP
            - containerPort: 53
              name: dns-tcp
              protocol: TCP
          # see: https://github.com/kubernetes/kubernetes/issues/29055 for details
          resources:
            requests:
              cpu: 150m
              memory: 20Mi
          volumeMounts:
            - name: kube-dns-config
              mountPath: /etc/k8s/dns/dnsmasq-nanny
        - name: sidecar
          image: 177.1.1.35/library/sidecar:1.14.8
          livenessProbe:
            httpGet:
              path: /metrics
              port: 10054
              scheme: HTTP
            initialDelaySeconds: 60
            timeoutSeconds: 5
            successThreshold: 1
            failureThreshold: 5
          args:
            - --v=2
            - --logtostderr
            - --probe=kubedns,127.0.0.1:10053,kubernetes.default.svc.cluster.local,5,SRV
            - --probe=dnsmasq,127.0.0.1:53,kubernetes.default.svc.cluster.local,5,SRV
          ports:
            - containerPort: 10054
              name: metrics
              protocol: TCP
          resources:
            requests:
              memory: 20Mi
              cpu: 10m
      dnsPolicy: Default  # Don't use cluster DNS.
      #serviceAccountName: kube-dns
Recheck your kubelet config:
KUBELET_DNS="–-cluster-dns=10.254.0.10"
It seems to me that the first dash is longer than the second.
A copy-and-paste probably introduced that character: it looks like an en dash (U+2013) rather than an ASCII hyphen.
Retype it and retry.