ECS Fargate 任务在使用 Terraform 创建时运行状况检查失败



我创建了一个 ECS 集群和一个负载均衡器,以便使用 Terraform 在 Fargate 上公开一个基本的 hello-world Node.js 应用程序。Terraform 顺利地创建了我的 AWS 资源,并在 ECS Fargate 上部署了正确的映像,但该任务始终无法通过初始运行状况检查,并无限期地重新启动。我认为这是一个端口转发问题,但我相信我的 Dockerfile、负载均衡器和任务定义都公开了正确的端口。

以下是我在 ECS 仪表板上查看服务的"事件"选项卡时看到的错误:

service my-first-service (port 2021) is unhealthy in target-group target-group due to (reason Request timed out).

以下是我的应用程序代码、Dockerfile 和我用来部署到 Fargate 的 Terraform 文件:

index.js

// Minimal Express server: answers GET / with a fixed greeting on a fixed port.
const express = require('express');

const port = 2021;
const app = express();

// Handler for the root route; the request object is not used.
function sendGreeting(_req, res) {
  return res.send('Hello World!');
}

// Invoked once the server has started listening.
function announceStartup() {
  console.log(`Example app listening on port ${port}!`);
}

app.get('/', sendGreeting);
app.listen(port, announceStartup);

Dockerfile

# Build the app image on top of the official Node 12.7.0 Alpine image
FROM node:12.7.0-alpine
# Set the working directory to /app; the relative paths below resolve against it
WORKDIR '/app'
# Copy only package.json first, ahead of the rest of the source
COPY package.json .
# Install the packages listed in package.json with yarn
RUN yarn
# Copy the rest of the build context into the working directory
COPY . .
# Declare port 2021, the port index.js listens on
EXPOSE 2021
# Run index.js with node when the container launches
CMD ["node", "index.js"]

application_load_balancer_target_group.tf

# Target group the load balancer forwards to; targets are registered by IP address.
resource "aws_lb_target_group" "target_group" {
  name        = "target-group"
  vpc_id      = "${aws_default_vpc.default_vpc.id}" # Referencing the default VPC
  target_type = "ip"
  protocol    = "HTTP"
  port        = 80

  # Health check requests go to "/" and accept 200, 301 or 302 as healthy.
  health_check {
    path    = "/"
    matcher = "200,301,302"
  }
}
# HTTP listener on port 80 whose default action forwards to the target group.
resource "aws_lb_listener" "listener" {
  load_balancer_arn = "${aws_alb.application_load_balancer.arn}" # Referencing our load balancer
  protocol          = "HTTP"
  port              = "80"

  default_action {
    target_group_arn = "${aws_lb_target_group.target_group.arn}" # Referencing our target group
    type             = "forward"
  }
}

application_load_balancer.tf

# Application load balancer placed in the three default subnets.
resource "aws_alb" "application_load_balancer" {
  name               = "test-lb-tf" # Naming our load balancer
  load_balancer_type = "application"

  # Referencing the security group
  security_groups = ["${aws_security_group.load_balancer_security_group.id}"]

  # Referencing the default subnets
  subnets = [
    "${aws_default_subnet.default_subnet_a.id}",
    "${aws_default_subnet.default_subnet_b.id}",
    "${aws_default_subnet.default_subnet_c.id}"
  ]
}
# Security group for the load balancer: TCP port 80 in from anywhere, everything out.
resource "aws_security_group" "load_balancer_security_group" {
  ingress {
    protocol    = "tcp"
    from_port   = 80 # Inbound traffic is allowed on port 80 only
    to_port     = 80
    cidr_blocks = ["0.0.0.0/0"] # Allowing traffic in from all sources
  }

  egress {
    protocol    = "-1" # Allowing any outgoing protocol
    from_port   = 0    # Allowing any outgoing port (start of range)
    to_port     = 0    # Allowing any outgoing port (end of range)
    cidr_blocks = ["0.0.0.0/0"] # Allowing traffic out to all IP addresses
  }
}

ecs_cluster.tf

# ECS cluster referenced by the service definition.
resource "aws_ecs_cluster" "my_cluster" {
  name = "my-cluster" # Naming the cluster
}

ecs_service.tf

# Reference to our default VPC (needed by the aws_ecs_service at the bottom of this file,
# and by the target group's vpc_id)
resource "aws_default_vpc" "default_vpc" {}
# References to our default subnets, one per availability zone.
# NOTE: Make sure the availability zones match your zone.
resource "aws_default_subnet" "default_subnet_a" {
  availability_zone = "us-east-2a"
}

resource "aws_default_subnet" "default_subnet_b" {
  availability_zone = "us-east-2b"
}

resource "aws_default_subnet" "default_subnet_c" {
  availability_zone = "us-east-2c"
}

# Fargate service running one copy of the task, registered with the load balancer's target group.
resource "aws_ecs_service" "my_first_service" {
  name            = "my-first-service"                             # Naming our first service
  launch_type     = "FARGATE"
  cluster         = "${aws_ecs_cluster.my_cluster.id}"             # Referencing our created Cluster
  task_definition = "${aws_ecs_task_definition.my_first_task.arn}" # Referencing the task our service will spin up
  desired_count   = 1                                              # Setting the number of containers we want deployed to 1

  # NOTE: The following 'load_balancer' snippet was added here after the creation of the application_load_balancer files.
  load_balancer {
    target_group_arn = "${aws_lb_target_group.target_group.arn}" # Referencing our target group
    # The container name is taken from the task family ("my-first-task")
    container_name   = "${aws_ecs_task_definition.my_first_task.family}"
    container_port   = 2021 # Specifying the container port
  }

  # NOTE(review): no security_groups argument is set in this block, so
  # service_security_group is not attached here - confirm this is intended.
  network_configuration {
    assign_public_ip = true # Providing our containers with public IPs
    subnets = [
      "${aws_default_subnet.default_subnet_a.id}",
      "${aws_default_subnet.default_subnet_b.id}",
      "${aws_default_subnet.default_subnet_c.id}"
    ]
  }
}

# Security group intended for the service: all traffic in from the load balancer's
# security group, all traffic out.
resource "aws_security_group" "service_security_group" {
  ingress {
    protocol  = "-1"
    from_port = 0
    to_port   = 0
    # Only allowing traffic in from the load balancer security group
    security_groups = ["${aws_security_group.load_balancer_security_group.id}"]
  }

  egress {
    protocol    = "-1" # Allowing any outgoing protocol
    from_port   = 0    # Allowing any outgoing port (start of range)
    to_port     = 0    # Allowing any outgoing port (end of range)
    cidr_blocks = ["0.0.0.0/0"] # Allowing traffic out to all IP addresses
  }
}

ecs_task_definition.tf

# Fargate task definition with a single essential container that maps port 2021.
resource "aws_ecs_task_definition" "my_first_task" {
  family                   = "my-first-task" # Naming our first task
  requires_compatibilities = ["FARGATE"]     # Stating that we are using ECS Fargate
  network_mode             = "awsvpc"        # Using awsvpc as our network mode as this is required for Fargate
  cpu                      = 256             # Specifying the CPU our container requires
  memory                   = 512             # Specifying the memory our container requires
  execution_role_arn       = "${aws_iam_role.ecsTaskExecutionRole.arn}"

  # The heredoc body is kept flush-left so the JSON text stays exactly as written.
  container_definitions = <<DEFINITION
[
{
"name": "my-first-task",
"image": "${var.ECR_IMAGE_URL}",
"essential": true,
"portMappings": [
{
"containerPort": 2021,
"hostPort": 2021
}
],
"memory": 512,
"cpu": 256
}
]
DEFINITION
}
# Execution role for the task; its trust policy comes from the assume_role_policy document.
resource "aws_iam_role" "ecsTaskExecutionRole" {
  name               = "ecsTaskExecutionRole"
  assume_role_policy = "${data.aws_iam_policy_document.assume_role_policy.json}"
}
# Trust policy document: lets the ecs-tasks.amazonaws.com service call sts:AssumeRole.
data "aws_iam_policy_document" "assume_role_policy" {
  statement {
    actions = ["sts:AssumeRole"]

    principals {
      type        = "Service"
      identifiers = ["ecs-tasks.amazonaws.com"]
    }
  }
}
# Attaches the AWS-managed AmazonECSTaskExecutionRolePolicy to the execution role.
resource "aws_iam_role_policy_attachment" "ecsTaskExecutionRole_policy" {
  role       = "${aws_iam_role.ecsTaskExecutionRole.name}"
  policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy"
}

我哪里出错了?

当我从 k8s 迁移到 ECS Fargate 时,我遇到了类似的问题。我的任务无法启动,这简直是一场噩梦。同一个映像在 k8s 中使用相同的运行状况检查时运行良好。

我可以看到你在task_definition中缺少healthCheck,至少这对我来说是问题。

这是我的containerDefinition

# Single container definition, encoded to JSON for the task definition.
container_definitions = jsonencode([{
  name      = "${var.app_name}-container-${var.environment}"
  image     = "${var.container_repository}:${var.container_image_version}"
  essential = true

  # Caller-supplied variables first, followed by the fixed entries below.
  environment = concat(
    var.custom_env_variables,
    [
      {
        name  = "JAVA_TOOL_OPTIONS"
        value = "-Xmx${var.container_memory_max_ram}m -XX:MaxRAM=${var.container_memory_max_ram}m -XX:+UseParallelGC -XX:MinHeapFreeRatio=5 -XX:MaxHeapFreeRatio=10 -XX:GCTimeRatio=4"
      },
      {
        name  = "SPRING_PROFILES_ACTIVE"
        value = var.spring_profile
      },
      {
        name  = "APP_NAME"
        value = var.spring_app_name
      }
    ]
  )

  # Two TCP ports: the application port and the actuator port.
  portMappings = [
    {
      protocol      = "tcp"
      containerPort = var.container_port
    },
    {
      protocol      = "tcp"
      containerPort = var.container_actuator_port
    }
  ]

  # Container-level health check: curl the liveness endpoint from inside the container.
  # NOTE(review): the URL hard-codes port 8081 - presumably the actuator port; confirm.
  healthCheck = {
    retries     = 10
    command     = ["CMD-SHELL", "curl -f http://localhost:8081/actuator/liveness || exit 1"]
    timeout     = 5
    interval    = 10
    startPeriod = var.health_start_period
  }

  logConfiguration = {
    logDriver = "awslogs"
    options = {
      awslogs-group         = aws_cloudwatch_log_group.main.name
      awslogs-stream-prefix = "ecs"
      awslogs-region        = var.aws_region
    }
  }

  mountPoints = [{
    sourceVolume  = "backend_efs"
    containerPath = "/data"
    readOnly      = false
  }]
}])

healthCheck 部分:

# Container-level health check: curl the liveness endpoint from inside the container.
healthCheck = {
  retries     = 10
  command     = ["CMD-SHELL", "curl -f http://localhost:8081/actuator/liveness || exit 1"]
  timeout     = 5
  interval    = 10
  startPeriod = var.health_start_period
}

容器要成功启动,需要有一种方法来检查任务是否运行正常。我只能通过 curl 来实现这一点。我有一个端点可以返回应用是否存活。您需要指定您自己的端点,并且它返回 200 非常重要。

另外,映像中默认没有 curl 命令,您需要在 Dockerfile 中安装它。这是我接下来又花了几个小时才解决的问题,因为 ECS 上没有任何明显的错误提示。

我添加了这一行:

RUN apt-get update && apt-get install -y --no-install-recommends curl

从外观上看,您正在创建具有子网的新VPC,但没有定义路由表,没有互联网网关并附加到 VPC。因此,您的 VPC 只是私有的,无法从互联网访问,也无法访问 ECR 来获取您的 docker 映像。

也许不是创建一个名为default_vpc的新 VPC,而是使用现有的默认 VPC。如果是这样,则必须使用数据源

# Look up the existing default VPC instead of managing it as a resource.
data "aws_vpc" "default_vpc" {
  default = true
}

要获取子网,请执行以下操作:

# Collect the IDs of the subnets that belong to the default VPC.
data "aws_subnet_ids" "default" {
  vpc_id = data.aws_vpc.default_vpc.id
}

并修改代码的其余部分以引用这些数据源。

另外,对于 Fargate,应该删除下面这一行:

"hostPort": 2021

您忘记为 ECS 服务设置安全组。它应该是:

# Network settings for the service's tasks, including the service security group.
network_configuration {
  subnets          = data.aws_subnet_ids.default.ids
  security_groups  = [aws_security_group.service_security_group.id]
  assign_public_ip = true # Providing our containers with public IPs
}

最简单的方法是在Dockerfile中添加一个HEALTHCHECK字段,正如AWS所建议的那样,请参阅:https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-ecs-taskdefinition-healthcheck.html。

Dockerfile应如下所示:

FROM node:18-alpine
# Install curl, which the health check command below invokes
RUN apk --no-cache add curl
# health check for ECS
# The trailing backslash keeps CMD inside the HEALTHCHECK instruction; without it
# HEALTHCHECK has no command and the CMD line is parsed as the container's start command.
HEALTHCHECK --interval=10m --timeout=5s \
  CMD curl -f http://localhost:3000/health || exit 1

您也可以更新containerDefinition,但这只会覆盖Dockerfile的设置。

最新更新