V2EX = way to explore
V2EX 是一个关于分享和探索的地方
现在注册
已注册用户请  登录
silence01
V2EX  ›  Java

请教一个线上问题,流量早高峰时,接口耗时增多,前端请求 pending,应该如何分析以及定位问题。

  •  
  •   silence01 · 272 天前 · 1373 次点击
    这是一个创建于 272 天前的主题,其中的信息可能已经有所发展或是发生改变。
    目前服务端是两台 8C32G 机器,一台机器带宽比较大,承载主要流量,这台机器上有个 nginx 容器以及多个 springboot 项目容器,nginx 主要做反向代理以及负载均衡。
    通过阿里云监控可以看到,在流量高峰时,TCP 连接数以及正在建立连接的数量飙高,如下图:
    https://imgur.com/a/wBov3gq
    nginx 的报错日志显示 worker_connections are not enough,
    然后高峰期接口耗时日志明显增加,请问,在这种情况下应该如何分析以及定位问题?试过调高 worker_connections ,但是不起作用。另外感觉目前的服务端架构不太合理,流量入口是在一台机器上,系统瓶颈还是在一台机器上,应该怎么优化呢。
    还请各位大佬不吝赐教。
    6 条回复    2024-04-17 14:56:29 +08:00
    CodeWind
        1
    CodeWind  
       272 天前
    贴一下 nginx 容器的配置,以及 nginx 的配置信息。然后建议将 Nginx 容器的网络模式改为 HostNetwork 试试。
    silence01
        2
    silence01  
    OP
       272 天前
    @CodeWind
    nginx 容器配置信息

    [
    {
    "Id": "c0ea1b73f4a4183219515743d236f20934b7b80ae95d6107b5b71f84a9973f43",
    "Created": "2023-12-27T08:26:37.928226351Z",
    "Path": "/docker-entrypoint.sh",
    "Args": [
    "nginx",
    "-g",
    "daemon off;"
    ],
    "State": {
    "Status": "running",
    "Running": true,
    "Paused": false,
    "Restarting": false,
    "OOMKilled": false,
    "Dead": false,
    "Pid": 28784,
    "ExitCode": 0,
    "Error": "",
    "StartedAt": "2023-12-27T08:26:38.109869375Z",
    "FinishedAt": "0001-01-01T00:00:00Z"
    },
    "Image": "sha256:a99a39d070bfd1cb60fe65c45dea3a33764dc00a9546bf8dc46cb5a11b1b50e9",
    "ResolvConfPath": "/var/lib/docker/containers/c0ea1b73f4a4183219515743d236f20934b7b80ae95d6107b5b71f84a9973f43/resolv.conf",
    "HostnamePath": "/var/lib/docker/containers/c0ea1b73f4a4183219515743d236f20934b7b80ae95d6107b5b71f84a9973f43/hostname",
    "HostsPath": "/var/lib/docker/containers/c0ea1b73f4a4183219515743d236f20934b7b80ae95d6107b5b71f84a9973f43/hosts",
    "LogPath": "",
    "Name": "/nginx",
    "RestartCount": 0,
    "Driver": "overlay2",
    "MountLabel": "",
    "ProcessLabel": "",
    "AppArmorProfile": "",
    "ExecIDs": [
    "6d5fa77c7040de4f287c63225ef28a517878a61f310bb179184e13f1196d216f",
    "2cd8e8e9d77c0b3aec39adcc8e4a2b153d174ad060ec71018c97c5f2b24f1205",
    "86e0bb5b5eeadf7df2809d5a778f332eb01e20518420ae3c82c097798914125d",
    "01f17216c739f84790c190833342f54c6025a4a1bd2c8013dd61e5979f4ce461",
    "efcada90dbba7550b372930560ed1d18cf1b10d91f5c08519b1c01ad7af4ea71",
    "54333782d48cf040258fb14d5e2739156484aab3026dc2720df8ec1cfbf99245",
    "465102c14c5ff464eec011fce630ea553fcd09cb50d9b34a854293a750e4b614",
    "a1ec1c96827ce60bdfa4c68d165820368834c7c5c00d2a6c24d8cca574f16a71",
    "8fc228b0db75742e61f36784b6a4a44651f3a14ee629a25e833586f6fc868f6c"
    ],
    "HostConfig": {
    "Binds": null,
    "ContainerIDFile": "",
    "LogConfig": {
    "Type": "journald",
    "Config": {}
    },
    "NetworkMode": "default",
    "PortBindings": {
    "443/tcp": [
    {
    "HostIp": "",
    "HostPort": "443"
    }
    ],
    "80/tcp": [
    {
    "HostIp": "",
    "HostPort": "80"
    }
    ]
    },
    "RestartPolicy": {
    "Name": "no",
    "MaximumRetryCount": 0
    },
    "AutoRemove": false,
    "VolumeDriver": "",
    "VolumesFrom": null,
    "CapAdd": null,
    "CapDrop": null,
    "Dns": [],
    "DnsOptions": [],
    "DnsSearch": [],
    "ExtraHosts": null,
    "GroupAdd": null,
    "IpcMode": "",
    "Cgroup": "",
    "Links": null,
    "OomScoreAdj": 0,
    "PidMode": "",
    "Privileged": false,
    "PublishAllPorts": false,
    "ReadonlyRootfs": false,
    "SecurityOpt": null,
    "UTSMode": "",
    "UsernsMode": "",
    "ShmSize": 67108864,
    "Runtime": "docker-runc",
    "ConsoleSize": [
    0,
    0
    ],
    "Isolation": "",
    "CpuShares": 0,
    "Memory": 0,
    "NanoCpus": 0,
    "CgroupParent": "",
    "BlkioWeight": 0,
    "BlkioWeightDevice": null,
    "BlkioDeviceReadBps": null,
    "BlkioDeviceWriteBps": null,
    "BlkioDeviceReadIOps": null,
    "BlkioDeviceWriteIOps": null,
    "CpuPeriod": 0,
    "CpuQuota": 0,
    "CpuRealtimePeriod": 0,
    "CpuRealtimeRuntime": 0,
    "CpusetCpus": "",
    "CpusetMems": "",
    "Devices": [],
    "DiskQuota": 0,
    "KernelMemory": 0,
    "MemoryReservation": 0,
    "MemorySwap": 0,
    "MemorySwappiness": -1,
    "OomKillDisable": false,
    "PidsLimit": 0,
    "Ulimits": null,
    "CpuCount": 0,
    "CpuPercent": 0,
    "IOMaximumIOps": 0,
    "IOMaximumBandwidth": 0
    },
    "GraphDriver": {
    "Name": "overlay2",
    "Data": {
    "LowerDir": "/var/lib/docker/overlay2/91e3a4840e9b70c770780d76ddc290ca8c13c6aa6cc486a3f4e4733d0c0c5f22-init/diff:/var/lib/docker/overlay2/5d91f3f600ca558808844dfde84be684d870904ae0f90550b3ec02e8abda0177/diff:/var/lib/docker/overlay2/fcb79371f98a9e2cb86cbc89aee92c140fdd070783e59fc9ee387a10723e3a0c/diff:/var/lib/docker/overlay2/8c7c190ee5730833267d5fc47eca96cf9554f396e578dcdc71402ad420302c43/diff:/var/lib/docker/overlay2/e8d8fc9899cf6baa778175efa5bb3d51c9a99e51025c706f3fcbb0e914a32afe/diff:/var/lib/docker/overlay2/b20b67ff2f5dc33991a222d6590faad8e21eb2d017654423830763c89dbc431b/diff:/var/lib/docker/overlay2/77778c655de26a0011d507926f9f60e1df7da18b901028374625be538600fefe/diff",
    "MergedDir": "/var/lib/docker/overlay2/91e3a4840e9b70c770780d76ddc290ca8c13c6aa6cc486a3f4e4733d0c0c5f22/merged",
    "UpperDir": "/var/lib/docker/overlay2/91e3a4840e9b70c770780d76ddc290ca8c13c6aa6cc486a3f4e4733d0c0c5f22/diff",
    "WorkDir": "/var/lib/docker/overlay2/91e3a4840e9b70c770780d76ddc290ca8c13c6aa6cc486a3f4e4733d0c0c5f22/work"
    }
    },
    "Mounts": [],
    "Config": {
    "Hostname": "c0ea1b73f4a4",
    "Domainname": "",
    "User": "",
    "AttachStdin": false,
    "AttachStdout": false,
    "AttachStderr": false,
    "ExposedPorts": {
    "443/tcp": {},
    "80/tcp": {}
    },
    "Tty": false,
    "OpenStdin": false,
    "StdinOnce": false,
    "Env": [
    "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
    "NGINX_VERSION=1.23.3",
    "NJS_VERSION=0.7.9",
    "PKG_RELEASE=1~bullseye"
    ],
    "Cmd": [
    "nginx",
    "-g",
    "daemon off;"
    ],
    "Image": "nginx",
    "Volumes": null,
    "WorkingDir": "",
    "Entrypoint": [
    "/docker-entrypoint.sh"
    ],
    "OnBuild": null,
    "Labels": {
    "maintainer": "NGINX Docker Maintainers <[email protected]>"
    },
    "StopSignal": "SIGQUIT"
    },
    "NetworkSettings": {
    "Bridge": "",
    "SandboxID": "48f11bccefb8b41c1586a404810af3b4bccbd2225ea61d52e0c0e737f9c1968d",
    "HairpinMode": false,
    "LinkLocalIPv6Address": "",
    "LinkLocalIPv6PrefixLen": 0,
    "Ports": {
    "443/tcp": [
    {
    "HostIp": "0.0.0.0",
    "HostPort": "443"
    }
    ],
    "80/tcp": [
    {
    "HostIp": "0.0.0.0",
    "HostPort": "80"
    }
    ]
    },
    "SandboxKey": "/var/run/docker/netns/48f11bccefb8",
    "SecondaryIPAddresses": null,
    "SecondaryIPv6Addresses": null,
    "EndpointID": "e75e3c0b5836cc4532127e712e60f157d35167f82c5a2befd608c48622fd90d2",
    "Gateway": "172.17.0.1",
    "GlobalIPv6Address": "",
    "GlobalIPv6PrefixLen": 0,
    "IPAddress": "172.17.0.4",
    "IPPrefixLen": 16,
    "IPv6Gateway": "",
    "MacAddress": "02:42:ac:11:00:04",
    "Networks": {
    "bridge": {
    "IPAMConfig": null,
    "Links": null,
    "Aliases": null,
    "NetworkID": "54c36131e69a5d0f69f965f393b8c0c260df3ee18e5e62be482e7322186b86ba",
    "EndpointID": "e75e3c0b5836cc4532127e712e60f157d35167f82c5a2befd608c48622fd90d2",
    "Gateway": "172.17.0.1",
    "IPAddress": "172.17.0.4",
    "IPPrefixLen": 16,
    "IPv6Gateway": "",
    "GlobalIPv6Address": "",
    "GlobalIPv6PrefixLen": 0,
    "MacAddress": "02:42:ac:11:00:04"
    }
    }
    }
    }
    ]

    nginx 配置信息:
    user nginx;
    worker_processes auto;

    #error_log /var/log/nginx/error.log notice;
    # error_log /etc/nginx/conf.d/error.log notice;

    pid /var/run/nginx.pid;


    events {
    worker_connections 1224;
    }


    http {
    include /etc/nginx/mime.types;
    default_type application/octet-stream;


    log_format main '$remote_addr - $remote_user [$time_local] "$request" '
    '$status $body_bytes_sent "$http_referer" '
    '"$http_user_agent" "$http_x_forwarded_for" $request_time $upstream_response_time';

    access_log /var/log/nginx/access.log main;
    error_log /var/log/nginx/error.log error;
    rewrite_log on;

    sendfile on;
    #tcp_nopush on;

    keepalive_timeout 65;

    #gzip on;

    include /etc/nginx/conf.d/*.conf;
    }
    fkdtz
        3
    fkdtz  
       272 天前   ❤️ 1
    你都说高峰期接口耗时日志明显增加,建议先查后端逻辑。
    有可能后端因为某种原因响应很慢,导致 Nginx worker 无法快速响应前端,直到把 worker 全都占满。
    如果是这种情况的话,问题出在后端,调高负载均衡配置不解决根本问题。
    CodeWind
        4
    CodeWind  
       272 天前
    @silencechengk 需要再看一下宿主机和 Nginx 容器内 ulimit 和最大打开文件数设置,如果这两个参数小于 worker_connections 值,调大 worker_connections 参数无意义。
    CodeWind
        5
    CodeWind  
       272 天前   ❤️ 1
    宿主机的 ulimit 参数建议要大于容器,小于就没有意义。另外你也打印了 upstream_response_time 日志,可以根据日志筛选接口的响应时间,对比一下其他时间段。如果发现高峰期响应时间明显上升,那就是后端代码的问题,如果发现变化不大,那就是 nginx 的问题。
    BiChengfei
        6
    BiChengfei  
       271 天前   ❤️ 1
    worker_connections are not enough 没遇到过,也没设置过
    从“流量早高峰时,接口耗时增多,前端请求 pending”分析
    1. 频繁创建销毁线程,导致 GC 频繁,一般是 Young GC ,导致耗时变长,可以监控下 GC 频率
    2. 有线程共享资源竞争情况,导致线程等待,导致耗时变长
    这两点都是从减少接口耗时,及时释放资源,来解决连接数不足的问题
    关于   ·   帮助文档   ·   博客   ·   API   ·   FAQ   ·   实用小工具   ·   5504 人在线   最高记录 6679   ·     Select Language
    创意工作者们的社区
    World is powered by solitude
    VERSION: 3.9.8.5 · 23ms · UTC 03:35 · PVG 11:35 · LAX 19:35 · JFK 22:35
    Developed with CodeLauncher
    ♥ Do have faith in what you're doing.