Skip to content

APM 策略

0x01 关键信息

a. 适用场景

  • 为 tRPC/Tars 应用批量下发告警策略
  • APM 策略模板的自动下发、手动注册、重置清理

0x02 tRPC 策略应用

a. 策略应用命令

shell
# 基础用法
python manage.py apply_rpc_strategies -b 2 -a "sandtrpc" -t "caller" "callee" -g 1 --caller-extra-group-by "callee_method" --callee-extra-group-by "callee_method"

# 带配置的用法
python manage.py apply_rpc_strategies -b 60 -a "trpc-oteam-sdk-access-demo" --config '{"caller": {"group_by": ["callee_method"]}, "callee": {"group_by": ["callee_method"]}}'

# 带过滤条件
python manage.py apply_rpc_strategies -b 2 -a "trpc-cluster-access-demo" -t "callee" "caller" "resource" "panic" --config '{"caller": {"group_by": ["callee_method"], "filter_dict": {"namespace__eq": "Development"}}, "callee": {"group_by": ["callee_method"], "filter_dict": {"namespace__eq": "Development"}}}'

# 指定服务
python manage.py apply_rpc_strategies -b 2 -a "trpc-cluster-access-demo" -g 997 -t "callee" "caller" "resource" -s "bkm.web" "bkm.product" --config '{"caller": {"group_by": ["callee_method"], "filter_dict": {"namespace__eq": "Development"}}, "callee": {"group_by": ["callee_method"], "filter_dict": {"namespace__eq": "Development"}}}'

b. 上云环境示例

shell
# 天天象棋
python manage.py apply_rpc_strategies -b 640 -a "qqchess" -g 83485 -t "callee" "caller" -s "qqchess.online_ai_svr" --config '{"caller": {"group_by": ["callee_method"], "filter_dict": {"namespace__eq": "Production"}}, "callee": {"group_by": ["callee_method"], "filter_dict": {"namespace__eq": "Production"}}}'

# Taf 服务批量
python manage.py apply_rpc_strategies -b 640 -a "qqchess" -g 84269 -t "callee" "caller" -s "QQChess.ActivitySvr" "QQChess.DailyRankServer" "QQChess.EventRouterServer" --config '{"caller": {"group_by": ["callee_method"]}, "callee": {"group_by": ["callee_method"]}}'

# 英雄杀小游戏
python manage.py apply_rpc_strategies -b 100380 -a "formal_thkgame_apm" -g 83494 -t "callee" "caller" --config '{"caller": {"group_by": ["callee_method"]}, "callee": {"group_by": ["callee_method"]}}'

# 和平精英周边生态
python manage.py apply_rpc_strategies -b -4228598 -a "hpjy-microservices-activities-production" -g 83239 83654 82538 -t "callee" "caller" --config '{"caller": {"group_by": ["callee_method"]}, "callee": {"group_by": ["callee_method"]}}'

c. Python 代码调用

python
from apm_web.handlers.metric_group import MetricHelper
from apm_web.handlers.strategy_group import (
    BaseStrategyGroup,
    GroupEnum,
    RPCApplyType,
    StrategyGroupRegistry,
)

bk_biz_id = 2
app_name = "trpc-cluster-access-demo"
apply_types = ["callee", "caller", "resource"]
notice_group_ids = [997]

metric_helper = MetricHelper(bk_biz_id, app_name)
group: BaseStrategyGroup = StrategyGroupRegistry.get(
    GroupEnum.RPC,
    bk_biz_id,
    app_name,
    metric_helper=metric_helper,
    notice_group_ids=notice_group_ids,
    apply_types=apply_types,
)

d. 获取服务配置

python
from apm_web.handlers import metric_group

bk_biz_id = 640
app_name = "qqchess"
service_name = "qqchess.PersonifyLinuxAiServer"

group: metric_group.TrpcMetricGroup = metric_group.MetricGroupRegistry.get(
    metric_group.GroupEnum.TRPC, bk_biz_id, app_name
)
group.get_server_config(server=service_name)

0x03 策略模板管理

a. 模板自动下发

python
from apm_web.strategy.handler import StrategyTemplateHandler

bk_biz_id: int = 2
app_name: str = "trpc-cluster-access-demo"
StrategyTemplateHandler.handle_auto_apply(bk_biz_id, app_name)

from apm_web.handlers import service_handler
nodes = service_handler.ServiceHandler.list_nodes(bk_biz_id, app_name)

b. 模板注册

python
from apm_web.models import Application
from apm_web.strategy.builtin.registry import BuiltinStrategyTemplateRegistry

BuiltinStrategyTemplateRegistry(Application.objects.get(bk_biz_id=2, app_name="trpc-cluster-access-demo")).register()
BuiltinStrategyTemplateRegistry(Application.objects.get(bk_biz_id=11, app_name="sand_local_dev")).register()
BuiltinStrategyTemplateRegistry(Application.objects.get(bk_biz_id=2, app_name="bkop")).register()
BuiltinStrategyTemplateRegistry(Application.objects.get(bk_biz_id=7, app_name="bkmonitor_production")).register()
BuiltinStrategyTemplateRegistry(Application.objects.get(bk_biz_id=2, app_name="bcs_k8s_40735_defaul")).register()
BuiltinStrategyTemplateRegistry(Application.objects.get(bk_biz_id=19062, app_name="esp_pubgm_prod")).register()

c. 模板下发

python
from apm_web.models import StrategyTemplate, StrategyInstance
from apm_web.strategy.dispatch.core import StrategyDispatcher
from apm_web.strategy.dispatch.base import DispatchExtraConfig
from apm_web.strategy.query_template import QueryTemplateWrapperFactory

strategy_template = StrategyTemplate.objects.get(id=1)
qtw = QueryTemplateWrapperFactory.get_wrapper(
    strategy_template.query_template["bk_biz_id"], strategy_template.query_template["name"]
)

dispatcher = StrategyDispatcher(strategy_template, qtw)
dispatcher.dispatch(
    ["bkm.web"], 
    extra_configs=[
        DispatchExtraConfig(
            service_name="bkm.web", 
            context={"ALARM_THRESHOLD_VALUE": 10}
        )
    ]
)

d. 系统探测

python
from apm_web.strategy.dispatch.enricher import SystemChecker
from apm_web.strategy.dispatch.entity import EntitySet

bk_biz_id = -4228598
app_name = "hpjy_microservices_activities"

e = EntitySet(bk_biz_id, app_name)
SystemChecker(e).check_systems()

e. 重置清理

python
from django.db.models import Q
from core.drf_resource import resource
from apm_web.models import StrategyTemplate, StrategyInstance

def clean_strategy_template_data(bk_biz_id: int, app_name: str | None = None) -> None:
    """清理策略模板相关数据"""
    q = Q(bk_biz_id=bk_biz_id)
    if app_name:
        q &= Q(app_name=app_name)

    strategy_instance_qs = StrategyInstance.objects.filter(q)
    resource.strategies.delete_strategy_v2({
        "bk_biz_id": bk_biz_id,
        "ids": list(strategy_instance_qs.values_list("strategy_id", flat=True))
    })
    strategy_instance_qs.delete()
    StrategyTemplate.origin_objects.filter(q).delete()
    
clean_strategy_template_data(2, "bkop")
clean_strategy_template_data(11, "sand_local_dev")
clean_strategy_template_data(2, "trpc-cluster-access-demo")

f. 找出所有 tRPC 服务

python
from apm_web.handlers import service_handler

bk_biz_id: int = 640
app_name: str = "qqchess"

for node in service_handler.ServiceHandler.list_nodes(bk_biz_id, app_name):
    try:
        if node["system"][0]["extra_data"]["rpc_system"] == "tars":
            continue
    except Exception:
        pass
    print('"' + node["topo_key"] + '",')