Skip to content

APM 服务发现

0x01 关键信息

a. 适用场景

APM 服务节点发现、端点发现、指标发现的调试和手动触发。

0x02 代码片段

a. 服务节点发现

python
import datetime
from apm.resources import QuerySpanListResource
from apm.core.discover.node import NodeDiscover
from apm.core.discover.endpoint import EndpointDiscover

bk_biz_id = 5016863
app_name = "arashi"

end_time = int(datetime.datetime.now().timestamp())
begin_time = end_time - int(datetime.timedelta(minutes=60).total_seconds())

query_params = {
    "bk_biz_id": bk_biz_id, 
    "app_name": app_name, 
    "start_time": begin_time,
    "end_time": end_time,
    "filters": [{"key": "resource.service.name", "operator": "equal", "value": ["dscenter"]}],
    "exclude_field": ["events"],
    "limit": 10000,
    "es_dsl": {}
}
spans = QuerySpanListResource().request(query_params)["data"]

NodeDiscover(bk_biz_id, app_name).discover(spans)
EndpointDiscover(bk_biz_id, app_name).discover(spans)

b. 批量服务发现

python
from apm_web.handlers import service_handler

# 过滤出所有未发现 SDK 的服务
service_names = [
    service["topo_key"]
    for service in service_handler.ServiceHandler.list_nodes(100842, "uc-backend") 
    if service["extra_data"]["category"] not in ["db", "messaging"] and service["sdk"] is None
]

for service_name in service_names:
    query_params = {
        "bk_biz_id": bk_biz_id, 
        "app_name": app_name, 
        "start_time": begin_time,
        "end_time": end_time,
        "filters": [{"key": "resource.service.name", "operator": "equal", "value": [service_name]}],
        "exclude_field": ["events"],
        "limit": 10000,
        "es_dsl": {}
    }
    spans = QuerySpanListResource().request(query_params)["data"]
    NodeDiscover(bk_biz_id, app_name).discover(spans)

c. 指标发现

python
from datetime import datetime, timedelta
from apm.models import MetricDataSource
from apm.core.discover.metric.service import ServiceDiscover

bk_biz_id = 19062
app_name = "esp_pubgm_test"

datasource = MetricDataSource.objects.get(bk_biz_id=bk_biz_id, app_name=app_name)

end_time = int(datetime.now().timestamp())
start_time = end_time - int(timedelta(hours=1).total_seconds())
ServiceDiscover(datasource).discover(start_time, end_time)

d. 查找关联关系

python
from apm_web.topo.handle.relation.define import SourceK8sPod, SourceService
from apm_web.topo.handle.relation.query import RelationQ

bk_biz_id = 2
app_name = "trpc-cluster-access-demo"
service_name = "bkm.web"
start_time = 1732607026
end_time = 1732610626

result = RelationQ.query(
    RelationQ.generate_q(
        bk_biz_id=bk_biz_id,
        source_info=SourceService(
            apm_application_name=app_name,
            apm_service_name=service_name,
        ),
        target_type=SourceK8sPod,
        start_time=start_time,
        end_time=end_time,
    )
)

e. 按服务统计接口数量分布

python
from collections import Counter
from apm.models import Endpoint

bk_biz_id = 19062
app_name = "esp_pubgm_prod"

counter = Counter(
    Endpoint.objects.filter(bk_biz_id=bk_biz_id, app_name=app_name)
    .values_list("service_name", flat=True)
)
for service_name, count in counter.most_common():
    print(f"{service_name}: {count}")

f. 删除高基数接口

python
from django.db.models import Q, Count
from apm.models import Endpoint

bk_biz_id = 19062
app_name = "esp_pubgm_prod"

targets = Endpoint.objects.filter(
    bk_biz_id=bk_biz_id, app_name=app_name,
).filter(
    Q(endpoint_name__startswith="/checkLogin.php?")
    | Q(endpoint_name__startswith="/v4/openim/batchsendmsg?")
    | Q(endpoint_name__contains="sCloudApiName=")
    | Q(endpoint_name__startswith="/v2/profile/userinfo?")
    | Q(endpoint_name__startswith="/v2/profile/openid2uid?")
    | Q(endpoint_name__startswith="/v2/auth/verify_login?")
    | Q(endpoint_name__startswith="/audit2023/_doc/")
    | Q(endpoint_name__startswith="/matchauth542/_doc/")
    | Q(endpoint_name__startswith="/orgscores542/_doc/")
)

print(f"will delete {targets.count()} endpoints")
for row in targets.values("service_name").annotate(cnt=Count("id")).order_by("-cnt"):
    print(f"  {row['service_name']}: {row['cnt']}")

# targets.delete()

g. 手动创建服务节点

python
from apm.models import TopoNode

servers = ["TestApp.HelloGo"]

bk_biz_id = 5000140
app_name = "bcs_k8s_25973_default"

for server in servers:
    TopoNode.objects.update_or_create(
        bk_biz_id=bk_biz_id,
        app_name=app_name,
        topo_key=server,
        defaults={
            "extra_data": {
                "category": "rpc", 
                "kind": "service", 
                "predicate_value": "", 
                "service_language": "go"
            }
        }
    )

h. 批量克隆服务节点

python
from apm.models import TopoNode
import copy

base = {
    "bk_biz_id": "<bk_biz_id>",
    "app_name": "<app_name>",
    "extra_data": {
        "category": "rpc",
        "kind": "service",
        "predicate_value": None,
        "service_language": "go",
    },
    "system": [{"name": "trpc", "extra_data": {"attributes.trpc.namespace": "Production"}}],
    "platform": {},
    "sdk": [
        {
            "name": "galileo",
            "extra_data": {
                "resource.target": None,
                "resource.telemetry.sdk.name": "galileo",
            },
        }
    ],
    "source": ["trace", "metric"],
    "is_permanent": True,
}

ids = [10158, 10129, 60015, 60016, 10188, 10213, 60017, 10222, 10212]

for _id in ids:
    topo_key = f"activity-microservices.activities-{_id}"
    if TopoNode.objects.filter(bk_biz_id=base["bk_biz_id"], app_name=base["app_name"], topo_key=topo_key).exists():
        print(f"SKIP {topo_key}")
        continue

    data = copy.deepcopy(base)
    data["topo_key"] = topo_key
    data["sdk"][0]["extra_data"]["resource.target"] = f"BCS.activity-microservices.activities-{_id}"
    TopoNode.objects.create(**data)
    print(f"OK   {topo_key}")