anduin revised this gist . Go to revision
2 files changed, 210 insertions, 13 deletions
Fix.md renamed to FixBatch.md
@@ -1,4 +1,4 @@ | |||
1 | - | 1. 准备工作区:立刻打开两个DMS两个OSP | |
1 | + | 1. 准备工作区:立刻打开两个DMS,两个OSP和一个Kusto Explorer。 | |
2 | 2 | ||
3 | 3 | 2. 识别:识别有故障的范围,是版本还是Ring。在OSP检查此Ring趋势图。检查Substrate版本历史,确认其版本类型(Dogfood、Daily)。 | |
4 | 4 | ||
@@ -8,6 +8,8 @@ | |||
8 | 8 | ||
9 | 9 | 3. 检查Override: | |
10 | 10 | ||
11 | + | **不要**跳过这一步!很多问题都是由于Override引起的。或许你完全可以发现已经有人在Override这个问题了。 | |
12 | + | ||
11 | 13 | 在OSP Overrides 页面搜索: | |
12 | 14 | ||
13 | 15 | * 这个版本本身的信息 | |
@@ -19,7 +21,14 @@ | |||
19 | 21 | * 如果有相关Override,找到正确责任组。结束 | |
20 | 22 | * 如果没有相关Override,转到步骤4 | |
21 | 23 | ||
22 | - | 4. 进一步识别+验证:在DMS里使用gcam命令检查这个版本是否真的没有被部署。 | |
24 | + | **特别注意**:如果调查的环境是 Gallatin 或 Itar,**不要**相信 OSP Overrides 页面的信息。这两个环境的 OSP Overrides 页面是不准确的!要手工在目标环境运行下面的命令来检查Override: | |
25 | + | ||
26 | + | ```powershell | |
27 | + | Set-MyServiceInstance -ServiceInstanceName Multitenant | |
28 | + | Get-DeploymentOverrideNew -ShowDeleted $true | Sort-Object Id -Descending | Select-Object -First 100 | Format-Table -Property Id, DeploymentOverrideType, CreatedBy, Deleted, Data1 | |
29 | + | ``` | |
30 | + | ||
31 | + | 4. 进一步识别+验证:在DMS里使用gcam命令检查这个版本是否真的没有被部署。 | |
23 | 32 | ||
24 | 33 | ```powershell | |
25 | 34 | $machines = gcam -ShowAll -Filter "DeployRing -eq 'MSIT'" # 取出要调查的整个范围。 | |
@@ -55,27 +64,19 @@ APSFailedWorkitemEvent_Global | |||
55 | 64 | | where requestorMeta has 'BE' | |
56 | 65 | | where requestorMeta has "DotBuildUpgrade" | |
57 | 66 | | where failureMessage !has "forcing ReimageMode to WinPE" | |
58 | - | | where failureMessage !has "forcing ReimageMode to WinPE" | |
59 | 67 | | project failureTagId, resourceName, targetResourceUnit, targetIntention, deployRing, | |
60 | 68 | failureWorkflowName, workflowId, workflowStartTime, failureTagWords, | |
61 | 69 | failureMessage, workflowEndTime, sku | |
62 | 70 | | summarize Count = count(), | |
63 | 71 | ExampleErrorMessage = take_any(failureMessage), | |
64 | - | ExampleWorkflowId = take_any(workflowId) | |
72 | + | ExampleWorkflowId = take_any(workflowId), | |
73 | + | ExampleFailureTagId = take_any(failureTagId) | |
65 | 74 | by targetIntention, failureWorkflowName | |
66 | 75 | | where Count > 3 | |
67 | 76 | | order by targetIntention asc, Count desc | |
68 | 77 | ``` | |
69 | 78 | ||
70 | - | 如果需要针对一台机器,查查它历史上都是如何被Deploy的, 使用下面的Kusto: | |
71 | - | ||
72 | - | ```kusto | |
73 | - | DeploymentCogsEvent_Global | |
74 | - | | where machineName == '$machine' | |
75 | - | | where timestamp >= ago(3d) | |
76 | - | | sort by timestamp desc | |
77 | - | | project timestamp, machineName, actionType, requestor, provisioningState, deployMode, activityId, forest, actionStatus, deployVersion, SourceVersion, workflowId | |
78 | - | ``` | |
79 | + | 上面的查询会输出一些机器示例。请参考第二章以进一步诊断这些机器。 | |
79 | 80 | ||
80 | 81 | 8. 找到错误的信息,检查日志,找到正确的责任人。 | |
81 | 82 |
FixMachine.md(file created)
@@ -0,0 +1,196 @@ | |||
1 | + | ||
2 | + | ## 第二章 - 诊断少量机器无法部署的问题 | |
3 | + | ||
4 | + | 1. 运行下面的查询来查看机器的基本信息。 | |
5 | + | ||
6 | + | ```powershell | |
7 | + | $machineObject = Get-CentralAdminMachine $machine | |
8 | + | $machineObject | Format-List Name, *Definition, Dag, Nag, DesiredDefinition, DeployRing, AutopilotMode, *Version*, ProvisioningState, ActivityState, Forest, When*, City, ServiceInstanceType | |
9 | + | ``` | |
10 | + | ||
11 | + | 这些基本信息帮助快速了解机器的当前状态。 | |
12 | + | ||
13 | + | 我们也可能还需要知道机器所在单位的上下文信息,可以运行下面的命令: | |
14 | + | ||
15 | + | ```powershell | |
16 | + | if (-not [string]::IsNullOrEmpty($machineObject.Dag)) { | |
17 | + | Write-Host "Downloading DAG context..." -ForegroundColor Yellow; | |
18 | + | Write-Host "You are $machine and you are viewing DAG: $($machineObject.Dag)"; | |
19 | + | Get-CentralAdminMachine -Filter "Dag -eq '$($machineObject.Dag)'" -ShowAll | Sort-Object ProvisioningState | Format-Table -AutoSize Name, ActivityState, ProvisioningState, *Definition, ActualVersion, Location, AutopilotMode | |
20 | + | } | |
21 | + | ||
22 | + | if (-not [string]::IsNullOrEmpty($machineObject.CapacityUnit)) { | |
23 | + | Write-Host "Downloading CapacityUnit context..." -ForegroundColor Yellow; | |
24 | + | Write-Host "You are $machine and you are viewing CapacityUnit: $($machineObject.CapacityUnit) ."; | |
25 | + | Get-CentralAdminMachine -Filter "CapacityUnit -eq '$($machineObject.CapacityUnit)'" -ShowAll | Sort-Object ProvisioningState | Format-Table -AutoSize Name, ActivityState, ProvisioningState, *Definition, ActualVersion, Location, AutopilotMode | |
26 | + | } | |
27 | + | ``` | |
28 | + | ||
29 | + | 2. 查看机器的历史部署记录,以了解机器的部署历史。 | |
30 | + | ||
31 | + | 如果需要针对一台机器,查查它历史上都是如何被Deploy的, 使用下面的Kusto: | |
32 | + | ||
33 | + | ```kusto | |
34 | + | DeploymentCogsEvent_Global | |
35 | + | | where machineName == '$machine' | |
36 | + | | where timestamp >= ago(3d) | |
37 | + | | sort by timestamp desc | |
38 | + | | project timestamp, machineName, actionType, requestor, provisioningState, deployMode, activityId, forest, actionStatus, deployVersion, SourceVersion, workflowId | |
39 | + | ``` | |
40 | + | ||
41 | + | 上面的 Kusto 可以在 DMS 里交叉验证: | |
42 | + | ||
43 | + | ```powershell | |
44 | + | Get-DeploymentAPSWorkitem -Machines $machine -ShowAll ` | |
45 | + | | Sort-Object WorkflowStartTime -Descending ` | |
46 | + | | Select-Object -First 20 ` | |
47 | + | | Format-Table -AutoSize Status, WorkflowId, TargetIntention, WorkflowStartTime | |
48 | + | ``` | |
49 | + | ||
50 | + | **一定要检查机器的部署历史**,从而判断:机器是真的部署失败了,还是根本没有尝试部署;错误是可以通过重试恢复的,还是根本性的;以及这台机器究竟应不应该部署。 | |
51 | + | ||
52 | + | 3. 了解一台机器为什么不部署或者部署了(可选): | |
53 | + | ||
54 | + | 在 Kusto 中运行下面的查询,以查看机器的部署期待性。这有助于诊断为什么机器迟迟没有部署。 | |
55 | + | ||
56 | + | ```kusto | |
57 | + | ApsEvaluatorTraceEvent_Global | |
58 | + | | where Message has "GV2PEPF0000385A" and env_time > ago(12h) | |
59 | + | | where Message has "Failed with rule:" | |
60 | + | | parse Message with * "Failed with rule:" FailRule:string "||" * | |
61 | + | | project env_time, FailRule, PolicyIdentifier, MessageId, Message | |
62 | + | | sort by env_time desc | |
63 | + | | limit 200 | |
64 | + | ``` | |
65 | + | ||
66 | + | 4. 了解机器为什么一定要部署一个版本(可选): | |
67 | + | ||
68 | + | 有的时候,机器可能会被强制部署一个意外版本(例如过老的版本)。在 Kusto 中运行下面的查询,以查看机器被强制部署的原因: | |
69 | + | ||
70 | + | ```kusto | |
71 | + | ApsPrioritizerTraceEvent_Global | |
72 | + | | where Message has "GV2PEPF0000385A" and env_time > ago(12h) | |
73 | + | | where Message has "" | |
74 | + | | where PrioritizerIdentifier startswith 'Sweeper:CapacityDeploymentSweeper' | |
75 | + | | sort by env_time desc | |
76 | + | | project PrioritizerIdentifier,Message,env_time | |
77 | + | | limit 200 | |
78 | + | ``` | |
79 | + | ||
80 | + | 5. 诊断机器的部署错误: | |
81 | + | ||
82 | + | 对于第二步的输出,我们可以看到 WorkflowId。我们可以使用这个 WorkflowId 来查看机器的部署错误。 | |
83 | + | ||
84 | + | ```powershell | |
85 | + | See-Workflow $workflowId | |
86 | + | ``` | |
87 | + | ||
88 | + | 一般到这里,我们已经可以知道机器为什么部署失败了。如果还不清楚,可以继续下面的步骤。 | |
89 | + | ||
90 | + | 6. 将部署的错误按原因分类: | |
91 | + | ||
92 | + | 有的时候,只检查一两条错误并不足以说明机器为什么部署失败。我们可以使用下面的 Kusto 查询来将错误按原因分类: | |
93 | + | ||
94 | + | ```kusto | |
95 | + | APSFailedWorkitemEvent_Global | |
96 | + | | where env_time > ago(200h) | |
97 | + | | where resourceName == "GV2PEPF0000385A" | |
98 | + | | where prioritizer contains "Sweeper:CapacityDeploymentSweeper" | |
99 | + | | where failureMessage !has "forcing ReimageMode to WinPE" | |
100 | + | | project failureTagId, resourceName, targetResourceUnit, targetIntention, deployRing, | |
101 | + | failureWorkflowName, workflowId, workflowStartTime, failureTagWords, | |
102 | + | failureMessage, workflowEndTime, sku | |
103 | + | | summarize Count = count(), | |
104 | + | ExampleErrorMessage = take_any(failureMessage), | |
105 | + | ExampleWorkflowId = take_any(workflowId), | |
106 | + | ExampleFailureTagId = take_any(failureTagId) | |
107 | + | by targetIntention, failureWorkflowName | |
108 | + | | where Count > 3 | |
109 | + | | order by targetIntention asc, Count desc | |
110 | + | ``` | |
111 | + | ||
112 | + | 这样,出现较多的错误就是我们需要关注的根本性错误。 | |
113 | + | ||
114 | + | 7. 诊断 ExchangeSetup 的错误: | |
115 | + | ||
116 | + | 如果机器是 ExchangeSetup 的错误,我们可以使用下面的 DMS 查询来诊断: | |
117 | + | ||
118 | + | ```powershell | |
119 | + | Get-MachineLog C_ExchangeSetup -Target "GV2PEPF0000385A" -Download | |
120 | + | ``` | |
121 | + | ||
122 | + | 8. 进行缓解措施: | |
123 | + | ||
124 | + | 机器的情况可能分为两类: | |
125 | + | ||
126 | + | * 机器是可以部署的,但是由于某些原因没有部署。这种情况下,我们可以手动触发机器的部署和修复手段。转到步骤 9 。 | |
127 | + | * 大范围的问题或版本本身的问题。这种情况下,即使这一台机器可以部署,它也会失败。转到步骤 10 。 | |
128 | + | ||
129 | + | 9. 手动触发机器的部署: | |
130 | + | ||
131 | + | 如果机器是可以部署的,我们可以手动触发机器的部署。有一系列命令可以针对一台机器进行修复: | |
132 | + | ||
133 | + | ### 文件操作类 | |
134 | + | ||
135 | + | 1. **列出目标机器上的文件夹内容** | |
136 | + | 命令:`Get-ChildItem.ps1 -Target "GV2PEPF0000385A" -Path "C:\"` | |
137 | + | ||
138 | + | 2. **下载目标机器上的文件** | |
139 | + | 命令:`Get-TorusFile -Path "C:\program files\microsoft\exchange server\v15\config\AntiMalware.settings.ini" -Machine "GV2PEPF0000385A"` | |
140 | + | ||
141 | + | 3. **查看目标机器上文件的属性** | |
142 | + | 命令:`Get-ItemProperty.ps1 -Path "C:\program files\microsoft\exchange server\v15\config\AntiMalware.settings.ini" -Target "GV2PEPF0000385A" | Format-List` | |
143 | + | ||
144 | + | 4. **下载Exchange安装日志** | |
145 | + | 命令:`Get-MachineLog C_ExchangeSetup -Target "GV2PEPF0000385A" -Download` | |
146 | + | ||
147 | + | ### 状态诊断类 | |
148 | + | ||
149 | + | 1. **检查机器性能** | |
150 | + | 命令:`Measure-Performance -Machine GV2PEPF0000385A` | |
151 | + | ||
152 | + | 2. **检查机器磁盘使用情况** | |
153 | + | 命令:`Get-WmiObject.ps1 -Target GV2PEPF0000385A -Class Win32_LogicalDisk -NoFormatting | Foreach-Object { 'Disk {0}({4}) has free space: {1:0.0}GB/{2:0.0}GB ({3:0.0}%)\n' -f $_.Caption, ($_.FreeSpace / 1024MB), ($_.Size / 1024MB), ($_.FreeSpace / $_.Size * 100), $_.VolumeName }` | |
154 | + | ||
155 | + | 3. **测试网络连接** | |
156 | + | 命令:`Test-MachineNetworkConnectivity.ps1 -TargetMachine GV2PEPF0000385A` | |
157 | + | ||
158 | + | 4. **诊断WinPE启动失败问题** | |
159 | + | 命令:`Invoke-WinPEFailureDiagnose.ps1 -TargetMachine "GV2PEPF0000385A"` | |
160 | + | ||
161 | + | 5. **检查机器组件信息** | |
162 | + | 命令:`Get-MachineComponentV2 -Filter "MachineName -eq 'GV2PEPF0000385A'" | Format-Table -AutoSize` | |
163 | + | ||
164 | + | 6. **检查机器服务信息** | |
165 | + | 命令:`Get-ServiceInfo.ps1.dms -RemoteFQDNs GV2PEPF0000385A` | |
166 | + | ||
167 | + | 7. **查看机器部署能力** | |
168 | + | 命令:`See-MachineDeployAbility -MachineName GV2PEPF0000385A` | |
169 | + | ||
170 | + | ### 缓解修复类 | |
171 | + | ||
172 | + | 1. **进入维护模式** | |
173 | + | 命令:`Request-MachineBeginMaintenance_V2.ps1 -TargetMachine "GV2PEPF0000385A" -Reason "Bring the machine to maintenance to reimage it."` | |
174 | + | ||
175 | + | 2. **重启目标机器** | |
176 | + | 命令:`Request-SetMachinePowerStateV2.ps1 -TargetMachine "GV2PEPF0000385A" -DesiredState "Restart" -Reason "Machine stuck."` | |
177 | + | ||
178 | + | 3. **清理机器缓存** | |
179 | + | 命令:`Invoke-ComponentReplicationWorkflow.ps1 -TargetMachine GV2PEPF0000385A -ComponentIds 'WipeCache'` | |
180 | + | ||
181 | + | 4. **优先分配备用机器** | |
182 | + | 命令:`New-MachineDeploymentControl.ps1 -MachineName "GV2PEPF0000385A" -MachineDeploymentControlType PrioritizeSpareAssignment -Justification "GV2PEPF0000385A has a bad disk, we need to replace it"` | |
183 | + | ||
184 | + | 5. **优先修复目标机器** | |
185 | + | 命令:`Request-RepairByDeployment.ps1 -MachineName GV2PEPF0000385A -ReimageMode "<ForceNextOS|ForceWinPE|ForceFlatten>" -Justification "Try to repair this machine."` | |
186 | + | ||
187 | + | 6. **立即尝试修复机器** | |
188 | + | 命令:`Invoke-RepairOnDemandWorkflow.ps1 -TargetMachine GV2PEPF0000385A` | |
189 | + | ||
190 | + | 使用合理的指令,触发机器部署即可。 | |
191 | + | ||
192 | + | > 结束 | |
193 | + | ||
194 | + | 10. 大范围问题或版本本身的问题: | |
195 | + | ||
196 | + | 参考第一章的诊断步骤,诊断大范围问题或版本本身的问题。 |
anduin revised this gist . Go to revision
1 file changed, 11 insertions, 6 deletions
Fix.md
@@ -50,16 +50,21 @@ $machines | Where-Object { $_.DesiredMachineDefinition -eq 'BE' } | Group-Object | |||
50 | 50 | APSFailedWorkitemEvent_Global | |
51 | 51 | | where env_time > ago(10h) | |
52 | 52 | | where prioritizer contains "Sweeper:CapacityDeploymentSweeper" | |
53 | - | //| where targetIntention contains "15.20.8032" | |
53 | + | | where targetIntention contains "15.20.8032" | |
54 | 54 | | where requestorMeta has 'TDF' | |
55 | 55 | | where requestorMeta has 'BE' | |
56 | 56 | | where requestorMeta has "DotBuildUpgrade" | |
57 | 57 | | where failureMessage !has "forcing ReimageMode to WinPE" | |
58 | - | | project failureTagId,resourceName,targetResourceUnit,targetIntention,deployRing, | |
59 | - | failureWorkflowName,workflowId,workflowStartTime,failureTagWords,failureMessage,workflowEndTime,sku | |
60 | - | | summarize Count=count(), ExampleErrorMessage=take_any(failureMessage), ExampleWorkflowId=take_any(workflowId) by failureWorkflowName | |
61 | - | | sort by Count | |
62 | - | | take 10 | |
58 | + | | where failureMessage !has "forcing ReimageMode to WinPE" | |
59 | + | | project failureTagId, resourceName, targetResourceUnit, targetIntention, deployRing, | |
60 | + | failureWorkflowName, workflowId, workflowStartTime, failureTagWords, | |
61 | + | failureMessage, workflowEndTime, sku | |
62 | + | | summarize Count = count(), | |
63 | + | ExampleErrorMessage = take_any(failureMessage), | |
64 | + | ExampleWorkflowId = take_any(workflowId) | |
65 | + | by targetIntention, failureWorkflowName | |
66 | + | | where Count > 3 | |
67 | + | | order by targetIntention asc, Count desc | |
63 | 68 | ``` | |
64 | 69 | ||
65 | 70 | 如果需要针对一台机器,查查它历史上都是如何被Deploy的, 使用下面的Kusto: |
anduin revised this gist . Go to revision
1 file changed, 11 insertions, 1 deletion
Fix.md
@@ -44,7 +44,7 @@ $machines | Where-Object { $_.DesiredMachineDefinition -eq 'BE' } | Group-Object | |||
44 | 44 | ||
45 | 45 | 7. 使用Dots App交叉验证6。找到除了无法BeginMM以外最多的错误。 | |
46 | 46 | ||
47 | - | 也可以使用下面的 Kusto: | |
47 | + | 如果需要查看一个Ring一个Role一个版本的升级发生的所有错误,按出错数量排序,使用下面的Kusto: | |
48 | 48 | ||
49 | 49 | ```kusto | |
50 | 50 | APSFailedWorkitemEvent_Global | |
@@ -62,6 +62,16 @@ failureWorkflowName,workflowId,workflowStartTime,failureTagWords,failureMessage, | |||
62 | 62 | | take 10 | |
63 | 63 | ``` | |
64 | 64 | ||
65 | + | 如果需要针对一台机器,查查它历史上都是如何被Deploy的, 使用下面的Kusto: | |
66 | + | ||
67 | + | ```kusto | |
68 | + | DeploymentCogsEvent_Global | |
69 | + | | where machineName == '$machine' | |
70 | + | | where timestamp >= ago(3d) | |
71 | + | | sort by timestamp desc | |
72 | + | | project timestamp, machineName, actionType, requestor, provisioningState, deployMode, activityId, forest, actionStatus, deployVersion, SourceVersion, workflowId | |
73 | + | ``` | |
74 | + | ||
65 | 75 | 8. 找到错误的信息,检查日志,找到正确的责任人。 | |
66 | 76 | ||
67 | 77 | (注意:CAMgt3 的机器天生不跑 FastTrain,只装 Dogfood,就不再管了,而且只有Reimage能成功) |
anduin revised this gist . Go to revision
No changes
anduin revised this gist . Go to revision
1 file changed, 2 insertions, 1 deletion
Fix.md
@@ -69,7 +69,8 @@ failureWorkflowName,workflowId,workflowStartTime,failureTagWords,failureMessage, | |||
69 | 69 | >结束 | |
70 | 70 | ||
71 | 71 | 9. 如果整个Ring都有这个问题,则在Email中搜索 Ringbot \ Stagebot 相关的告警,检查是否 Ringbot \ Stagebot 工作异常:(TODO:这里要讨论并非整个Ring有问题的情况) | |
72 | - | ||
72 | + | ||
73 | + | > gwrc (Get-OrchestrationScopeWorkflow (Get-LatestOrchestration StageBOT_ValidateWorkflow_Substrate_V2).QualifiedId).QualifiedWorkflowId | |
73 | 74 | ||
74 | 75 | * 如果有,检查Ringbot的异常:Get-ScheduledWorkflow ringbot* | ft name, lastoperationid, isenabled 或 Invoke-RingbotValidateProdRings.ps1 | |
75 | 76 | * 如果没有,继续 |
anduin revised this gist . Go to revision
1 file changed, 1 insertion
Fix.md
@@ -54,6 +54,7 @@ APSFailedWorkitemEvent_Global | |||
54 | 54 | | where requestorMeta has 'TDF' | |
55 | 55 | | where requestorMeta has 'BE' | |
56 | 56 | | where requestorMeta has "DotBuildUpgrade" | |
57 | + | | where failureMessage !has "forcing ReimageMode to WinPE" | |
57 | 58 | | project failureTagId,resourceName,targetResourceUnit,targetIntention,deployRing, | |
58 | 59 | failureWorkflowName,workflowId,workflowStartTime,failureTagWords,failureMessage,workflowEndTime,sku | |
59 | 60 | | summarize Count=count(), ExampleErrorMessage=take_any(failureMessage), ExampleWorkflowId=take_any(workflowId) by failureWorkflowName |
anduin revised this gist . Go to revision
1 file changed, 1 insertion, 1 deletion
Fix.md
@@ -56,7 +56,7 @@ APSFailedWorkitemEvent_Global | |||
56 | 56 | | where requestorMeta has "DotBuildUpgrade" | |
57 | 57 | | project failureTagId,resourceName,targetResourceUnit,targetIntention,deployRing, | |
58 | 58 | failureWorkflowName,workflowId,workflowStartTime,failureTagWords,failureMessage,workflowEndTime,sku | |
59 | - | | summarize Count=count() by failureWorkflowName | |
59 | + | | summarize Count=count(), ExampleErrorMessage=take_any(failureMessage), ExampleWorkflowId=take_any(workflowId) by failureWorkflowName | |
60 | 60 | | sort by Count | |
61 | 61 | | take 10 | |
62 | 62 | ``` |
anduin revised this gist . Go to revision
1 file changed, 17 insertions
Fix.md
@@ -44,6 +44,23 @@ $machines | Where-Object { $_.DesiredMachineDefinition -eq 'BE' } | Group-Object | |||
44 | 44 | ||
45 | 45 | 7. 使用Dots App交叉验证6。找到除了无法BeginMM以外最多的错误。 | |
46 | 46 | ||
47 | + | 也可以使用下面的 Kusto: | |
48 | + | ||
49 | + | ```kusto | |
50 | + | APSFailedWorkitemEvent_Global | |
51 | + | | where env_time > ago(10h) | |
52 | + | | where prioritizer contains "Sweeper:CapacityDeploymentSweeper" | |
53 | + | //| where targetIntention contains "15.20.8032" | |
54 | + | | where requestorMeta has 'TDF' | |
55 | + | | where requestorMeta has 'BE' | |
56 | + | | where requestorMeta has "DotBuildUpgrade" | |
57 | + | | project failureTagId,resourceName,targetResourceUnit,targetIntention,deployRing, | |
58 | + | failureWorkflowName,workflowId,workflowStartTime,failureTagWords,failureMessage,workflowEndTime,sku | |
59 | + | | summarize Count=count() by failureWorkflowName | |
60 | + | | sort by Count | |
61 | + | | take 10 | |
62 | + | ``` | |
63 | + | ||
47 | 64 | 8. 找到错误的信息,检查日志,找到正确的责任人。 | |
48 | 65 | ||
49 | 66 | (注意:CAMgt3 的机器天生不跑 FastTrain,只装 Dogfood,就不再管了,而且只有Reimage能成功) |
anduin revised this gist . Go to revision
1 file changed, 1 insertion, 1 deletion
Fix.md
@@ -114,7 +114,7 @@ New-DeploymentConfigWorkItem ` | |||
114 | 114 | -DeployRing MSIT ` | |
115 | 115 | -ApprovedVersion 15.20.7091.005 ` | |
116 | 116 | -ServerRole BE ` | |
117 | - | -HandlerType DeploymentEngine ` | |
117 | + | -HandlerType <-EACH-> ` | |
118 | 118 | -DeploymentTrainType RegularTrain ` | |
119 | 119 | -HandlerStatus Pending ` | |
120 | 120 | -UserComments "Rerun PrepareAD with new build" ` |
anduin revised this gist . Go to revision
1 file changed, 12 insertions, 1 deletion
Fix.md
@@ -108,7 +108,18 @@ Get-DeploymentConfigPrerequisiteVersion -EntityName BE -ApprovedVersion 15.20.74 | |||
108 | 108 | ``` | |
109 | 109 | ||
110 | 110 | For: DeploymentEngine, SeedComponents, Saturation, BuildComponentList, PrepareAD, SeedPavcComponents, WorkflowPairing | |
111 | - | New-DeploymentConfigWorkItem -DeployRing MSIT -ApprovedVersion 15.20.7091.005 -ServerRole BE -HandlerType DeploymentEngine -DeploymentTrainType RegularTrain | |
111 | + | ||
112 | + | ```powershell | |
113 | + | New-DeploymentConfigWorkItem ` | |
114 | + | -DeployRing MSIT ` | |
115 | + | -ApprovedVersion 15.20.7091.005 ` | |
116 | + | -ServerRole BE ` | |
117 | + | -HandlerType DeploymentEngine ` | |
118 | + | -DeploymentTrainType RegularTrain ` | |
119 | + | -HandlerStatus Pending ` | |
120 | + | -UserComments "Rerun PrepareAD with new build" ` | |
121 | + | -CreatedBy xuef | |
122 | + | ``` | |
112 | 123 | ||
113 | 124 | * 然后等待大约20分钟,重新执行此检查单 | |
114 | 125 |