Operator: aten._log_softmax.default
cnt: 1, ((T([64, 1000], f16), 1, False), {})
Operator: aten._log_softmax_backward_data.default
cnt: 1, ((T([64, 1000], f16), T([64, 1000], f16), 1, f16), {})
Operator: aten._softmax.default
cnt: 2, ((T([4096, 4, 49, 49], f16), -1, False), {})
cnt: 2, ((T([1024, 8, 49, 49], f16), -1, False), {})
cnt: 18, ((T([256, 16, 49, 49], f16), -1, False), {})
cnt: 2, ((T([64, 32, 49, 49], f16), -1, False), {})
Operator: aten._softmax_backward_data.default
cnt: 2, ((T([64, 32, 49, 49], f16), T([64, 32, 49, 49], f16), -1, f16), {})
cnt: 18, ((T([256, 16, 49, 49], f16), T([256, 16, 49, 49], f16), -1, f16), {})
cnt: 2, ((T([1024, 8, 49, 49], f16), T([1024, 8, 49, 49], f16), -1, f16), {})
cnt: 2, ((T([4096, 4, 49, 49], f16), T([4096, 4, 49, 49], f16), -1, f16), {})
Operator: aten._unsafe_view.default
cnt: 6, ((T([4096, 4, 49, 32], f16), [16384, 49, 32]), {})
cnt: 2, ((T([4096, 4, 32, 49], f16), [16384, 32, 49]), {})
cnt: 2, ((T([16384, 49, 49], f16), [4096, 4, 49, 49]), {})
cnt: 2, ((T([16384, 49, 32], f16), [4096, 4, 49, 32]), {})
cnt: 2, ((T([4096, 49, 4, 32], f16), [4096, 49, 128]), {})
cnt: 1, ((T([50176, 256], f16), [64, 784, 256]), {})
cnt: 6, ((T([1024, 8, 49, 32], f16), [8192, 49, 32]), {})
cnt: 2, ((T([1024, 8, 32, 49], f16), [8192, 32, 49]), {})
cnt: 2, ((T([8192, 49, 49], f16), [1024, 8, 49, 49]), {})
cnt: 2, ((T([8192, 49, 32], f16), [1024, 8, 49, 32]), {})
cnt: 2, ((T([1024, 49, 8, 32], f16), [1024, 49, 256]), {})
cnt: 1, ((T([12544, 512], f16), [64, 196, 512]), {})
cnt: 54, ((T([256, 16, 49, 32], f16), [4096, 49, 32]), {})
cnt: 18, ((T([256, 16, 32, 49], f16), [4096, 32, 49]), {})
cnt: 18, ((T([4096, 49, 49], f16), [256, 16, 49, 49]), {})
cnt: 18, ((T([4096, 49, 32], f16), [256, 16, 49, 32]), {})
cnt: 18, ((T([256, 49, 16, 32], f16), [256, 49, 512]), {})
cnt: 1, ((T([3136, 1024], f16), [64, 49, 1024]), {})
cnt: 6, ((T([64, 32, 49, 32], f16), [2048, 49, 32]), {})
cnt: 2, ((T([64, 32, 32, 49], f16), [2048, 32, 49]), {})
cnt: 2, ((T([2048, 49, 49], f16), [64, 32, 49, 49]), {})
cnt: 2, ((T([2048, 49, 32], f16), [64, 32, 49, 32]), {})
cnt: 2, ((T([64, 49, 32, 32], f16), [64, 49, 1024]), {})
cnt: 2, ((T([64, 49, 3, 32, 32], f16), [64, 49, 3072]), {})
cnt: 18, ((T([64, 2, 2, 7, 7, 512], f16), [256, 7, 7, 512]), {})
cnt: 18, ((T([256, 49, 3, 16, 32], f16), [256, 49, 1536]), {})
cnt: 18, ((T([64, 2, 7, 2, 7, 512], f16), [64, 14, 14, 512]), {})
cnt: 2, ((T([64, 4, 4, 7, 7, 256], f16), [1024, 7, 7, 256]), {})
cnt: 2, ((T([1024, 49, 3, 8, 32], f16), [1024, 49, 768]), {})
cnt: 2, ((T([64, 4, 7, 4, 7, 256], f16), [64, 28, 28, 256]), {})
cnt: 2, ((T([64, 8, 8, 7, 7, 128], f16), [4096, 7, 7, 128]), {})
cnt: 2, ((T([4096, 49, 3, 4, 32], f16), [4096, 49, 384]), {})
cnt: 2, ((T([64, 8, 7, 8, 7, 128], f16), [64, 56, 56, 128]), {})
Operator: aten.add.Tensor
cnt: 2, ((T([4096, 4, 49, 49], f16), T([1, 4, 49, 49], f16)), {})
cnt: 8, ((T([64, 3136, 128], f16), T([64, 3136, 128], f16)), {})
cnt: 1, ((T([64, 64, 4, 49, 49], f16), T([1, 64, 1, 49, 49], f16)), {})
cnt: 2, ((T([1024, 8, 49, 49], f16), T([1, 8, 49, 49], f16)), {})
cnt: 8, ((T([64, 784, 256], f16), T([64, 784, 256], f16)), {})
cnt: 1, ((T([64, 16, 8, 49, 49], f16), T([1, 16, 1, 49, 49], f16)), {})
cnt: 18, ((T([256, 16, 49, 49], f16), T([1, 16, 49, 49], f16)), {})
cnt: 72, ((T([64, 196, 512], f16), T([64, 196, 512], f16)), {})
cnt: 9, ((T([64, 4, 16, 49, 49], f16), T([1, 4, 1, 49, 49], f16)), {})
cnt: 2, ((T([64, 32, 49, 49], f16), T([1, 32, 49, 49], f16)), {})
cnt: 8, ((T([64, 49, 1024], f16), T([64, 49, 1024], f16)), {})
cnt: 3, ((T([64, 14, 14, 512], f16), T([64, 14, 14, 512], f16)), {})
cnt: 3, ((T([64, 28, 28, 256], f16), T([64, 28, 28, 256], f16)), {})
cnt: 3, ((T([64, 56, 56, 128], f16), T([64, 56, 56, 128], f16)), {})
Operator: aten.addmm.default
cnt: 2, ((T([384], f16), T([200704, 128], f16), T([128, 384], f16, stride=(1, 128))), {})
cnt: 2, ((T([128], f16), T([200704, 128], f16), T([128, 128], f16, stride=(1, 128))), {})
cnt: 2, ((T([512], f16), T([200704, 128], f16), T([128, 512], f16, stride=(1, 128))), {})
cnt: 2, ((T([128], f16), T([200704, 512], f16), T([512, 128], f16, stride=(1, 512))), {})
cnt: 2, ((T([768], f16), T([50176, 256], f16), T([256, 768], f16, stride=(1, 256))), {})
cnt: 2, ((T([256], f16), T([50176, 256], f16), T([256, 256], f16, stride=(1, 256))), {})
cnt: 2, ((T([1024], f16), T([50176, 256], f16), T([256, 1024], f16, stride=(1, 256))), {})
cnt: 2, ((T([256], f16), T([50176, 1024], f16), T([1024, 256], f16, stride=(1, 1024))), {})
cnt: 18, ((T([1536], f16), T([12544, 512], f16), T([512, 1536], f16, stride=(1, 512))), {})
cnt: 18, ((T([512], f16), T([12544, 512], f16), T([512, 512], f16, stride=(1, 512))), {})
cnt: 18, ((T([2048], f16), T([12544, 512], f16), T([512, 2048], f16, stride=(1, 512))), {})
cnt: 18, ((T([512], f16), T([12544, 2048], f16), T([2048, 512], f16, stride=(1, 2048))), {})
cnt: 2, ((T([3072], f16), T([3136, 1024], f16), T([1024, 3072], f16, stride=(1, 1024))), {})
cnt: 2, ((T([1024], f16), T([3136, 1024], f16), T([1024, 1024], f16, stride=(1, 1024))), {})
cnt: 2, ((T([4096], f16), T([3136, 1024], f16), T([1024, 4096], f16, stride=(1, 1024))), {})
cnt: 2, ((T([1024], f16), T([3136, 4096], f16), T([4096, 1024], f16, stride=(1, 4096))), {})
cnt: 1, ((T([1000], f16), T([64, 1024], f16), T([1024, 1000], f16, stride=(1, 1024))), {})
Operator: aten.bernoulli_.float
cnt: 2, ((T([64, 1, 1], f16), 0.9956521736457944), {})
cnt: 2, ((T([64, 1, 1], f16), 0.9913043472915888), {})
cnt: 2, ((T([64, 1, 1], f16), 0.9869565209373832), {})
cnt: 2, ((T([64, 1, 1], f16), 0.9826086945831776), {})
cnt: 2, ((T([64, 1, 1], f16), 0.9782608672976494), {})
cnt: 2, ((T([64, 1, 1], f16), 0.9739130418747663), {})
cnt: 2, ((T([64, 1, 1], f16), 0.9695652164518833), {})
cnt: 2, ((T([64, 1, 1], f16), 0.9652173891663551), {})
cnt: 2, ((T([64, 1, 1], f16), 0.960869561880827), {})
cnt: 2, ((T([64, 1, 1], f16), 0.9565217345952988), {})
cnt: 2, ((T([64, 1, 1], f16), 0.9521739110350609), {})
cnt: 2, ((T([64, 1, 1], f16), 0.9478260837495327), {})
cnt: 2, ((T([64, 1, 1], f16), 0.9434782564640045), {})
cnt: 2, ((T([64, 1, 1], f16), 0.9391304329037666), {})
cnt: 2, ((T([64, 1, 1], f16), 0.9347826093435287), {})
cnt: 2, ((T([64, 1, 1], f16), 0.9304347857832909), {})
cnt: 2, ((T([64, 1, 1], f16), 0.9260869547724724), {})
cnt: 2, ((T([64, 1, 1], f16), 0.9217391312122345), {})
cnt: 2, ((T([64, 1, 1], f16), 0.917391300201416), {})
cnt: 2, ((T([64, 1, 1], f16), 0.9130434766411781), {})
cnt: 2, ((T([64, 1, 1], f16), 0.9086956530809402), {})
cnt: 2, ((T([64, 1, 1], f16), 0.9043478220701218), {})
cnt: 2, ((T([64, 1, 1], f16), 0.8999999985098839), {})
Operator: aten.bmm.default
cnt: 2, ((T([16384, 49, 32], f16), T([16384, 32, 49], f16)), {})
cnt: 2, ((T([16384, 49, 49], f16), T([16384, 49, 32], f16)), {})
cnt: 2, ((T([8192, 49, 32], f16), T([8192, 32, 49], f16)), {})
cnt: 2, ((T([8192, 49, 49], f16), T([8192, 49, 32], f16)), {})
cnt: 18, ((T([4096, 49, 32], f16), T([4096, 32, 49], f16)), {})
cnt: 18, ((T([4096, 49, 49], f16), T([4096, 49, 32], f16)), {})
cnt: 2, ((T([2048, 49, 32], f16), T([2048, 32, 49], f16)), {})
cnt: 2, ((T([2048, 49, 49], f16), T([2048, 49, 32], f16)), {})
cnt: 2, ((T([2048, 49, 49], f16, stride=(2401, 1, 49)), T([2048, 49, 32], f16)), {})
cnt: 2, ((T([2048, 49, 32], f16), T([2048, 32, 49], f16, stride=(1568, 1, 32))), {})
cnt: 2, ((T([2048, 32, 49], f16, stride=(1568, 1, 32)), T([2048, 49, 49], f16)), {})
cnt: 2, ((T([2048, 49, 49], f16), T([2048, 49, 32], f16, stride=(1568, 1, 49))), {})
cnt: 18, ((T([4096, 49, 49], f16, stride=(2401, 1, 49)), T([4096, 49, 32], f16)), {})
cnt: 18, ((T([4096, 49, 32], f16), T([4096, 32, 49], f16, stride=(1568, 1, 32))), {})
cnt: 18, ((T([4096, 32, 49], f16, stride=(1568, 1, 32)), T([4096, 49, 49], f16)), {})
cnt: 18, ((T([4096, 49, 49], f16), T([4096, 49, 32], f16, stride=(1568, 1, 49))), {})
cnt: 2, ((T([8192, 49, 49], f16, stride=(2401, 1, 49)), T([8192, 49, 32], f16)), {})
cnt: 2, ((T([8192, 49, 32], f16), T([8192, 32, 49], f16, stride=(1568, 1, 32))), {})
cnt: 2, ((T([8192, 32, 49], f16, stride=(1568, 1, 32)), T([8192, 49, 49], f16)), {})
cnt: 2, ((T([8192, 49, 49], f16), T([8192, 49, 32], f16, stride=(1568, 1, 49))), {})
cnt: 2, ((T([16384, 49, 49], f16, stride=(2401, 1, 49)), T([16384, 49, 32], f16)), {})
cnt: 2, ((T([16384, 49, 32], f16), T([16384, 32, 49], f16, stride=(1568, 1, 32))), {})
cnt: 2, ((T([16384, 32, 49], f16, stride=(1568, 1, 32)), T([16384, 49, 49], f16)), {})
cnt: 2, ((T([16384, 49, 49], f16), T([16384, 49, 32], f16, stride=(1568, 1, 49))), {})
Operator: aten.cat.default
cnt: 1, (([T([64, 28, 28, 128], f16, stride=(401408, 14336, 256, 1)), T([64, 28, 28, 128], f16, stride=(401408, 14336, 256, 1)), T([64, 28, 28, 128], f16, stride=(401408, 14336, 256, 1)), T([64, 28, 28, 128], f16, stride=(401408, 14336, 256, 1))], -1), {})
cnt: 1, (([T([64, 14, 14, 256], f16, stride=(200704, 14336, 512, 1)), T([64, 14, 14, 256], f16, stride=(200704, 14336, 512, 1)), T([64, 14, 14, 256], f16, stride=(200704, 14336, 512, 1)), T([64, 14, 14, 256], f16, stride=(200704, 14336, 512, 1))], -1), {})
cnt: 1, (([T([64, 7, 7, 512], f16, stride=(100352, 14336, 1024, 1)), T([64, 7, 7, 512], f16, stride=(100352, 14336, 1024, 1)), T([64, 7, 7, 512], f16, stride=(100352, 14336, 1024, 1)), T([64, 7, 7, 512], f16, stride=(100352, 14336, 1024, 1))], -1), {})
Operator: aten.clone.default
cnt: 1, ((T([64, 3, 224, 224], f16),), {})
Operator: aten.convolution.default
cnt: 1, ((T([64, 3, 224, 224], f16), T([128, 3, 4, 4], f16), T([128], f16), [4, 4], [0, 0], [1, 1], False, [0, 0], 1), {})
Operator: aten.convolution_backward.default
cnt: 1, ((T([64, 128, 56, 56], f16, stride=(401408, 1, 7168, 128)), T([64, 3, 224, 224], f16), T([128, 3, 4, 4], f16), [128], [4, 4], [0, 0], [1, 1], False, [0, 0], 1, [False, True, True]), {})
Operator: aten.copy_.default
cnt: 1, ((T([64, 3, 224, 224], f16), T([64, 3, 224, 224], f16)), {})
Operator: aten.div.Scalar
cnt: 1, ((T([64, 49, 1024], f16, stride=(1024, 0, 1)), 49), {})
Operator: aten.div_.Tensor
cnt: 2, ((T([64, 1, 1], f16), 0.9956521736457944), {})
cnt: 2, ((T([64, 1, 1], f16), 0.9913043472915888), {})
cnt: 2, ((T([64, 1, 1], f16), 0.9869565209373832), {})
cnt: 2, ((T([64, 1, 1], f16), 0.9826086945831776), {})
cnt: 2, ((T([64, 1, 1], f16), 0.9782608672976494), {})
cnt: 2, ((T([64, 1, 1], f16), 0.9739130418747663), {})
cnt: 2, ((T([64, 1, 1], f16), 0.9695652164518833), {})
cnt: 2, ((T([64, 1, 1], f16), 0.9652173891663551), {})
cnt: 2, ((T([64, 1, 1], f16), 0.960869561880827), {})
cnt: 2, ((T([64, 1, 1], f16), 0.9565217345952988), {})
cnt: 2, ((T([64, 1, 1], f16), 0.9521739110350609), {})
cnt: 2, ((T([64, 1, 1], f16), 0.9478260837495327), {})
cnt: 2, ((T([64, 1, 1], f16), 0.9434782564640045), {})
cnt: 2, ((T([64, 1, 1], f16), 0.9391304329037666), {})
cnt: 2, ((T([64, 1, 1], f16), 0.9347826093435287), {})
cnt: 2, ((T([64, 1, 1], f16), 0.9304347857832909), {})
cnt: 2, ((T([64, 1, 1], f16), 0.9260869547724724), {})
cnt: 2, ((T([64, 1, 1], f16), 0.9217391312122345), {})
cnt: 2, ((T([64, 1, 1], f16), 0.917391300201416), {})
cnt: 2, ((T([64, 1, 1], f16), 0.9130434766411781), {})
cnt: 2, ((T([64, 1, 1], f16), 0.9086956530809402), {})
cnt: 2, ((T([64, 1, 1], f16), 0.9043478220701218), {})
cnt: 2, ((T([64, 1, 1], f16), 0.8999999985098839), {})
Operator: aten.gelu.default
cnt: 2, ((T([64, 3136, 512], f16),), {})
cnt: 2, ((T([64, 784, 1024], f16),), {})
cnt: 18, ((T([64, 196, 2048], f16),), {})
cnt: 2, ((T([64, 49, 4096], f16),), {})
Operator: aten.gelu_backward.default
cnt: 2, ((T([64, 49, 4096], f16), T([64, 49, 4096], f16)), {})
cnt: 18, ((T([64, 196, 2048], f16), T([64, 196, 2048], f16)), {})
cnt: 2, ((T([64, 784, 1024], f16), T([64, 784, 1024], f16)), {})
cnt: 2, ((T([64, 3136, 512], f16), T([64, 3136, 512], f16)), {})
Operator: aten.index.Tensor
cnt: 2, ((T([169, 4], f16), [T([2401], i64)]), {})
cnt: 2, ((T([169, 8], f16), [T([2401], i64)]), {})
cnt: 18, ((T([169, 16], f16), [T([2401], i64)]), {})
cnt: 2, ((T([169, 32], f16), [T([2401], i64)]), {})
Operator: aten.index_put.default
cnt: 2, ((T([169, 32], f16), [T([2401], i64)], T([2401, 32], f16, stride=(1, 2401)), True), {})
cnt: 18, ((T([169, 16], f16), [T([2401], i64)], T([2401, 16], f16, stride=(1, 2401)), True), {})
cnt: 2, ((T([169, 8], f16), [T([2401], i64)], T([2401, 8], f16, stride=(1, 2401)), True), {})
cnt: 2, ((T([169, 4], f16), [T([2401], i64)], T([2401, 4], f16, stride=(1, 2401)), True), {})
Operator: aten.lift_fresh_copy.default
cnt: 1, ((T([64], i64),), {})
Operator: aten.mean.dim
cnt: 1, ((T([64, 49, 1024], f16), [1]), {})
Operator: aten.mm.default
cnt: 1, ((T([50176, 512], f16), T([512, 256], f16, stride=(1, 512))), {})
cnt: 1, ((T([12544, 1024], f16), T([1024, 512], f16, stride=(1, 1024))), {})
cnt: 1, ((T([3136, 2048], f16), T([2048, 1024], f16, stride=(1, 2048))), {})
cnt: 1, ((T([64, 1000], f16), T([1000, 1024], f16)), {})
cnt: 1, ((T([1000, 64], f16, stride=(1, 1000)), T([64, 1024], f16)), {})
cnt: 2, ((T([3136, 1024], f16), T([1024, 4096], f16)), {})
cnt: 2, ((T([1024, 3136], f16, stride=(1, 1024)), T([3136, 4096], f16)), {})
cnt: 2, ((T([3136, 4096], f16), T([4096, 1024], f16)), {})
cnt: 2, ((T([4096, 3136], f16, stride=(1, 4096)), T([3136, 1024], f16)), {})
cnt: 2, ((T([3136, 1024], f16), T([1024, 1024], f16)), {})
cnt: 2, ((T([1024, 3136], f16, stride=(1, 1024)), T([3136, 1024], f16)), {})
cnt: 2, ((T([3136, 3072], f16), T([3072, 1024], f16)), {})
cnt: 2, ((T([3072, 3136], f16, stride=(1, 3072)), T([3136, 1024], f16)), {})
cnt: 1, ((T([1024, 3136], f16, stride=(1, 1024)), T([3136, 2048], f16)), {})
cnt: 1, ((T([3136, 1024], f16), T([1024, 2048], f16)), {})
cnt: 18, ((T([12544, 512], f16), T([512, 2048], f16)), {})
cnt: 18, ((T([512, 12544], f16, stride=(1, 512)), T([12544, 2048], f16)), {})
cnt: 18, ((T([12544, 2048], f16), T([2048, 512], f16)), {})
cnt: 18, ((T([2048, 12544], f16, stride=(1, 2048)), T([12544, 512], f16)), {})
cnt: 18, ((T([12544, 512], f16), T([512, 512], f16)), {})
cnt: 18, ((T([512, 12544], f16, stride=(1, 512)), T([12544, 512], f16)), {})
cnt: 18, ((T([12544, 1536], f16), T([1536, 512], f16)), {})
cnt: 18, ((T([1536, 12544], f16, stride=(1, 1536)), T([12544, 512], f16)), {})
cnt: 1, ((T([512, 12544], f16, stride=(1, 512)), T([12544, 1024], f16)), {})
cnt: 1, ((T([12544, 512], f16), T([512, 1024], f16)), {})
cnt: 2, ((T([50176, 256], f16), T([256, 1024], f16)), {})
cnt: 2, ((T([256, 50176], f16, stride=(1, 256)), T([50176, 1024], f16)), {})
cnt: 2, ((T([50176, 1024], f16), T([1024, 256], f16)), {})
cnt: 2, ((T([1024, 50176], f16, stride=(1, 1024)), T([50176, 256], f16)), {})
cnt: 2, ((T([50176, 256], f16), T([256, 256], f16)), {})
cnt: 2, ((T([256, 50176], f16, stride=(1, 256)), T([50176, 256], f16)), {})
cnt: 2, ((T([50176, 768], f16), T([768, 256], f16)), {})
cnt: 2, ((T([768, 50176], f16, stride=(1, 768)), T([50176, 256], f16)), {})
cnt: 1, ((T([256, 50176], f16, stride=(1, 256)), T([50176, 512], f16)), {})
cnt: 1, ((T([50176, 256], f16), T([256, 512], f16)), {})
cnt: 2, ((T([200704, 128], f16), T([128, 512], f16)), {})
cnt: 2, ((T([128, 200704], f16, stride=(1, 128)), T([200704, 512], f16)), {})
cnt: 2, ((T([200704, 512], f16), T([512, 128], f16)), {})
cnt: 2, ((T([512, 200704], f16, stride=(1, 512)), T([200704, 128], f16)), {})
cnt: 2, ((T([200704, 128], f16), T([128, 128], f16)), {})
cnt: 2, ((T([128, 200704], f16, stride=(1, 128)), T([200704, 128], f16)), {})
cnt: 2, ((T([200704, 384], f16), T([384, 128], f16)), {})
cnt: 2, ((T([384, 200704], f16, stride=(1, 384)), T([200704, 128], f16)), {})
Operator: aten.mul.Tensor
cnt: 2, ((T([4096, 4, 49, 32], f16, stride=(18816, 32, 384, 1)), 0.1767766952966369), {})
cnt: 4, ((T([64, 3136, 128], f16), T([64, 1, 1], f16)), {})
cnt: 2, ((T([1024, 8, 49, 32], f16, stride=(37632, 32, 768, 1)), 0.1767766952966369), {})
cnt: 8, ((T([64, 784, 256], f16), T([64, 1, 1], f16)), {})
cnt: 18, ((T([256, 16, 49, 32], f16, stride=(75264, 32, 1536, 1)), 0.1767766952966369), {})
cnt: 72, ((T([64, 196, 512], f16), T([64, 1, 1], f16)), {})
cnt: 2, ((T([64, 32, 49, 32], f16, stride=(150528, 32, 3072, 1)), 0.1767766952966369), {})
cnt: 8, ((T([64, 49, 1024], f16), T([64, 1, 1], f16)), {})
cnt: 2, ((T([64, 32, 49, 32], f16), 0.1767766952966369), {})
cnt: 18, ((T([256, 16, 49, 32], f16), 0.1767766952966369), {})
cnt: 2, ((T([1024, 8, 49, 32], f16), 0.1767766952966369), {})
cnt: 2, ((T([4096, 4, 49, 32], f16), 0.1767766952966369), {})
Operator: aten.native_layer_norm.default
cnt: 1, ((T([64, 3136, 128], f16, stride=(401408, 1, 3136)), [128], T([128], f16), T([128], f16), 1e-05), {})
cnt: 4, ((T([64, 3136, 128], f16), [128], T([128], f16), T([128], f16), 1e-05), {})
cnt: 1, ((T([64, 784, 512], f16), [512], T([512], f16), T([512], f16), 1e-05), {})
cnt: 4, ((T([64, 784, 256], f16), [256], T([256], f16), T([256], f16), 1e-05), {})
cnt: 1, ((T([64, 196, 1024], f16), [1024], T([1024], f16), T([1024], f16), 1e-05), {})
cnt: 36, ((T([64, 196, 512], f16), [512], T([512], f16), T([512], f16), 1e-05), {})
cnt: 1, ((T([64, 49, 2048], f16), [2048], T([2048], f16), T([2048], f16), 1e-05), {})
cnt: 5, ((T([64, 49, 1024], f16), [1024], T([1024], f16), T([1024], f16), 1e-05), {})
Operator: aten.native_layer_norm_backward.default
cnt: 5, ((T([64, 49, 1024], f16), T([64, 49, 1024], f16), [1024], T([64, 49, 1], f32), T([64, 49, 1], f32), T([1024], f16), T([1024], f16), [True, True, True]), {})
cnt: 1, ((T([64, 49, 2048], f16), T([64, 49, 2048], f16), [2048], T([64, 49, 1], f32), T([64, 49, 1], f32), T([2048], f16), T([2048], f16), [True, True, True]), {})
cnt: 36, ((T([64, 196, 512], f16), T([64, 196, 512], f16), [512], T([64, 196, 1], f32), T([64, 196, 1], f32), T([512], f16), T([512], f16), [True, True, True]), {})
cnt: 1, ((T([64, 196, 1024], f16), T([64, 196, 1024], f16), [1024], T([64, 196, 1], f32), T([64, 196, 1], f32), T([1024], f16), T([1024], f16), [True, True, True]), {})
cnt: 4, ((T([64, 784, 256], f16), T([64, 784, 256], f16), [256], T([64, 784, 1], f32), T([64, 784, 1], f32), T([256], f16), T([256], f16), [True, True, True]), {})
cnt: 1, ((T([64, 784, 512], f16), T([64, 784, 512], f16), [512], T([64, 784, 1], f32), T([64, 784, 1], f32), T([512], f16), T([512], f16), [True, True, True]), {})
cnt: 4, ((T([64, 3136, 128], f16), T([64, 3136, 128], f16), [128], T([64, 3136, 1], f32), T([64, 3136, 1], f32), T([128], f16), T([128], f16), [True, True, True]), {})
cnt: 1, ((T([64, 3136, 128], f16), T([64, 3136, 128], f16, stride=(401408, 1, 3136)), [128], T([64, 3136, 1], f32), T([64, 3136, 1], f32), T([128], f16), T([128], f16), [True, True, True]), {})
Operator: aten.new_empty.default
cnt: 2, ((T([64, 3136, 128], f16), [64, 1, 1]), {'dtype': f16, 'layout': torch.strided, 'device': 'cuda', 'pin_memory': False})
cnt: 4, ((T([64, 784, 256], f16), [64, 1, 1]), {'dtype': f16, 'layout': torch.strided, 'device': 'cuda', 'pin_memory': False})
cnt: 36, ((T([64, 196, 512], f16), [64, 1, 1]), {'dtype': f16, 'layout': torch.strided, 'device': 'cuda', 'pin_memory': False})
cnt: 4, ((T([64, 49, 1024], f16), [64, 1, 1]), {'dtype': f16, 'layout': torch.strided, 'device': 'cuda', 'pin_memory': False})
Operator: aten.new_zeros.default
cnt: 2, ((T([2401, 32], f16, stride=(1, 2401)), [169, 32]), {'dtype': f16, 'layout': torch.strided, 'device': 'cuda'})
cnt: 18, ((T([2401, 16], f16, stride=(1, 2401)), [169, 16]), {'dtype': f16, 'layout': torch.strided, 'device': 'cuda'})
cnt: 2, ((T([2401, 8], f16, stride=(1, 2401)), [169, 8]), {'dtype': f16, 'layout': torch.strided, 'device': 'cuda'})
cnt: 2, ((T([2401, 4], f16, stride=(1, 2401)), [169, 4]), {'dtype': f16, 'layout': torch.strided, 'device': 'cuda'})
Operator: aten.nll_loss_backward.default
cnt: 1, ((T([], f16), T([64, 1000], f16), T([64], i64), None, 1, -100, T([], f16)), {})
Operator: aten.nll_loss_forward.default
cnt: 1, ((T([64, 1000], f16), T([64], i64), None, 1, -100), {})
Operator: aten.roll.default
cnt: 1, ((T([64, 56, 56, 128], f16), [-3, -3], [1, 2]), {})
cnt: 1, ((T([64, 56, 56, 128], f16), [3, 3], [1, 2]), {})
cnt: 1, ((T([64, 28, 28, 256], f16), [-3, -3], [1, 2]), {})
cnt: 1, ((T([64, 28, 28, 256], f16), [3, 3], [1, 2]), {})
cnt: 9, ((T([64, 14, 14, 512], f16), [-3, -3], [1, 2]), {})
cnt: 9, ((T([64, 14, 14, 512], f16), [3, 3], [1, 2]), {})
cnt: 9, ((T([64, 14, 14, 512], f16), [-3, -3], [2, 1]), {})
cnt: 9, ((T([64, 14, 14, 512], f16), [3, 3], [2, 1]), {})
cnt: 1, ((T([64, 28, 28, 256], f16), [-3, -3], [2, 1]), {})
cnt: 1, ((T([64, 28, 28, 256], f16), [3, 3], [2, 1]), {})
cnt: 1, ((T([64, 56, 56, 128], f16), [-3, -3], [2, 1]), {})
cnt: 1, ((T([64, 56, 56, 128], f16), [3, 3], [2, 1]), {})
Operator: aten.slice_backward.default
cnt: 4, ((T([64, 7, 7, 512], f16, stride=(100352, 14336, 2048, 1)), [64, 7, 7, 512], 3, 0, 9223372036854775807, 1), {})
cnt: 2, ((T([64, 7, 7, 512], f16), [64, 7, 14, 512], 2, 1, 9223372036854775807, 2), {})
cnt: 2, ((T([64, 7, 14, 512], f16), [64, 14, 14, 512], 1, 1, 9223372036854775807, 2), {})
cnt: 4, ((T([64, 14, 14, 512], f16), [64, 14, 14, 512], 0, 0, 9223372036854775807, 1), {})
cnt: 2, ((T([64, 7, 14, 512], f16), [64, 14, 14, 512], 1, 0, 9223372036854775807, 2), {})
cnt: 2, ((T([64, 7, 7, 512], f16), [64, 7, 14, 512], 2, 0, 9223372036854775807, 2), {})
cnt: 4, ((T([64, 14, 14, 256], f16, stride=(200704, 14336, 1024, 1)), [64, 14, 14, 256], 3, 0, 9223372036854775807, 1), {})
cnt: 2, ((T([64, 14, 14, 256], f16), [64, 14, 28, 256], 2, 1, 9223372036854775807, 2), {})
cnt: 2, ((T([64, 14, 28, 256], f16), [64, 28, 28, 256], 1, 1, 9223372036854775807, 2), {})
cnt: 4, ((T([64, 28, 28, 256], f16), [64, 28, 28, 256], 0, 0, 9223372036854775807, 1), {})
cnt: 2, ((T([64, 14, 28, 256], f16), [64, 28, 28, 256], 1, 0, 9223372036854775807, 2), {})
cnt: 2, ((T([64, 14, 14, 256], f16), [64, 14, 28, 256], 2, 0, 9223372036854775807, 2), {})
cnt: 4, ((T([64, 28, 28, 128], f16, stride=(401408, 14336, 512, 1)), [64, 28, 28, 128], 3, 0, 9223372036854775807, 1), {})
cnt: 2, ((T([64, 28, 28, 128], f16), [64, 28, 56, 128], 2, 1, 9223372036854775807, 2), {})
cnt: 2, ((T([64, 28, 56, 128], f16), [64, 56, 56, 128], 1, 1, 9223372036854775807, 2), {})
cnt: 4, ((T([64, 56, 56, 128], f16), [64, 56, 56, 128], 0, 0, 9223372036854775807, 1), {})
cnt: 2, ((T([64, 28, 56, 128], f16), [64, 56, 56, 128], 1, 0, 9223372036854775807, 2), {})
cnt: 2, ((T([64, 28, 28, 128], f16), [64, 28, 56, 128], 2, 0, 9223372036854775807, 2), {})
Operator: aten.stack.default
cnt: 2, (([T([64, 32, 49, 32], f16), T([64, 32, 49, 32], f16, stride=(50176, 1568, 1, 49)), T([64, 32, 49, 32], f16)],), {})
cnt: 18, (([T([256, 16, 49, 32], f16), T([256, 16, 49, 32], f16, stride=(25088, 1568, 1, 49)), T([256, 16, 49, 32], f16)],), {})
cnt: 2, (([T([1024, 8, 49, 32], f16), T([1024, 8, 49, 32], f16, stride=(12544, 1568, 1, 49)), T([1024, 8, 49, 32], f16)],), {})
cnt: 2, (([T([4096, 4, 49, 32], f16), T([4096, 4, 49, 32], f16, stride=(6272, 1568, 1, 49)), T([4096, 4, 49, 32], f16)],), {})
Operator: aten.sum.SymInt
cnt: 1, ((T([64, 1000], f16), [0], True), {})
cnt: 4, ((T([3136, 1024], f16), [0], True), {})
cnt: 2, ((T([3136, 4096], f16), [0], True), {})
cnt: 2, ((T([64, 32, 49, 49], f16), [0], True), {})
cnt: 2, ((T([3136, 3072], f16), [0], True), {})
cnt: 36, ((T([12544, 512], f16), [0], True), {})
cnt: 18, ((T([12544, 2048], f16), [0], True), {})
cnt: 18, ((T([256, 16, 49, 49], f16), [0], True), {})
cnt: 18, ((T([12544, 1536], f16), [0], True), {})
cnt: 4, ((T([50176, 256], f16), [0], True), {})
cnt: 2, ((T([50176, 1024], f16), [0], True), {})
cnt: 2, ((T([1024, 8, 49, 49], f16), [0], True), {})
cnt: 2, ((T([50176, 768], f16), [0], True), {})
cnt: 4, ((T([200704, 128], f16), [0], True), {})
cnt: 2, ((T([200704, 512], f16), [0], True), {})
cnt: 2, ((T([4096, 4, 49, 49], f16), [0], True), {})
cnt: 2, ((T([200704, 384], f16), [0], True), {})
Operator: aten.unbind.int
cnt: 2, ((T([3, 4096, 4, 49, 32], f16, stride=(128, 18816, 32, 384, 1)),), {})
cnt: 2, ((T([3, 1024, 8, 49, 32], f16, stride=(256, 37632, 32, 768, 1)),), {})
cnt: 18, ((T([3, 256, 16, 49, 32], f16, stride=(512, 75264, 32, 1536, 1)),), {})
cnt: 2, ((T([3, 64, 32, 49, 32], f16, stride=(1024, 150528, 32, 3072, 1)),), {})
