.NET Inside Out Part 20 – Try doing nothing but decreasing performance

This is the twentieth part of the .NET Inside Out series. For your convenience, you can find the other parts in the table of contents in Part 1 – Virtual and non-virtual calls in C#.

Let’s take this code and see its performance with BenchmarkDotNet:

/// <summary>
/// BenchmarkDotNet benchmark pair comparing a loop that calls a trivial local
/// function against the same loop whose local function wraps its body in
/// try/catch. Both methods return the same value (100).
/// </summary>
public class Inlining
{
	/// <summary>
	/// Baseline: sums the result of a plain local function over 100 iterations.
	/// </summary>
	/// <returns>100 (Helper returns 1 on each of the 100 iterations).</returns>
	[Benchmark(Baseline = true)]
	public long Inlining1()
	{
		// Trivial local function with no exception handling.
		long Helper()
		{
			return 1;
		}

		long result = 0;
		for(int i = 0; i < 100; ++i)
		{
			result += Helper();
		}

		return result;
	}

	/// <summary>
	/// Same loop as <see cref="Inlining1"/>, but the local function's body is
	/// wrapped in try/catch.
	/// </summary>
	/// <remarks>
	/// Note that the AggressiveInlining attribute is applied to this outer
	/// method, not to the local Helper that is called inside the loop.
	/// </remarks>
	/// <returns>100 (Helper returns 1 on each of the 100 iterations).</returns>
	[Benchmark]
	[MethodImpl(MethodImplOptions.AggressiveInlining)]
	public long Inlining2()
	{
		// Both the try and the catch path return 1, so the try/catch has no
		// effect on the returned value; it is the only difference from the
		// Helper in Inlining1.
		long Helper()
		{
			try { return 1; }
			catch { return 1; }
		}

		long result = 0;
		for (int i = 0; i < 100; ++i)
		{
			result += Helper();
		}

		return result;
	}
}

public class Inlining

{

[Benchmark(Baseline = true)]

public long Inlining1()

{

long Helper()

{

return 1;

}

long result = 0;

for(int i = 0; i < 100; ++i)

{

result += Helper();

}

return result;

}

[Benchmark]

[MethodImpl(MethodImplOptions.AggressiveInlining)]

public long Inlining2()

{

long Helper()

{

try { return 1; }

catch { return 1; }

}

long result = 0;

for (int i = 0; i < 100; ++i)

{

result += Helper();

}

return result;

}

Results on Windows 7 and .NET Core 4.6 (I know I should provide details of the environment but this is pretty reproducible, feel free to try it out on your own):

    Method |      Mean |     Error |     StdDev |    Median | Scaled | ScaledSD | 
 Inlining1 |  41.83 ns | 0.8811 ns |  0.7358 ns |  41.75 ns |   1.00 |     0.00 |
 Inlining2 | 221.31 ns | 4.7157 ns | 11.1154 ns | 216.91 ns |   5.29 |     0.28 |

Inlining1 | 41.83 ns | 0.8811 ns | 0.7358 ns | 41.75 ns | 1.00 | 0.00 |

Inlining2 | 221.31 ns | 4.7157 ns | 11.1154 ns | 216.91 ns | 5.29 | 0.28 |

So we can see that the second method is much slower. Why? Because of inlining: a Helper that contains a try block cannot be inlined, as that would break the stack trace, so Inlining2 pays for a real method call on each of its 100 loop iterations.

Another example comes from Pro .NET Benchmarking:

/// <summary>
/// BenchmarkDotNet benchmark pair running the same iterative Fibonacci loop
/// with and without an enclosing try/catch.
/// </summary>
public class Fibonacci
{
	// Loop bound: the loops below run for i = 1 .. N - 1, i.e. 92 iterations.
	private const int N = 93;

	/// <summary>
	/// Baseline: iterative Fibonacci with no exception handling.
	/// </summary>
	/// <returns>
	/// The value of c after 92 iterations. NOTE(review): this corresponds to
	/// F(93), which exceeds long.MaxValue, so the result wraps around unless
	/// the code is compiled with overflow checking - the value is not
	/// meaningful, only the timing is.
	/// </returns>
	[Benchmark(Baseline = true)]
	public long Fibonacci1()
	{
		long a = 0, b = 0, c = 1;
		for (int i = 1; i < N; i++)
		{
			// Shift the window: (a, b, c) -> (b, c, b + c).
			a = b;
			b = c;
			c = a + b;
		}
		return c;
	}

	/// <summary>
	/// Same loop as <see cref="Fibonacci1"/>, wrapped in a try block with an
	/// empty catch.
	/// </summary>
	/// <returns>The value of c after the try/catch statement completes.</returns>
	[Benchmark]
	public long Fibonacci2()
	{
		long a = 0, b = 0, c = 1;
		try
		{
			for (int i = 1; i < N; i++)
			{
				// Shift the window: (a, b, c) -> (b, c, b + c).
				a = b;
				b = c;
				c = a + b;
			}
		}
		// Deliberately empty: the try/catch exists only so its cost can be
		// measured against Fibonacci1. Any exception thrown inside the loop
		// would be swallowed here.
		catch { }
		return c;
	}
}

public class Fibonacci

{

private const int N = 93;

[Benchmark(Baseline = true)]

public long Fibonacci1()

{

long a = 0, b = 0, c = 1;

for (int i = 1; i < N; i++)

{

a = b;

b = c;

c = a + b;

}

return c;

}

[Benchmark]

public long Fibonacci2()

{

long a = 0, b = 0, c = 1;

try

{

for (int i = 1; i < N; i++)

{

a = b;

b = c;

c = a + b;

}

catch { }

return c;

}

Results:

     Method |     Mean |    Error |    StdDev | Scaled | ScaledSD | 
 Fibonacci1 | 127.0 ns | 2.729 ns |  5.875 ns |   1.00 |     0.00 |
 Fibonacci2 | 159.2 ns | 4.432 ns |  7.527 ns |   1.26 |     0.08 |

Fibonacci1 | 127.0 ns | 2.729 ns | 5.875 ns | 1.00 | 0.00 |

Fibonacci2 | 159.2 ns | 4.432 ns | 7.527 ns | 1.26 | 0.08 |

Why? Let’s see the machine code:

Fibonacci.Fibonacci1()
    L0000: push ebp
    L0001: mov ebp, esp
    L0003: push edi
    L0004: push esi
    L0005: xor eax, eax
    L0007: xor edx, edx
    L0009: xor ecx, ecx
    L000b: mov esi, 0x1
    L0010: mov edi, 0x1
    L0015: add eax, esi
    L0017: adc edx, ecx
    L0019: inc edi
    L001a: cmp edi, 0x5d
    L001d: jl L002b
    L001f: mov ecx, edx
    L0021: mov esi, eax
    L0023: mov eax, esi
    L0025: mov edx, ecx
    L0027: pop esi
    L0028: pop edi
    L0029: pop ebp
    L002a: ret
    L002b: xchg esi, eax
    L002d: xchg edx, ecx
    L002f: jmp L0015

Fibonacci.Fibonacci1()

L0000: push ebp

L0001: mov ebp, esp

L0003: push edi

L0004: push esi

L0005: xor eax, eax

L0007: xor edx, edx

L0009: xor ecx, ecx

L000b: mov esi, 0x1

L0010: mov edi, 0x1

L0015: add eax, esi

L0017: adc edx, ecx

L0019: inc edi

L001a: cmp edi, 0x5d

L001d: jl L002b

L001f: mov ecx, edx

L0021: mov esi, eax

L0023: mov eax, esi

L0025: mov edx, ecx

L0027: pop esi

L0028: pop edi

L0029: pop ebp

L002a: ret

L002b: xchg esi, eax

L002d: xchg edx, ecx

L002f: jmp L0015

Fibonacci.Fibonacci2()
    L0000: push ebp
    L0001: mov ebp, esp
    L0003: push edi
    L0004: push esi
    L0005: sub esp, 0x1c
    L0008: xor eax, eax
    L000a: mov [ebp-0x1c], eax
    L000d: mov [ebp-0x18], eax
    L0010: mov [ebp-0x14], eax
    L0013: mov [ebp-0x10], eax
    L0016: xor eax, eax
    L0018: xor edx, edx
    L001a: mov ecx, 0x1
    L001f: xor esi, esi
    L0021: mov [ebp-0x24], ecx
    L0024: mov [ebp-0x20], esi
    L0027: mov ecx, 0x1
    L002c: mov esi, [ebp-0x24]
    L002f: mov edi, [ebp-0x20]
    L0032: add eax, [ebp-0x24]
    L0035: adc edx, [ebp-0x20]
    L0038: mov [ebp-0x24], eax
    L003b: mov [ebp-0x20], edx
    L003e: inc ecx
    L003f: cmp ecx, 0x5d
    L0042: mov eax, esi
    L0044: mov edx, edi
    L0046: jl L002c
    L0048: jmp L004f
    L004a: call 0x72509369
    L004f: mov eax, [ebp-0x24]
    L0052: mov edx, [ebp-0x20]
    L0055: lea esp, [ebp-0x8]
    L0058: pop esi
    L0059: pop edi
    L005a: pop ebp
    L005b: ret

Fibonacci.Fibonacci2()

L0000: push ebp

L0001: mov ebp, esp

L0003: push edi

L0004: push esi

L0005: sub esp, 0x1c

L0008: xor eax, eax

L000a: mov [ebp-0x1c], eax

L000d: mov [ebp-0x18], eax

L0010: mov [ebp-0x14], eax

L0013: mov [ebp-0x10], eax

L0016: xor eax, eax

L0018: xor edx, edx

L001a: mov ecx, 0x1

L001f: xor esi, esi

L0021: mov [ebp-0x24], ecx

L0024: mov [ebp-0x20], esi

L0027: mov ecx, 0x1

L002c: mov esi, [ebp-0x24]

L002f: mov edi, [ebp-0x20]

L0032: add eax, [ebp-0x24]

L0035: adc edx, [ebp-0x20]

L0038: mov [ebp-0x24], eax

L003b: mov [ebp-0x20], edx

L003e: inc ecx

L003f: cmp ecx, 0x5d

L0042: mov eax, esi

L0044: mov edx, edi

L0046: jl L002c

L0048: jmp L004f

L004a: call 0x72509369

L004f: mov eax, [ebp-0x24]

L0052: mov edx, [ebp-0x20]

L0055: lea esp, [ebp-0x8]

L0058: pop esi

L0059: pop edi

L005a: pop ebp

L005b: ret

So we can see that the second implementation keeps its variables on the stack (the [ebp-0x24] and [ebp-0x20] slots) instead of in registers. That’s why it’s slower – about 1.26 times in the results above.