I'm not sure whether this will help, and it needs a good assembly programmer (I barely know assembly myself). This is how VC++ implements __int64 in assembly. Remember that all processor registers are 32 bits wide (except on 64-bit CPUs, of course), and all operations are done on them at the processor level. Anyway, here are some C++ statements with the corresponding assembly; you may be able to implement the same thing in assembly.
; Debugger mixed source/disassembly view: each C++ statement is followed by
; the 32-bit x86 instructions shown for it (address, then instruction).
; Each 64-bit variable is written as two separate dwords: the slot the
; debugger shows by name ([l1], [l2]) and a second slot shown as a raw
; [ebp-NNh] offset. The add/adc sequence later in this listing uses the
; named slot as the low half and the raw-offset slot as the high half.
__int64 l1, l2, l3;
l1 = 256;
; named dword of l1 = 100h (256)
00417053 mov dword ptr [l1],100h
; second dword paired with l1 = 0
0041705A mov dword ptr [ebp-14h],0
l2 = 128;
; named dword of l2 = 80h (128)
00417061 mov dword ptr [l2],80h
; second dword paired with l2 = 0
00417068 mov dword ptr [ebp-24h],0
; 64-bit addition built from two 32-bit additions chained through the
; carry flag (CF).
l3 = l1 + l2;
; eax = [l1] + [l2]; the add leaves its carry-out in CF
0041706F mov eax,dword ptr [l1]
00417072 add eax,dword ptr [l2]
; mov does not modify flags, so CF from the add above is still intact here
00417075 mov ecx,dword ptr [ebp-14h]
; ecx = [ebp-14h] + [ebp-24h] + CF
00417078 adc ecx,dword ptr [ebp-24h]
; store the result pair: eax -> [l3], ecx -> [ebp-34h]
0041707B mov dword ptr [l3],eax
0041707E mov dword ptr [ebp-34h],ecx
; 64-bit subtraction built from two 32-bit subtractions chained through the
; carry flag (CF), which acts as the borrow.
l3 = l1 - l2;
; eax = [l1] - [l2]; the sub leaves its borrow in CF
00417081 mov eax,dword ptr [l1]
00417084 sub eax,dword ptr [l2]
; mov does not modify flags, so the borrow from the sub above is still intact
00417087 mov ecx,dword ptr [ebp-14h]
; ecx = [ebp-14h] - [ebp-24h] - CF
0041708A sbb ecx,dword ptr [ebp-24h]
; store the result pair: eax -> [l3], ecx -> [ebp-34h]
0041708D mov dword ptr [l3],eax
00417090 mov dword ptr [ebp-34h],ecx
; 64-bit multiplication is not done inline: both operands are pushed on the
; stack as four dwords and the helper __allmul is called.
l3 = l1 * l2;
; Push order is [ebp-24h], [l2], [ebp-14h], [l1]. Because push moves esp
; downward, the l1 pair ends up at the lower stack addresses, and within
; each pair the named dword sits below its [ebp-NNh] companion.
00417093 mov eax,dword ptr [ebp-24h]
00417096 push eax
00417097 mov ecx,dword ptr [l2]
0041709A push ecx
0041709B mov edx,dword ptr [ebp-14h]
0041709E push edx
0041709F mov eax,dword ptr [l1]
004170A2 push eax
; NOTE(review): no stack adjustment follows the call in this listing;
; presumably the helper removes its own arguments -- confirm.
004170A3 call @ILT+1330(__allmul) (411537h)
; result is taken from edx:eax: eax -> [l3], edx -> [ebp-34h]
004170A8 mov dword ptr [l3],eax
004170AB mov dword ptr [ebp-34h],edx
; 64-bit division is not done inline: both operands are pushed on the stack
; as four dwords and the helper __aulldiv is called.
; NOTE(review): the helper name __aulldiv looks like the unsigned variant,
; while the declaration at the top of this listing is plain (signed)
; __int64 -- confirm which declaration this disassembly was built from.
l3 = l1 / l2;
; Push order is [ebp-24h], [l2], [ebp-14h], [l1]. Because push moves esp
; downward, the l1 pair (dividend) ends up at the lower stack addresses,
; and within each pair the named dword sits below its [ebp-NNh] companion.
004170AE mov eax,dword ptr [ebp-24h]
004170B1 push eax
004170B2 mov ecx,dword ptr [l2]
004170B5 push ecx
004170B6 mov edx,dword ptr [ebp-14h]
004170B9 push edx
004170BA mov eax,dword ptr [l1]
004170BD push eax
; NOTE(review): no stack adjustment follows the call in this listing;
; presumably the helper removes its own arguments -- confirm.
004170BE call @ILT+5340(__aulldiv) (4124E1h)
; result is taken from edx:eax: eax -> [l3], edx -> [ebp-34h]
004170C3 mov dword ptr [l3],eax
004170C6 mov dword ptr [ebp-34h],edx
; 64-bit remainder is not done inline: both operands are pushed on the stack
; as four dwords and the helper __aullrem is called.
; NOTE(review): the helper name __aullrem looks like the unsigned variant,
; while the declaration at the top of this listing is plain (signed)
; __int64 -- confirm which declaration this disassembly was built from.
l3 = l1 % l2;
; Push order is [ebp-24h], [l2], [ebp-14h], [l1], the same argument layout
; as the multiply and divide calls earlier in this listing.
004170C9 mov eax,dword ptr [ebp-24h]
004170CC push eax
004170CD mov ecx,dword ptr [l2]
004170D0 push ecx
004170D1 mov edx,dword ptr [ebp-14h]
004170D4 push edx
004170D5 mov eax,dword ptr [l1]
004170D8 push eax
004170D9 call @ILT+5345(__aullrem) (4124E6h)
; eax -> [l3]. The listing appears to be cut off after this store: the next
; address has no instruction, so the store of the other result half
; (presumably edx -> [ebp-34h], as in the multiply/divide cases) is not shown.
004170DE mov dword ptr [l3],eax
004170E1