[PATCH] PPC assembly implementation of SHA1
[git/git.git] / ppc / sha1ppc.S
1 /*
2 * SHA-1 implementation for PowerPC.
3 *
4 * Copyright (C) 2005 Paul Mackerras <paulus@samba.org>
5 */
/*
 * FS: size in bytes of the stack frame that sha1_core allocates with
 * stwu.  sha1_core saves r15-r31 in the top 68 bytes of that frame, at
 * offsets FS-68 through FS-4 from the new stack pointer.
 */
6 #define FS 80
7
8 /*
9 * We roll the registers for T, A, B, C, D, E around on each
10 * iteration; T on iteration t is A on iteration t+1, and so on.
11 * We use registers 7 - 12 for this.
12 */
/*
 * Each macro expands to an expression that evaluates to a register
 * number in 7..12 for round t.  For a fixed t the six numbers are
 * distinct, and stepping t by one shifts every role along:
 *   RT(t) == RA(t+1), RA(t) == RB(t+1), RB(t) == RC(t+1),
 *   RC(t) == RD(t+1), RD(t) == RE(t+1), RE(t) == RT(t+1).
 * So the value computed into T becomes the next A without a register
 * move, and the register that held E is recycled as the next T.
 */
13 #define RT(t) ((((t)+5)%6)+7) /* T: the new A being computed in round t */
14 #define RA(t) ((((t)+4)%6)+7) /* A in round t */
15 #define RB(t) ((((t)+3)%6)+7) /* B in round t */
16 #define RC(t) ((((t)+2)%6)+7) /* C in round t */
17 #define RD(t) ((((t)+1)%6)+7) /* D in round t */
18 #define RE(t) ((((t)+0)%6)+7) /* E in round t */
19
20 /* We use registers 16 - 31 for the W values */
/*
 * W(t) evaluates to register number 16 + (t mod 16), so the sixteen
 * registers hold a sliding window over the message schedule: W(t) and
 * W(t-16) name the same register.  The argument must be non-negative
 * for the % operator to give that result; every use in this file
 * passes a value >= 0.
 */
21 #define W(t) (((t)%16)+16)
22
/*
 * STEPD0(t): one SHA-1 round with f = (B & C) | (D & ~B).
 * sha1_core uses it for rounds 0-19.
 *
 *   RT(t) = rotl(RA(t), 5) + f + RE(t) + %r15 + W(t)
 *   RB(t) = rotl(RB(t), 30)      (that register is RC(t+1))
 *
 * Expects the round constant in %r15.  Clobbers %r0 and %r6.
 * Both reads of the original B come before the in-place rotate.
 */
23 #define STEPD0(t) \
24 and %r6,RB(t),RC(t); /* r6 = B & C */ \
25 andc %r0,RD(t),RB(t); /* r0 = D & ~B */ \
26 rotlwi RT(t),RA(t),5; /* T = rotl(A, 5) */ \
27 rotlwi RB(t),RB(t),30; /* B = rotl(B, 30), the next round's C */ \
28 or %r6,%r6,%r0; /* r6 = f */ \
29 add %r0,RE(t),%r15; /* r0 = E + K */ \
30 add RT(t),RT(t),%r6; /* T += f */ \
31 add %r0,%r0,W(t); /* r0 = E + K + W[t] */ \
32 add RT(t),RT(t),%r0 /* T += E + K + W[t] */
33
/*
 * STEPD1(t): one SHA-1 round with f = B ^ C ^ D.
 * sha1_core uses it for rounds 20-39 and 60-79.
 *
 *   RT(t) = rotl(RA(t), 5) + f + RE(t) + %r15 + W(t)
 *   RB(t) = rotl(RB(t), 30)      (that register is RC(t+1))
 *
 * Expects the round constant in %r15.  Clobbers %r0 and %r6.
 * The only read of the original B comes before the in-place rotate.
 */
34 #define STEPD1(t) \
35 xor %r6,RB(t),RC(t); /* r6 = B ^ C */ \
36 rotlwi RT(t),RA(t),5; /* T = rotl(A, 5) */ \
37 rotlwi RB(t),RB(t),30; /* B = rotl(B, 30), the next round's C */ \
38 xor %r6,%r6,RD(t); /* r6 = f = B ^ C ^ D */ \
39 add %r0,RE(t),%r15; /* r0 = E + K */ \
40 add RT(t),RT(t),%r6; /* T += f */ \
41 add %r0,%r0,W(t); /* r0 = E + K + W[t] */ \
42 add RT(t),RT(t),%r0 /* T += E + K + W[t] */
43
/*
 * STEPD2(t): one SHA-1 round with f = (B & C) | (B & D) | (C & D).
 * sha1_core uses it for rounds 40-59.
 *
 *   RT(t) = rotl(RA(t), 5) + f + RE(t) + %r15 + W(t)
 *   RB(t) = rotl(RB(t), 30)      (that register is RC(t+1))
 *
 * Expects the round constant in %r15.  Clobbers %r0 and %r6.
 * Both terms that read the original B are formed before the in-place
 * rotate; the C & D term formed afterwards does not involve B.
 */
44 #define STEPD2(t) \
45 and %r6,RB(t),RC(t); /* r6 = B & C */ \
46 and %r0,RB(t),RD(t); /* r0 = B & D */ \
47 rotlwi RT(t),RA(t),5; /* T = rotl(A, 5) */ \
48 rotlwi RB(t),RB(t),30; /* B = rotl(B, 30), the next round's C */ \
49 or %r6,%r6,%r0; /* r6 = (B & C) | (B & D) */ \
50 and %r0,RC(t),RD(t); /* r0 = C & D */ \
51 or %r6,%r6,%r0; /* r6 = f */ \
52 add %r0,RE(t),%r15; /* r0 = E + K */ \
53 add RT(t),RT(t),%r6; /* T += f */ \
54 add %r0,%r0,W(t); /* r0 = E + K + W[t] */ \
55 add RT(t),RT(t),%r0 /* T += E + K + W[t] */
56
/*
 * LOADW(t): load the 32-bit word at byte offset 4*t from %r4 into the
 * register W(t).
 * NOTE(review): lwz does no byte reversal, so the word is taken in the
 * CPU's native byte order -- presumably big-endian here; confirm for
 * the target.
 */
57 #define LOADW(t) \
58 lwz W(t),(t)*4(%r4)
59
/*
 * UPDATEW(t): compute message-schedule word t in place:
 *
 *   W(t) = rotl(W(t-3) ^ W(t-8) ^ W(t-14) ^ W(t-16), 1)
 *
 * W(t) and W(t-16) are the same register, so the oldest word in the
 * sixteen-register window is overwritten by the newest.
 * Clobbers %r0.
 */
60 #define UPDATEW(t) \
61 xor %r0,W((t)-3),W((t)-8); /* r0 = W[t-3] ^ W[t-8] */ \
62 xor W(t),W((t)-16),W((t)-14); /* W[t] = W[t-16] ^ W[t-14] */ \
63 xor W(t),W(t),%r0; /* fold in the other two terms */ \
64 rotlwi W(t),W(t),1 /* rotate the result left by one bit */
65
/*
 * STEP0LD4(t): rounds t..t+3 using STEPD0, each followed by the load
 * of message word t+4..t+7 -- the word that the round four steps
 * later consumes.  sha1_core invokes it with t = 0, 4 and 8, which
 * loads words 4-15.
 */
66 #define STEP0LD4(t) \
67 STEPD0(t); LOADW((t)+4); \
68 STEPD0((t)+1); LOADW((t)+5); \
69 STEPD0((t)+2); LOADW((t)+6); \
70 STEPD0((t)+3); LOADW((t)+7)
71
/*
 * STEPUP4(t, fn): rounds t..t+3 using STEP<fn>, each followed by
 * UPDATEW for schedule word t+4..t+7 -- the word that the round four
 * steps later consumes.  fn is token-pasted onto STEP, so it must be
 * D0, D1 or D2.
 */
72 #define STEPUP4(t, fn) \
73 STEP##fn(t); UPDATEW((t)+4); \
74 STEP##fn((t)+1); UPDATEW((t)+5); \
75 STEP##fn((t)+2); UPDATEW((t)+6); \
76 STEP##fn((t)+3); UPDATEW((t)+7)
77
/*
 * STEPUP20(t, fn): twenty consecutive rounds t..t+19 using STEP<fn>,
 * built from five STEPUP4 groups.  Along the way it computes schedule
 * words t+4..t+23.
 */
78 #define STEPUP20(t, fn) \
79 STEPUP4(t, fn); \
80 STEPUP4((t)+4, fn); \
81 STEPUP4((t)+8, fn); \
82 STEPUP4((t)+12, fn); \
83 STEPUP4((t)+16, fn)
84
/*
 * sha1_core: run the SHA-1 compression function over consecutive
 * 64-byte blocks.
 *
 * In:   %r3 = pointer to the five 32-bit state words A, B, C, D, E;
 *             read on entry and rewritten after every block
 *       %r4 = pointer to the input data; advanced by 64 per block
 *       %r5 = number of 64-byte blocks to process
 * NOTE(review): presumably called from C as something like
 *   void sha1_core(uint32_t *state, const unsigned char *data,
 *                  unsigned int nblocks)
 * -- confirm against the caller.
 * NOTE(review): the frame is built with stwu/stw/lwz and 4-byte save
 * slots, so this looks usable under a 32-bit ABI only -- confirm.
 *
 * Registers: %r0 and %r6 are scratch; r7-r12 hold the rolling
 * T/A/B/C/D/E values (see RT..RE); %r15 holds the round constant;
 * r16-r31 hold the message schedule (see W).  r15-r31 are saved in an
 * FS-byte frame and restored before returning.  CTR and cr0 are
 * clobbered.  LR is not touched and nothing is called.
 *
 * A block count of zero returns at once and leaves the state alone.
 * Without that check, bdnz would decrement CTR from 0 and keep looping
 * until CTR wrapped all the way back to zero, reading far past the
 * input.
 */
85 .globl sha1_core
86 sha1_core:
	cmpwi	%r5,0		/* no blocks requested? */
	beqlr			/* return before building a frame or loading CTR */
87 stwu %r1,-FS(%r1) /* allocate the frame and store the back chain */
88 stw %r15,FS-68(%r1) /* save r15-r31 in the top 68 bytes of the frame */
89 stw %r16,FS-64(%r1)
90 stw %r17,FS-60(%r1)
91 stw %r18,FS-56(%r1)
92 stw %r19,FS-52(%r1)
93 stw %r20,FS-48(%r1)
94 stw %r21,FS-44(%r1)
95 stw %r22,FS-40(%r1)
96 stw %r23,FS-36(%r1)
97 stw %r24,FS-32(%r1)
98 stw %r25,FS-28(%r1)
99 stw %r26,FS-24(%r1)
100 stw %r27,FS-20(%r1)
101 stw %r28,FS-16(%r1)
102 stw %r29,FS-12(%r1)
103 stw %r30,FS-8(%r1)
104 stw %r31,FS-4(%r1)
105
106 /* Load up A - E */
107 lwz RA(0),0(%r3) /* A */
108 lwz RB(0),4(%r3) /* B */
109 lwz RC(0),8(%r3) /* C */
110 lwz RD(0),12(%r3) /* D */
111 lwz RE(0),16(%r3) /* E */
112
113 mtctr %r5 /* CTR = number of blocks; bdnz below counts it down */
114
/* Per-block loop.  Words 0-3 are loaded up front; STEP0LD4 fetches the rest. */
115 1: LOADW(0)
116 LOADW(1)
117 LOADW(2)
118 LOADW(3)
119
120 lis %r15,0x5a82 /* K0-19 */
121 ori %r15,%r15,0x7999 /* r15 = 0x5a827999 */
122 STEP0LD4(0) /* rounds 0-3, load words 4-7 */
123 STEP0LD4(4) /* rounds 4-7, load words 8-11 */
124 STEP0LD4(8) /* rounds 8-11, load words 12-15 */
125 STEPUP4(12, D0) /* rounds 12-15, compute W[16..19] */
126 STEPUP4(16, D0) /* rounds 16-19, compute W[20..23] */
127
128 lis %r15,0x6ed9 /* K20-39 */
129 ori %r15,%r15,0xeba1 /* r15 = 0x6ed9eba1 */
130 STEPUP20(20, D1) /* rounds 20-39, compute W[24..43] */
131
132 lis %r15,0x8f1b /* K40-59 */
133 ori %r15,%r15,0xbcdc /* r15 = 0x8f1bbcdc */
134 STEPUP20(40, D2) /* rounds 40-59, compute W[44..63] */
135
136 lis %r15,0xca62 /* K60-79 */
137 ori %r15,%r15,0xc1d6 /* r15 = 0xca62c1d6 */
138 STEPUP4(60, D1) /* rounds 60-63, compute W[64..67] */
139 STEPUP4(64, D1) /* rounds 64-67, compute W[68..71] */
140 STEPUP4(68, D1) /* rounds 68-71, compute W[72..75] */
141 STEPUP4(72, D1) /* rounds 72-75, compute W[76..79] */
142 STEPD1(76) /* last four rounds: no further schedule words are needed */
143 STEPD1(77)
144 STEPD1(78)
145 STEPD1(79)
146
/*
 * Add the block's result into the stored state.  After 80 rounds the
 * working values sit in RA(80)..RE(80), but the next block expects them
 * in RA(0)..RE(0), so each add also moves its sum to the round-0
 * register.  r16-r20 are free as temporaries because the schedule is
 * finished and LOADW refills them for the next block.
 */
147 lwz %r20,16(%r3) /* old E */
148 lwz %r19,12(%r3) /* old D */
149 lwz %r18,8(%r3) /* old C */
150 lwz %r17,4(%r3) /* old B */
151 lwz %r16,0(%r3) /* old A */
152 add %r20,RE(80),%r20 /* new E parked in r20: RE(0) is RA(80), which still holds A */
153 add RD(0),RD(80),%r19 /* new D */
154 add RC(0),RC(80),%r18 /* new C; overwrites RE(80), already consumed */
155 add RB(0),RB(80),%r17 /* new B; overwrites RD(80), already consumed */
156 add RA(0),RA(80),%r16 /* new A; overwrites RC(80), already consumed */
157 mr RE(0),%r20 /* A has been consumed, so new E can take its register */
158 stw RA(0),0(%r3) /* write the updated state back */
159 stw RB(0),4(%r3)
160 stw RC(0),8(%r3)
161 stw RD(0),12(%r3)
162 stw RE(0),16(%r3)
163
164 addi %r4,%r4,64 /* advance to the next 64-byte block */
165 bdnz 1b /* decrement CTR; loop while it is non-zero */
166
167 lwz %r15,FS-68(%r1) /* restore r15-r31 */
168 lwz %r16,FS-64(%r1)
169 lwz %r17,FS-60(%r1)
170 lwz %r18,FS-56(%r1)
171 lwz %r19,FS-52(%r1)
172 lwz %r20,FS-48(%r1)
173 lwz %r21,FS-44(%r1)
174 lwz %r22,FS-40(%r1)
175 lwz %r23,FS-36(%r1)
176 lwz %r24,FS-32(%r1)
177 lwz %r25,FS-28(%r1)
178 lwz %r26,FS-24(%r1)
179 lwz %r27,FS-20(%r1)
180 lwz %r28,FS-16(%r1)
181 lwz %r29,FS-12(%r1)
182 lwz %r30,FS-8(%r1)
183 lwz %r31,FS-4(%r1)
184 addi %r1,%r1,FS /* release the frame */
185 blr