smult.c 27 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733
  1. /* Copyright 2008, Google Inc.
  2. * All rights reserved.
  3. *
  4. * Redistribution and use in source and binary forms, with or without
  5. * modification, are permitted provided that the following conditions are
  6. * met:
  7. *
  8. * * Redistributions of source code must retain the above copyright
  9. * notice, this list of conditions and the following disclaimer.
  10. * * Redistributions in binary form must reproduce the above
  11. * copyright notice, this list of conditions and the following disclaimer
  12. * in the documentation and/or other materials provided with the
  13. * distribution.
  14. * * Neither the name of Google Inc. nor the names of its
  15. * contributors may be used to endorse or promote products derived from
  16. * this software without specific prior written permission.
  17. *
  18. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  19. * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  20. * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  21. * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  22. * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  23. * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  24. * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  25. * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  26. * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  27. * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  28. * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  29. *
  30. * curve25519-donna: Curve25519 elliptic curve, public key function
  31. *
  32. * http://code.google.com/p/curve25519-donna/
  33. *
  34. * Adam Langley <agl@imperialviolet.org>
  35. *
  36. * Derived from public domain C code by Daniel J. Bernstein <djb@cr.yp.to>
  37. *
  38. * More information about curve25519 can be found here
  39. * http://cr.yp.to/ecdh.html
  40. *
  41. * djb's sample implementation of curve25519 is written in a special assembly
  42. * language called qhasm and uses the floating point registers.
  43. *
  44. * This is, almost, a clean room reimplementation from the curve25519 paper. It
  45. * uses many of the tricks described therein. Only the crecip function is taken
  46. * from the sample implementation.
  47. */
  48. #include <string.h>
  49. #include <stdint.h>
  50. #include "crypto_scalarmult.h"
  51. #ifdef _MSC_VER
  52. #define inline __inline
  53. #endif
  54. typedef uint8_t u8;
  55. typedef int32_t s32;
  56. typedef int64_t limb;
  57. /* Field element representation:
  58. *
  59. * Field elements are written as an array of signed, 64-bit limbs, least
  60. * significant first. The value of the field element is:
  61. * x[0] + 2^26·x[1] + x^51·x[2] + 2^102·x[3] + ...
  62. *
  63. * i.e. the limbs are 26, 25, 26, 25, ... bits wide.
  64. */
  65. /* Sum two numbers: output += in */
  66. static void fsum(limb *output, const limb *in) {
  67. unsigned i;
  68. for (i = 0; i < 10; i += 2) {
  69. output[0+i] = (output[0+i] + in[0+i]);
  70. output[1+i] = (output[1+i] + in[1+i]);
  71. }
  72. }
  73. /* Find the difference of two numbers: output = in - output
  74. * (note the order of the arguments!)
  75. */
  76. static void fdifference(limb *output, const limb *in) {
  77. unsigned i;
  78. for (i = 0; i < 10; ++i) {
  79. output[i] = (in[i] - output[i]);
  80. }
  81. }
  82. /* Multiply a number by a scalar: output = in * scalar */
  83. static void fscalar_product(limb *output, const limb *in, const limb scalar) {
  84. unsigned i;
  85. for (i = 0; i < 10; ++i) {
  86. output[i] = in[i] * scalar;
  87. }
  88. }
  89. /* Multiply two numbers: output = in2 * in
  90. *
  91. * output must be distinct to both inputs. The inputs are reduced coefficient
  92. * form, the output is not.
  93. */
  94. static void fproduct(limb *output, const limb *in2, const limb *in) {
  95. output[0] = ((limb) ((s32) in2[0])) * ((s32) in[0]);
  96. output[1] = ((limb) ((s32) in2[0])) * ((s32) in[1]) +
  97. ((limb) ((s32) in2[1])) * ((s32) in[0]);
  98. output[2] = 2 * ((limb) ((s32) in2[1])) * ((s32) in[1]) +
  99. ((limb) ((s32) in2[0])) * ((s32) in[2]) +
  100. ((limb) ((s32) in2[2])) * ((s32) in[0]);
  101. output[3] = ((limb) ((s32) in2[1])) * ((s32) in[2]) +
  102. ((limb) ((s32) in2[2])) * ((s32) in[1]) +
  103. ((limb) ((s32) in2[0])) * ((s32) in[3]) +
  104. ((limb) ((s32) in2[3])) * ((s32) in[0]);
  105. output[4] = ((limb) ((s32) in2[2])) * ((s32) in[2]) +
  106. 2 * (((limb) ((s32) in2[1])) * ((s32) in[3]) +
  107. ((limb) ((s32) in2[3])) * ((s32) in[1])) +
  108. ((limb) ((s32) in2[0])) * ((s32) in[4]) +
  109. ((limb) ((s32) in2[4])) * ((s32) in[0]);
  110. output[5] = ((limb) ((s32) in2[2])) * ((s32) in[3]) +
  111. ((limb) ((s32) in2[3])) * ((s32) in[2]) +
  112. ((limb) ((s32) in2[1])) * ((s32) in[4]) +
  113. ((limb) ((s32) in2[4])) * ((s32) in[1]) +
  114. ((limb) ((s32) in2[0])) * ((s32) in[5]) +
  115. ((limb) ((s32) in2[5])) * ((s32) in[0]);
  116. output[6] = 2 * (((limb) ((s32) in2[3])) * ((s32) in[3]) +
  117. ((limb) ((s32) in2[1])) * ((s32) in[5]) +
  118. ((limb) ((s32) in2[5])) * ((s32) in[1])) +
  119. ((limb) ((s32) in2[2])) * ((s32) in[4]) +
  120. ((limb) ((s32) in2[4])) * ((s32) in[2]) +
  121. ((limb) ((s32) in2[0])) * ((s32) in[6]) +
  122. ((limb) ((s32) in2[6])) * ((s32) in[0]);
  123. output[7] = ((limb) ((s32) in2[3])) * ((s32) in[4]) +
  124. ((limb) ((s32) in2[4])) * ((s32) in[3]) +
  125. ((limb) ((s32) in2[2])) * ((s32) in[5]) +
  126. ((limb) ((s32) in2[5])) * ((s32) in[2]) +
  127. ((limb) ((s32) in2[1])) * ((s32) in[6]) +
  128. ((limb) ((s32) in2[6])) * ((s32) in[1]) +
  129. ((limb) ((s32) in2[0])) * ((s32) in[7]) +
  130. ((limb) ((s32) in2[7])) * ((s32) in[0]);
  131. output[8] = ((limb) ((s32) in2[4])) * ((s32) in[4]) +
  132. 2 * (((limb) ((s32) in2[3])) * ((s32) in[5]) +
  133. ((limb) ((s32) in2[5])) * ((s32) in[3]) +
  134. ((limb) ((s32) in2[1])) * ((s32) in[7]) +
  135. ((limb) ((s32) in2[7])) * ((s32) in[1])) +
  136. ((limb) ((s32) in2[2])) * ((s32) in[6]) +
  137. ((limb) ((s32) in2[6])) * ((s32) in[2]) +
  138. ((limb) ((s32) in2[0])) * ((s32) in[8]) +
  139. ((limb) ((s32) in2[8])) * ((s32) in[0]);
  140. output[9] = ((limb) ((s32) in2[4])) * ((s32) in[5]) +
  141. ((limb) ((s32) in2[5])) * ((s32) in[4]) +
  142. ((limb) ((s32) in2[3])) * ((s32) in[6]) +
  143. ((limb) ((s32) in2[6])) * ((s32) in[3]) +
  144. ((limb) ((s32) in2[2])) * ((s32) in[7]) +
  145. ((limb) ((s32) in2[7])) * ((s32) in[2]) +
  146. ((limb) ((s32) in2[1])) * ((s32) in[8]) +
  147. ((limb) ((s32) in2[8])) * ((s32) in[1]) +
  148. ((limb) ((s32) in2[0])) * ((s32) in[9]) +
  149. ((limb) ((s32) in2[9])) * ((s32) in[0]);
  150. output[10] = 2 * (((limb) ((s32) in2[5])) * ((s32) in[5]) +
  151. ((limb) ((s32) in2[3])) * ((s32) in[7]) +
  152. ((limb) ((s32) in2[7])) * ((s32) in[3]) +
  153. ((limb) ((s32) in2[1])) * ((s32) in[9]) +
  154. ((limb) ((s32) in2[9])) * ((s32) in[1])) +
  155. ((limb) ((s32) in2[4])) * ((s32) in[6]) +
  156. ((limb) ((s32) in2[6])) * ((s32) in[4]) +
  157. ((limb) ((s32) in2[2])) * ((s32) in[8]) +
  158. ((limb) ((s32) in2[8])) * ((s32) in[2]);
  159. output[11] = ((limb) ((s32) in2[5])) * ((s32) in[6]) +
  160. ((limb) ((s32) in2[6])) * ((s32) in[5]) +
  161. ((limb) ((s32) in2[4])) * ((s32) in[7]) +
  162. ((limb) ((s32) in2[7])) * ((s32) in[4]) +
  163. ((limb) ((s32) in2[3])) * ((s32) in[8]) +
  164. ((limb) ((s32) in2[8])) * ((s32) in[3]) +
  165. ((limb) ((s32) in2[2])) * ((s32) in[9]) +
  166. ((limb) ((s32) in2[9])) * ((s32) in[2]);
  167. output[12] = ((limb) ((s32) in2[6])) * ((s32) in[6]) +
  168. 2 * (((limb) ((s32) in2[5])) * ((s32) in[7]) +
  169. ((limb) ((s32) in2[7])) * ((s32) in[5]) +
  170. ((limb) ((s32) in2[3])) * ((s32) in[9]) +
  171. ((limb) ((s32) in2[9])) * ((s32) in[3])) +
  172. ((limb) ((s32) in2[4])) * ((s32) in[8]) +
  173. ((limb) ((s32) in2[8])) * ((s32) in[4]);
  174. output[13] = ((limb) ((s32) in2[6])) * ((s32) in[7]) +
  175. ((limb) ((s32) in2[7])) * ((s32) in[6]) +
  176. ((limb) ((s32) in2[5])) * ((s32) in[8]) +
  177. ((limb) ((s32) in2[8])) * ((s32) in[5]) +
  178. ((limb) ((s32) in2[4])) * ((s32) in[9]) +
  179. ((limb) ((s32) in2[9])) * ((s32) in[4]);
  180. output[14] = 2 * (((limb) ((s32) in2[7])) * ((s32) in[7]) +
  181. ((limb) ((s32) in2[5])) * ((s32) in[9]) +
  182. ((limb) ((s32) in2[9])) * ((s32) in[5])) +
  183. ((limb) ((s32) in2[6])) * ((s32) in[8]) +
  184. ((limb) ((s32) in2[8])) * ((s32) in[6]);
  185. output[15] = ((limb) ((s32) in2[7])) * ((s32) in[8]) +
  186. ((limb) ((s32) in2[8])) * ((s32) in[7]) +
  187. ((limb) ((s32) in2[6])) * ((s32) in[9]) +
  188. ((limb) ((s32) in2[9])) * ((s32) in[6]);
  189. output[16] = ((limb) ((s32) in2[8])) * ((s32) in[8]) +
  190. 2 * (((limb) ((s32) in2[7])) * ((s32) in[9]) +
  191. ((limb) ((s32) in2[9])) * ((s32) in[7]));
  192. output[17] = ((limb) ((s32) in2[8])) * ((s32) in[9]) +
  193. ((limb) ((s32) in2[9])) * ((s32) in[8]);
  194. output[18] = 2 * ((limb) ((s32) in2[9])) * ((s32) in[9]);
  195. }
  196. /* Reduce a long form to a short form by taking the input mod 2^255 - 19. */
  197. static void freduce_degree(limb *output) {
  198. /* Each of these shifts and adds ends up multiplying the value by 19. */
  199. output[8] += output[18] << 4;
  200. output[8] += output[18] << 1;
  201. output[8] += output[18];
  202. output[7] += output[17] << 4;
  203. output[7] += output[17] << 1;
  204. output[7] += output[17];
  205. output[6] += output[16] << 4;
  206. output[6] += output[16] << 1;
  207. output[6] += output[16];
  208. output[5] += output[15] << 4;
  209. output[5] += output[15] << 1;
  210. output[5] += output[15];
  211. output[4] += output[14] << 4;
  212. output[4] += output[14] << 1;
  213. output[4] += output[14];
  214. output[3] += output[13] << 4;
  215. output[3] += output[13] << 1;
  216. output[3] += output[13];
  217. output[2] += output[12] << 4;
  218. output[2] += output[12] << 1;
  219. output[2] += output[12];
  220. output[1] += output[11] << 4;
  221. output[1] += output[11] << 1;
  222. output[1] += output[11];
  223. output[0] += output[10] << 4;
  224. output[0] += output[10] << 1;
  225. output[0] += output[10];
  226. }
  227. #if (-1 & 3) != 3
  228. #error "This code only works on a two's complement system"
  229. #endif
  230. /* return v / 2^26, using only shifts and adds. */
  231. static inline limb
  232. div_by_2_26(const limb v)
  233. {
  234. /* High word of v; no shift needed*/
  235. const uint32_t highword = (uint32_t) (((uint64_t) v) >> 32);
  236. /* Set to all 1s if v was negative; else set to 0s. */
  237. const int32_t sign = ((int32_t) highword) >> 31;
  238. /* Set to 0x3ffffff if v was negative; else set to 0. */
  239. const int32_t roundoff = ((uint32_t) sign) >> 6;
  240. /* Should return v / (1<<26) */
  241. return (v + roundoff) >> 26;
  242. }
  243. /* return v / (2^25), using only shifts and adds. */
  244. static inline limb
  245. div_by_2_25(const limb v)
  246. {
  247. /* High word of v; no shift needed*/
  248. const uint32_t highword = (uint32_t) (((uint64_t) v) >> 32);
  249. /* Set to all 1s if v was negative; else set to 0s. */
  250. const int32_t sign = ((int32_t) highword) >> 31;
  251. /* Set to 0x1ffffff if v was negative; else set to 0. */
  252. const int32_t roundoff = ((uint32_t) sign) >> 7;
  253. /* Should return v / (1<<25) */
  254. return (v + roundoff) >> 25;
  255. }
  256. static inline s32
  257. div_s32_by_2_25(const s32 v)
  258. {
  259. const s32 roundoff = ((uint32_t)(v >> 31)) >> 7;
  260. return (v + roundoff) >> 25;
  261. }
  262. /* Reduce all coefficients of the short form input so that |x| < 2^26.
  263. *
  264. * On entry: |output[i]| < 2^62
  265. */
  266. static void freduce_coefficients(limb *output) {
  267. unsigned i;
  268. output[10] = 0;
  269. for (i = 0; i < 10; i += 2) {
  270. limb over = div_by_2_26(output[i]);
  271. output[i] -= over << 26;
  272. output[i+1] += over;
  273. over = div_by_2_25(output[i+1]);
  274. output[i+1] -= over << 25;
  275. output[i+2] += over;
  276. }
  277. /* Now |output[10]| < 2 ^ 38 and all other coefficients are reduced. */
  278. output[0] += output[10] << 4;
  279. output[0] += output[10] << 1;
  280. output[0] += output[10];
  281. output[10] = 0;
  282. /* Now output[1..9] are reduced, and |output[0]| < 2^26 + 19 * 2^38
  283. * So |over| will be no more than 77825 */
  284. {
  285. limb over = div_by_2_26(output[0]);
  286. output[0] -= over << 26;
  287. output[1] += over;
  288. }
  289. /* Now output[0,2..9] are reduced, and |output[1]| < 2^25 + 77825
  290. * So |over| will be no more than 1. */
  291. {
  292. /* output[1] fits in 32 bits, so we can use div_s32_by_2_25 here. */
  293. s32 over32 = div_s32_by_2_25((s32) output[1]);
  294. output[1] -= over32 << 25;
  295. output[2] += over32;
  296. }
  297. /* Finally, output[0,1,3..9] are reduced, and output[2] is "nearly reduced":
  298. * we have |output[2]| <= 2^26. This is good enough for all of our math,
  299. * but it will require an extra freduce_coefficients before fcontract. */
  300. }
  301. /* A helpful wrapper around fproduct: output = in * in2.
  302. *
  303. * output must be distinct to both inputs. The output is reduced degree and
  304. * reduced coefficient.
  305. */
  306. static void
  307. fmul(limb *output, const limb *in, const limb *in2) {
  308. limb t[19];
  309. fproduct(t, in, in2);
  310. freduce_degree(t);
  311. freduce_coefficients(t);
  312. memcpy(output, t, sizeof(limb) * 10);
  313. }
  314. static void fsquare_inner(limb *output, const limb *in) {
  315. output[0] = ((limb) ((s32) in[0])) * ((s32) in[0]);
  316. output[1] = 2 * ((limb) ((s32) in[0])) * ((s32) in[1]);
  317. output[2] = 2 * (((limb) ((s32) in[1])) * ((s32) in[1]) +
  318. ((limb) ((s32) in[0])) * ((s32) in[2]));
  319. output[3] = 2 * (((limb) ((s32) in[1])) * ((s32) in[2]) +
  320. ((limb) ((s32) in[0])) * ((s32) in[3]));
  321. output[4] = ((limb) ((s32) in[2])) * ((s32) in[2]) +
  322. 4 * ((limb) ((s32) in[1])) * ((s32) in[3]) +
  323. 2 * ((limb) ((s32) in[0])) * ((s32) in[4]);
  324. output[5] = 2 * (((limb) ((s32) in[2])) * ((s32) in[3]) +
  325. ((limb) ((s32) in[1])) * ((s32) in[4]) +
  326. ((limb) ((s32) in[0])) * ((s32) in[5]));
  327. output[6] = 2 * (((limb) ((s32) in[3])) * ((s32) in[3]) +
  328. ((limb) ((s32) in[2])) * ((s32) in[4]) +
  329. ((limb) ((s32) in[0])) * ((s32) in[6]) +
  330. 2 * ((limb) ((s32) in[1])) * ((s32) in[5]));
  331. output[7] = 2 * (((limb) ((s32) in[3])) * ((s32) in[4]) +
  332. ((limb) ((s32) in[2])) * ((s32) in[5]) +
  333. ((limb) ((s32) in[1])) * ((s32) in[6]) +
  334. ((limb) ((s32) in[0])) * ((s32) in[7]));
  335. output[8] = ((limb) ((s32) in[4])) * ((s32) in[4]) +
  336. 2 * (((limb) ((s32) in[2])) * ((s32) in[6]) +
  337. ((limb) ((s32) in[0])) * ((s32) in[8]) +
  338. 2 * (((limb) ((s32) in[1])) * ((s32) in[7]) +
  339. ((limb) ((s32) in[3])) * ((s32) in[5])));
  340. output[9] = 2 * (((limb) ((s32) in[4])) * ((s32) in[5]) +
  341. ((limb) ((s32) in[3])) * ((s32) in[6]) +
  342. ((limb) ((s32) in[2])) * ((s32) in[7]) +
  343. ((limb) ((s32) in[1])) * ((s32) in[8]) +
  344. ((limb) ((s32) in[0])) * ((s32) in[9]));
  345. output[10] = 2 * (((limb) ((s32) in[5])) * ((s32) in[5]) +
  346. ((limb) ((s32) in[4])) * ((s32) in[6]) +
  347. ((limb) ((s32) in[2])) * ((s32) in[8]) +
  348. 2 * (((limb) ((s32) in[3])) * ((s32) in[7]) +
  349. ((limb) ((s32) in[1])) * ((s32) in[9])));
  350. output[11] = 2 * (((limb) ((s32) in[5])) * ((s32) in[6]) +
  351. ((limb) ((s32) in[4])) * ((s32) in[7]) +
  352. ((limb) ((s32) in[3])) * ((s32) in[8]) +
  353. ((limb) ((s32) in[2])) * ((s32) in[9]));
  354. output[12] = ((limb) ((s32) in[6])) * ((s32) in[6]) +
  355. 2 * (((limb) ((s32) in[4])) * ((s32) in[8]) +
  356. 2 * (((limb) ((s32) in[5])) * ((s32) in[7]) +
  357. ((limb) ((s32) in[3])) * ((s32) in[9])));
  358. output[13] = 2 * (((limb) ((s32) in[6])) * ((s32) in[7]) +
  359. ((limb) ((s32) in[5])) * ((s32) in[8]) +
  360. ((limb) ((s32) in[4])) * ((s32) in[9]));
  361. output[14] = 2 * (((limb) ((s32) in[7])) * ((s32) in[7]) +
  362. ((limb) ((s32) in[6])) * ((s32) in[8]) +
  363. 2 * ((limb) ((s32) in[5])) * ((s32) in[9]));
  364. output[15] = 2 * (((limb) ((s32) in[7])) * ((s32) in[8]) +
  365. ((limb) ((s32) in[6])) * ((s32) in[9]));
  366. output[16] = ((limb) ((s32) in[8])) * ((s32) in[8]) +
  367. 4 * ((limb) ((s32) in[7])) * ((s32) in[9]);
  368. output[17] = 2 * ((limb) ((s32) in[8])) * ((s32) in[9]);
  369. output[18] = 2 * ((limb) ((s32) in[9])) * ((s32) in[9]);
  370. }
  371. static void
  372. fsquare(limb *output, const limb *in) {
  373. limb t[19];
  374. fsquare_inner(t, in);
  375. freduce_degree(t);
  376. freduce_coefficients(t);
  377. memcpy(output, t, sizeof(limb) * 10);
  378. }
  379. /* Take a little-endian, 32-byte number and expand it into polynomial form */
  380. static void
  381. fexpand(limb *output, const u8 *input) {
  382. #define F(n,start,shift,mask) \
  383. output[n] = ((((limb) input[start + 0]) | \
  384. ((limb) input[start + 1]) << 8 | \
  385. ((limb) input[start + 2]) << 16 | \
  386. ((limb) input[start + 3]) << 24) >> shift) & mask;
  387. F(0, 0, 0, 0x3ffffff);
  388. F(1, 3, 2, 0x1ffffff);
  389. F(2, 6, 3, 0x3ffffff);
  390. F(3, 9, 5, 0x1ffffff);
  391. F(4, 12, 6, 0x3ffffff);
  392. F(5, 16, 0, 0x1ffffff);
  393. F(6, 19, 1, 0x3ffffff);
  394. F(7, 22, 3, 0x1ffffff);
  395. F(8, 25, 4, 0x3ffffff);
  396. F(9, 28, 6, 0x3ffffff);
  397. #undef F
  398. }
  399. #if (-32 >> 1) != -16
  400. #error "This code only works when >> does sign-extension on negative numbers"
  401. #endif
  402. /* Take a fully reduced polynomial form number and contract it into a
  403. * little-endian, 32-byte array
  404. */
  405. static void
  406. fcontract(u8 *output, limb *input) {
  407. int i;
  408. int j;
  409. for (j = 0; j < 2; ++j) {
  410. for (i = 0; i < 9; ++i) {
  411. if ((i & 1) == 1) {
  412. /* This calculation is a time-invariant way to make input[i] positive
  413. by borrowing from the next-larger limb.
  414. */
  415. const s32 mask = (s32)(input[i]) >> 31;
  416. const s32 carry = -(((s32)(input[i]) & mask) >> 25);
  417. input[i] = (s32)(input[i]) + (carry << 25);
  418. input[i+1] = (s32)(input[i+1]) - carry;
  419. } else {
  420. const s32 mask = (s32)(input[i]) >> 31;
  421. const s32 carry = -(((s32)(input[i]) & mask) >> 26);
  422. input[i] = (s32)(input[i]) + (carry << 26);
  423. input[i+1] = (s32)(input[i+1]) - carry;
  424. }
  425. }
  426. {
  427. const s32 mask = (s32)(input[9]) >> 31;
  428. const s32 carry = -(((s32)(input[9]) & mask) >> 25);
  429. input[9] = (s32)(input[9]) + (carry << 25);
  430. input[0] = (s32)(input[0]) - (carry * 19);
  431. }
  432. }
  433. /* The first borrow-propagation pass above ended with every limb
  434. except (possibly) input[0] non-negative.
  435. Since each input limb except input[0] is decreased by at most 1
  436. by a borrow-propagation pass, the second borrow-propagation pass
  437. could only have wrapped around to decrease input[0] again if the
  438. first pass left input[0] negative *and* input[1] through input[9]
  439. were all zero. In that case, input[1] is now 2^25 - 1, and this
  440. last borrow-propagation step will leave input[1] non-negative.
  441. */
  442. {
  443. const s32 mask = (s32)(input[0]) >> 31;
  444. const s32 carry = -(((s32)(input[0]) & mask) >> 26);
  445. input[0] = (s32)(input[0]) + (carry << 26);
  446. input[1] = (s32)(input[1]) - carry;
  447. }
  448. /* Both passes through the above loop, plus the last 0-to-1 step, are
  449. necessary: if input[9] is -1 and input[0] through input[8] are 0,
  450. negative values will remain in the array until the end.
  451. */
  452. input[1] <<= 2;
  453. input[2] <<= 3;
  454. input[3] <<= 5;
  455. input[4] <<= 6;
  456. input[6] <<= 1;
  457. input[7] <<= 3;
  458. input[8] <<= 4;
  459. input[9] <<= 6;
  460. #define F(i, s) \
  461. output[s+0] |= input[i] & 0xff; \
  462. output[s+1] = (input[i] >> 8) & 0xff; \
  463. output[s+2] = (input[i] >> 16) & 0xff; \
  464. output[s+3] = (input[i] >> 24) & 0xff;
  465. output[0] = 0;
  466. output[16] = 0;
  467. F(0,0);
  468. F(1,3);
  469. F(2,6);
  470. F(3,9);
  471. F(4,12);
  472. F(5,16);
  473. F(6,19);
  474. F(7,22);
  475. F(8,25);
  476. F(9,28);
  477. #undef F
  478. }
  479. /* Input: Q, Q', Q-Q'
  480. * Output: 2Q, Q+Q'
  481. *
  482. * x2 z3: long form
  483. * x3 z3: long form
  484. * x z: short form, destroyed
  485. * xprime zprime: short form, destroyed
  486. * qmqp: short form, preserved
  487. */
  488. static void fmonty(limb *x2, limb *z2, /* output 2Q */
  489. limb *x3, limb *z3, /* output Q + Q' */
  490. limb *x, limb *z, /* input Q */
  491. limb *xprime, limb *zprime, /* input Q' */
  492. const limb *qmqp /* input Q - Q' */) {
  493. limb origx[10], origxprime[10], zzz[19], xx[19], zz[19], xxprime[19],
  494. zzprime[19], zzzprime[19], xxxprime[19];
  495. memcpy(origx, x, 10 * sizeof(limb));
  496. fsum(x, z);
  497. fdifference(z, origx); // does x - z
  498. memcpy(origxprime, xprime, sizeof(limb) * 10);
  499. fsum(xprime, zprime);
  500. fdifference(zprime, origxprime);
  501. fproduct(xxprime, xprime, z);
  502. fproduct(zzprime, x, zprime);
  503. freduce_degree(xxprime);
  504. freduce_coefficients(xxprime);
  505. freduce_degree(zzprime);
  506. freduce_coefficients(zzprime);
  507. memcpy(origxprime, xxprime, sizeof(limb) * 10);
  508. fsum(xxprime, zzprime);
  509. fdifference(zzprime, origxprime);
  510. fsquare(xxxprime, xxprime);
  511. fsquare(zzzprime, zzprime);
  512. fproduct(zzprime, zzzprime, qmqp);
  513. freduce_degree(zzprime);
  514. freduce_coefficients(zzprime);
  515. memcpy(x3, xxxprime, sizeof(limb) * 10);
  516. memcpy(z3, zzprime, sizeof(limb) * 10);
  517. fsquare(xx, x);
  518. fsquare(zz, z);
  519. fproduct(x2, xx, zz);
  520. freduce_degree(x2);
  521. freduce_coefficients(x2);
  522. fdifference(zz, xx); // does zz = xx - zz
  523. memset(zzz + 10, 0, sizeof(limb) * 9);
  524. fscalar_product(zzz, zz, 121665);
  525. /* No need to call freduce_degree here:
  526. fscalar_product doesn't increase the degree of its input. */
  527. freduce_coefficients(zzz);
  528. fsum(zzz, xx);
  529. fproduct(z2, zz, zzz);
  530. freduce_degree(z2);
  531. freduce_coefficients(z2);
  532. }
  533. /* Conditionally swap two reduced-form limb arrays if 'iswap' is 1, but leave
  534. * them unchanged if 'iswap' is 0. Runs in data-invariant time to avoid
  535. * side-channel attacks.
  536. *
  537. * NOTE that this function requires that 'iswap' be 1 or 0; other values give
  538. * wrong results. Also, the two limb arrays must be in reduced-coefficient,
  539. * reduced-degree form: the values in a[10..19] or b[10..19] aren't swapped,
  540. * and all all values in a[0..9],b[0..9] must have magnitude less than
  541. * INT32_MAX.
  542. */
  543. static void
  544. swap_conditional(limb a[19], limb b[19], limb iswap) {
  545. unsigned i;
  546. const s32 swap = (s32) -iswap;
  547. for (i = 0; i < 10; ++i) {
  548. const s32 x = swap & ( ((s32)a[i]) ^ ((s32)b[i]) );
  549. a[i] = ((s32)a[i]) ^ x;
  550. b[i] = ((s32)b[i]) ^ x;
  551. }
  552. }
  553. /* Calculates nQ where Q is the x-coordinate of a point on the curve
  554. *
  555. * resultx/resultz: the x coordinate of the resulting curve point (short form)
  556. * n: a little endian, 32-byte number
  557. * q: a point of the curve (short form)
  558. */
  559. static void
  560. cmult(limb *resultx, limb *resultz, const u8 *n, const limb *q) {
  561. limb a[19] = {0}, b[19] = {1}, c[19] = {1}, d[19] = {0};
  562. limb *nqpqx = a, *nqpqz = b, *nqx = c, *nqz = d, *t;
  563. limb e[19] = {0}, f[19] = {1}, g[19] = {0}, h[19] = {1};
  564. limb *nqpqx2 = e, *nqpqz2 = f, *nqx2 = g, *nqz2 = h;
  565. unsigned i, j;
  566. memcpy(nqpqx, q, sizeof(limb) * 10);
  567. for (i = 0; i < 32; ++i) {
  568. u8 byte = n[31 - i];
  569. for (j = 0; j < 8; ++j) {
  570. const limb bit = byte >> 7;
  571. swap_conditional(nqx, nqpqx, bit);
  572. swap_conditional(nqz, nqpqz, bit);
  573. fmonty(nqx2, nqz2,
  574. nqpqx2, nqpqz2,
  575. nqx, nqz,
  576. nqpqx, nqpqz,
  577. q);
  578. swap_conditional(nqx2, nqpqx2, bit);
  579. swap_conditional(nqz2, nqpqz2, bit);
  580. t = nqx;
  581. nqx = nqx2;
  582. nqx2 = t;
  583. t = nqz;
  584. nqz = nqz2;
  585. nqz2 = t;
  586. t = nqpqx;
  587. nqpqx = nqpqx2;
  588. nqpqx2 = t;
  589. t = nqpqz;
  590. nqpqz = nqpqz2;
  591. nqpqz2 = t;
  592. byte <<= 1;
  593. }
  594. }
  595. memcpy(resultx, nqx, sizeof(limb) * 10);
  596. memcpy(resultz, nqz, sizeof(limb) * 10);
  597. }
  598. // -----------------------------------------------------------------------------
  599. // Shamelessly copied from djb's code
  600. // -----------------------------------------------------------------------------
  601. static void
  602. crecip(limb *out, const limb *z) {
  603. limb z2[10];
  604. limb z9[10];
  605. limb z11[10];
  606. limb z2_5_0[10];
  607. limb z2_10_0[10];
  608. limb z2_20_0[10];
  609. limb z2_50_0[10];
  610. limb z2_100_0[10];
  611. limb t0[10];
  612. limb t1[10];
  613. int i;
  614. /* 2 */ fsquare(z2,z);
  615. /* 4 */ fsquare(t1,z2);
  616. /* 8 */ fsquare(t0,t1);
  617. /* 9 */ fmul(z9,t0,z);
  618. /* 11 */ fmul(z11,z9,z2);
  619. /* 22 */ fsquare(t0,z11);
  620. /* 2^5 - 2^0 = 31 */ fmul(z2_5_0,t0,z9);
  621. /* 2^6 - 2^1 */ fsquare(t0,z2_5_0);
  622. /* 2^7 - 2^2 */ fsquare(t1,t0);
  623. /* 2^8 - 2^3 */ fsquare(t0,t1);
  624. /* 2^9 - 2^4 */ fsquare(t1,t0);
  625. /* 2^10 - 2^5 */ fsquare(t0,t1);
  626. /* 2^10 - 2^0 */ fmul(z2_10_0,t0,z2_5_0);
  627. /* 2^11 - 2^1 */ fsquare(t0,z2_10_0);
  628. /* 2^12 - 2^2 */ fsquare(t1,t0);
  629. /* 2^20 - 2^10 */ for (i = 2;i < 10;i += 2) { fsquare(t0,t1); fsquare(t1,t0); }
  630. /* 2^20 - 2^0 */ fmul(z2_20_0,t1,z2_10_0);
  631. /* 2^21 - 2^1 */ fsquare(t0,z2_20_0);
  632. /* 2^22 - 2^2 */ fsquare(t1,t0);
  633. /* 2^40 - 2^20 */ for (i = 2;i < 20;i += 2) { fsquare(t0,t1); fsquare(t1,t0); }
  634. /* 2^40 - 2^0 */ fmul(t0,t1,z2_20_0);
  635. /* 2^41 - 2^1 */ fsquare(t1,t0);
  636. /* 2^42 - 2^2 */ fsquare(t0,t1);
  637. /* 2^50 - 2^10 */ for (i = 2;i < 10;i += 2) { fsquare(t1,t0); fsquare(t0,t1); }
  638. /* 2^50 - 2^0 */ fmul(z2_50_0,t0,z2_10_0);
  639. /* 2^51 - 2^1 */ fsquare(t0,z2_50_0);
  640. /* 2^52 - 2^2 */ fsquare(t1,t0);
  641. /* 2^100 - 2^50 */ for (i = 2;i < 50;i += 2) { fsquare(t0,t1); fsquare(t1,t0); }
  642. /* 2^100 - 2^0 */ fmul(z2_100_0,t1,z2_50_0);
  643. /* 2^101 - 2^1 */ fsquare(t1,z2_100_0);
  644. /* 2^102 - 2^2 */ fsquare(t0,t1);
  645. /* 2^200 - 2^100 */ for (i = 2;i < 100;i += 2) { fsquare(t1,t0); fsquare(t0,t1); }
  646. /* 2^200 - 2^0 */ fmul(t1,t0,z2_100_0);
  647. /* 2^201 - 2^1 */ fsquare(t0,t1);
  648. /* 2^202 - 2^2 */ fsquare(t1,t0);
  649. /* 2^250 - 2^50 */ for (i = 2;i < 50;i += 2) { fsquare(t0,t1); fsquare(t1,t0); }
  650. /* 2^250 - 2^0 */ fmul(t0,t1,z2_50_0);
  651. /* 2^251 - 2^1 */ fsquare(t1,t0);
  652. /* 2^252 - 2^2 */ fsquare(t0,t1);
  653. /* 2^253 - 2^3 */ fsquare(t1,t0);
  654. /* 2^254 - 2^4 */ fsquare(t0,t1);
  655. /* 2^255 - 2^5 */ fsquare(t1,t0);
  656. /* 2^255 - 21 */ fmul(out,t1,z11);
  657. }
  658. int
  659. crypto_scalarmult_curve25519_donna(u8 *mypublic, const u8 *secret, const u8 *basepoint) {
  660. limb bp[10], x[10], z[11], zmone[10];
  661. uint8_t e[32];
  662. int i;
  663. for (i = 0; i < 32; ++i) e[i] = secret[i];
  664. e[0] &= 248;
  665. e[31] &= 127;
  666. e[31] |= 64;
  667. fexpand(bp, basepoint);
  668. cmult(x, z, e, bp);
  669. crecip(zmone, z);
  670. fmul(z, x, zmone);
  671. freduce_coefficients(z);
  672. fcontract(mypublic, z);
  673. return 0;
  674. }