java - للصف - طريقة ايجاد الجذر التربيعي للعدد العشري




أسرع طريقة لتحديد ما إذا كان الجذر التربيعي لعدد صحيح هو عدد صحيح (20)

I want this function to work with all positive 64-bit signed integers

Math.sqrt() works with doubles as input parameters, so you won't get accurate results for integers bigger than 2^53 .

أنا أبحث عن أسرع طريقة لتحديد ما إذا كانت قيمة long مربع مثالي (أي الجذر التربيعي هو عدد صحيح آخر). لقد فعلت ذلك بطريقة سهلة ، باستخدام الدالة Math.sqrt () المضمنة ، ولكن أنا أتساءل إذا كان هناك طريقة للقيام بذلك بشكل أسرع عن طريق تقييد نفسك إلى مجال صحيح فقط. المحافظة على جدول بحث غير نهائية (حيث يوجد حوالي 31.5 2 أعداد صحيحة أقل من 2 63 ).

هذه هي الطريقة البسيطة والبسيطة التي أقوم بها الآن:

public final static boolean isPerfectSquare(long n)
{
  if (n < 0)
    return false;

  long tst = (long)(Math.sqrt(n) + 0.5);
  return tst*tst == n;
}

ملاحظات: أستخدم هذه الوظيفة في العديد من مشاكل مشروع Euler . لذلك لن يضطر أي شخص آخر للحفاظ على هذا الرمز. ويمكن لهذا النوع من التحسين الجزئي أن يحدث فرقا بالفعل ، لأن جزءًا من التحدي هو القيام بكل خوارزمية في أقل من دقيقة ، وستحتاج هذه الوظيفة إلى تسمية ملايين المرات في بعض المشاكل.

تحديث 2 : أثبتت الحلول الجديدة التي نشرها A. Rex أنها أسرع. في تشغيل أكثر من أول بليون صحيح ، تطلب الحل فقط 34٪ من الوقت الذي استخدم فيه الحل الأصلي. في حين أن الاختراق John Carmack أفضل قليلاً بالنسبة للقيم الصغيرة لـ n ، فإن الفائدة مقارنة بهذا الحل صغيرة جدًا.

هنا هو حل A. Rex ، وتحويلها إلى جافا:

private final static boolean isPerfectSquare(long n)
{
  // Quickfail
  if( n < 0 || ((n&2) != 0) || ((n & 7) == 5) || ((n & 11) == 8) )
    return false;
  if( n == 0 )
    return true;

  // Check mod 255 = 3 * 5 * 17, for fun
  long y = n;
  y = (y & 0xffffffffL) + (y >> 32);
  y = (y & 0xffffL) + (y >> 16);
  y = (y & 0xffL) + ((y >> 8) & 0xffL) + (y >> 16);
  if( bad255[(int)y] )
      return false;

  // Divide out powers of 4 using binary search
  if((n & 0xffffffffL) == 0)
      n >>= 32;
  if((n & 0xffffL) == 0)
      n >>= 16;
  if((n & 0xffL) == 0)
      n >>= 8;
  if((n & 0xfL) == 0)
      n >>= 4;
  if((n & 0x3L) == 0)
      n >>= 2;

  if((n & 0x7L) != 1)
      return false;

  // Compute sqrt using something like Hensel's lemma
  long r, t, z;
  r = start[(int)((n >> 3) & 0x3ffL)];
  do {
    z = n - r * r;
    if( z == 0 )
      return true;
    if( z < 0 )
      return false;
    t = z & (-z);
    r += (z & t) >> 1;
    if( r > (t  >> 1) )
    r = t - r;
  } while( t <= (1L << 33) );
  return false;
}

private static boolean[] bad255 =
{
   false,false,true ,true ,false,true ,true ,true ,true ,false,true ,true ,true ,
   true ,true ,false,false,true ,true ,false,true ,false,true ,true ,true ,false,
   true ,true ,true ,true ,false,true ,true ,true ,false,true ,false,true ,true ,
   true ,true ,true ,true ,true ,true ,true ,true ,true ,true ,false,true ,false,
   true ,true ,true ,false,true ,true ,true ,true ,false,true ,true ,true ,false,
   true ,false,true ,true ,false,false,true ,true ,true ,true ,true ,false,true ,
   true ,true ,true ,false,true ,true ,false,false,true ,true ,true ,true ,true ,
   true ,true ,true ,false,true ,true ,true ,true ,true ,false,true ,true ,true ,
   true ,true ,false,true ,true ,true ,true ,false,true ,true ,true ,false,true ,
   true ,true ,true ,false,false,true ,true ,true ,true ,true ,true ,true ,true ,
   true ,true ,true ,true ,true ,false,false,true ,true ,true ,true ,true ,true ,
   true ,false,false,true ,true ,true ,true ,true ,false,true ,true ,false,true ,
   true ,true ,true ,true ,true ,true ,true ,true ,true ,true ,false,true ,true ,
   false,true ,false,true ,true ,false,true ,true ,true ,true ,true ,true ,true ,
   true ,true ,true ,true ,false,true ,true ,false,true ,true ,true ,true ,true ,
   false,false,true ,true ,true ,true ,true ,true ,true ,false,false,true ,true ,
   true ,true ,true ,true ,true ,true ,true ,true ,true ,true ,true ,false,false,
   true ,true ,true ,true ,false,true ,true ,true ,false,true ,true ,true ,true ,
   false,true ,true ,true ,true ,true ,false,true ,true ,true ,true ,true ,false,
   true ,true ,true ,true ,true ,true ,true ,true ,false,false,true ,true ,false,
   true ,true ,true ,true ,false,true ,true ,true ,true ,true ,false,false,true ,
   true ,false,true ,false,true ,true ,true ,false,true ,true ,true ,true ,false,
   true ,true ,true ,false,true ,false,true ,true ,true ,true ,true ,true ,true ,
   true ,true ,true ,true ,true ,false,true ,false,true ,true ,true ,false,true ,
   true ,true ,true ,false,true ,true ,true ,false,true ,false,true ,true ,false,
   false,true ,true ,true ,true ,true ,false,true ,true ,true ,true ,false,true ,
   true ,false,false,true ,true ,true ,true ,true ,true ,true ,true ,false,true ,
   true ,true ,true ,true ,false,true ,true ,true ,true ,true ,false,true ,true ,
   true ,true ,false,true ,true ,true ,false,true ,true ,true ,true ,false,false,
   true ,true ,true ,true ,true ,true ,true ,true ,true ,true ,true ,true ,true ,
   false,false,true ,true ,true ,true ,true ,true ,true ,false,false,true ,true ,
   true ,true ,true ,false,true ,true ,false,true ,true ,true ,true ,true ,true ,
   true ,true ,true ,true ,true ,false,true ,true ,false,true ,false,true ,true ,
   false,true ,true ,true ,true ,true ,true ,true ,true ,true ,true ,true ,false,
   true ,true ,false,true ,true ,true ,true ,true ,false,false,true ,true ,true ,
   true ,true ,true ,true ,false,false,true ,true ,true ,true ,true ,true ,true ,
   true ,true ,true ,true ,true ,true ,false,false,true ,true ,true ,true ,false,
   true ,true ,true ,false,true ,true ,true ,true ,false,true ,true ,true ,true ,
   true ,false,true ,true ,true ,true ,true ,false,true ,true ,true ,true ,true ,
   true ,true ,true ,false,false
};

private static int[] start =
{
  1,3,1769,5,1937,1741,7,1451,479,157,9,91,945,659,1817,11,
  1983,707,1321,1211,1071,13,1479,405,415,1501,1609,741,15,339,1703,203,
  129,1411,873,1669,17,1715,1145,1835,351,1251,887,1573,975,19,1127,395,
  1855,1981,425,453,1105,653,327,21,287,93,713,1691,1935,301,551,587,
  257,1277,23,763,1903,1075,1799,1877,223,1437,1783,859,1201,621,25,779,
  1727,573,471,1979,815,1293,825,363,159,1315,183,27,241,941,601,971,
  385,131,919,901,273,435,647,1493,95,29,1417,805,719,1261,1177,1163,
  1599,835,1367,315,1361,1933,1977,747,31,1373,1079,1637,1679,1581,1753,1355,
  513,1539,1815,1531,1647,205,505,1109,33,1379,521,1627,1457,1901,1767,1547,
  1471,1853,1833,1349,559,1523,967,1131,97,35,1975,795,497,1875,1191,1739,
  641,1149,1385,133,529,845,1657,725,161,1309,375,37,463,1555,615,1931,
  1343,445,937,1083,1617,883,185,1515,225,1443,1225,869,1423,1235,39,1973,
  769,259,489,1797,1391,1485,1287,341,289,99,1271,1701,1713,915,537,1781,
  1215,963,41,581,303,243,1337,1899,353,1245,329,1563,753,595,1113,1589,
  897,1667,407,635,785,1971,135,43,417,1507,1929,731,207,275,1689,1397,
  1087,1725,855,1851,1873,397,1607,1813,481,163,567,101,1167,45,1831,1205,
  1025,1021,1303,1029,1135,1331,1017,427,545,1181,1033,933,1969,365,1255,1013,
  959,317,1751,187,47,1037,455,1429,609,1571,1463,1765,1009,685,679,821,
  1153,387,1897,1403,1041,691,1927,811,673,227,137,1499,49,1005,103,629,
  831,1091,1449,1477,1967,1677,697,1045,737,1117,1737,667,911,1325,473,437,
  1281,1795,1001,261,879,51,775,1195,801,1635,759,165,1871,1645,1049,245,
  703,1597,553,955,209,1779,1849,661,865,291,841,997,1265,1965,1625,53,
  1409,893,105,1925,1297,589,377,1579,929,1053,1655,1829,305,1811,1895,139,
  575,189,343,709,1711,1139,1095,277,993,1699,55,1435,655,1491,1319,331,
  1537,515,791,507,623,1229,1529,1963,1057,355,1545,603,1615,1171,743,523,
  447,1219,1239,1723,465,499,57,107,1121,989,951,229,1521,851,167,715,
  1665,1923,1687,1157,1553,1869,1415,1749,1185,1763,649,1061,561,531,409,907,
  319,1469,1961,59,1455,141,1209,491,1249,419,1847,1893,399,211,985,1099,
  1793,765,1513,1275,367,1587,263,1365,1313,925,247,1371,1359,109,1561,1291,
  191,61,1065,1605,721,781,1735,875,1377,1827,1353,539,1777,429,1959,1483,
  1921,643,617,389,1809,947,889,981,1441,483,1143,293,817,749,1383,1675,
  63,1347,169,827,1199,1421,583,1259,1505,861,457,1125,143,1069,807,1867,
  2047,2045,279,2043,111,307,2041,597,1569,1891,2039,1957,1103,1389,231,2037,
  65,1341,727,837,977,2035,569,1643,1633,547,439,1307,2033,1709,345,1845,
  1919,637,1175,379,2031,333,903,213,1697,797,1161,475,1073,2029,921,1653,
  193,67,1623,1595,943,1395,1721,2027,1761,1955,1335,357,113,1747,1497,1461,
  1791,771,2025,1285,145,973,249,171,1825,611,265,1189,847,1427,2023,1269,
  321,1475,1577,69,1233,755,1223,1685,1889,733,1865,2021,1807,1107,1447,1077,
  1663,1917,1129,1147,1775,1613,1401,555,1953,2019,631,1243,1329,787,871,885,
  449,1213,681,1733,687,115,71,1301,2017,675,969,411,369,467,295,693,
  1535,509,233,517,401,1843,1543,939,2015,669,1527,421,591,147,281,501,
  577,195,215,699,1489,525,1081,917,1951,2013,73,1253,1551,173,857,309,
  1407,899,663,1915,1519,1203,391,1323,1887,739,1673,2011,1585,493,1433,117,
  705,1603,1111,965,431,1165,1863,533,1823,605,823,1179,625,813,2009,75,
  1279,1789,1559,251,657,563,761,1707,1759,1949,777,347,335,1133,1511,267,
  833,1085,2007,1467,1745,1805,711,149,1695,803,1719,485,1295,1453,935,459,
  1151,381,1641,1413,1263,77,1913,2005,1631,541,119,1317,1841,1773,359,651,
  961,323,1193,197,175,1651,441,235,1567,1885,1481,1947,881,2003,217,843,
  1023,1027,745,1019,913,717,1031,1621,1503,867,1015,1115,79,1683,793,1035,
  1089,1731,297,1861,2001,1011,1593,619,1439,477,585,283,1039,1363,1369,1227,
  895,1661,151,645,1007,1357,121,1237,1375,1821,1911,549,1999,1043,1945,1419,
  1217,957,599,571,81,371,1351,1003,1311,931,311,1381,1137,723,1575,1611,
  767,253,1047,1787,1169,1997,1273,853,1247,413,1289,1883,177,403,999,1803,
  1345,451,1495,1093,1839,269,199,1387,1183,1757,1207,1051,783,83,423,1995,
  639,1155,1943,123,751,1459,1671,469,1119,995,393,219,1743,237,153,1909,
  1473,1859,1705,1339,337,909,953,1771,1055,349,1993,613,1393,557,729,1717,
  511,1533,1257,1541,1425,819,519,85,991,1693,503,1445,433,877,1305,1525,
  1601,829,809,325,1583,1549,1991,1941,927,1059,1097,1819,527,1197,1881,1333,
  383,125,361,891,495,179,633,299,863,285,1399,987,1487,1517,1639,1141,
  1729,579,87,1989,593,1907,839,1557,799,1629,201,155,1649,1837,1063,949,
  255,1283,535,773,1681,461,1785,683,735,1123,1801,677,689,1939,487,757,
  1857,1987,983,443,1327,1267,313,1173,671,221,695,1509,271,1619,89,565,
  127,1405,1431,1659,239,1101,1159,1067,607,1565,905,1755,1231,1299,665,373,
  1985,701,1879,1221,849,627,1465,789,543,1187,1591,923,1905,979,1241,181
};

تحديث : لقد جربت الحلول المختلفة المعروضة أدناه.

  • بعد اختبار شامل ، وجدت أن إضافة 0.5 إلى نتيجة Math.sqrt () غير ضروري ، على الأقل ليس على الجهاز الخاص بي.
  • كان الاختراق John Carmack أسرع ، ولكنه أعطى نتائج غير صحيحة بدءًا من n = 410881. ومع ذلك ، كما اقترح BobbyShaftoe ، يمكننا استخدام الاختراق Carmack لـ n <410881.
  • كانت طريقة نيوتن أبطأ قليلاً من Math.sqrt() . ربما هذا بسبب استخدام Math.sqrt() لشيء مشابه Math.sqrt() Newton ، ولكن تم تنفيذه في الجهاز بحيث يكون أسرع بكثير من Java. أيضا ، لا تزال طريقة نيوتن تتطلب استخدام الزوجي.
  • تطلبت طريقة نيوتن المعدلة ، التي استخدمت بعض الحيل بحيث لا تتضمّن سوى عدد صحيح من الرياضيات فقط ، بعض الاختراقات لتفادي التجاوز (أريد أن تعمل هذه الوظيفة مع كل الأعداد الصحيحة الموقعة ذات 64 بت) ، وكانت لا تزال أبطأ من Math.sqrt() .
  • كان ختم ثنائي حتى أبطأ. وهذا منطقي لأن الختم الثنائي سيتطلب في المتوسط ​​16 تمريرًا للعثور على الجذر التربيعي لرقم 64 بت.

الاقتراح الوحيد الذي أظهر تحسينات تم إجراؤه بواسطة John D. Cook . يمكنك ملاحظة أن آخر رقم ست عشري (أي آخر 4 بتات) في مربع مثالي يجب أن يكون 0 أو 1 أو 4 أو 9. وهذا يعني أن 75٪ من الأرقام يمكن التخلص منها فورًا كمربعات ممكنة. أدى تنفيذ هذا الحل إلى تقليل بنسبة 50٪ تقريبًا في وقت التشغيل.

العمل من اقتراح جون ، أنا التحقيق في خصائص البتات n الأخيرة من مربع الكمال. من خلال تحليل آخر 6 بتات ، اكتشفت أن 12 فقط من أصل 64 قيمة ممكنة للبتّات الست الأخيرة. وهذا يعني أنه يمكن التخلص من 81٪ من القيم دون استخدام أي رياضيات. أعطى تنفيذ هذا الحل تخفيضًا إضافيًا بنسبة 8٪ في وقت التشغيل (مقارنة بخوارزمي الأصلي). يؤدي تحليل أكثر من 6 بتات إلى قائمة من البتات النهائية المحتملة التي تكون كبيرة جدًا إلى أن تكون عملية.

إليك الشفرة التي استخدمتها ، والتي تعمل في 42٪ من الوقت الذي تتطلبه الخوارزمية الأصلية (استنادًا إلى تجاوز أول 100 مليون صحيح). بالنسبة إلى قيم n أقل من 410881 ، يتم تشغيلها في 29٪ فقط من الوقت الذي تتطلبه الخوارزمية الأصلية.

private final static boolean isPerfectSquare(long n)
{
  if (n < 0)
    return false;

  switch((int)(n & 0x3F))
  {
  case 0x00: case 0x01: case 0x04: case 0x09: case 0x10: case 0x11:
  case 0x19: case 0x21: case 0x24: case 0x29: case 0x31: case 0x39:
    long sqrt;
    if(n < 410881L)
    {
      //John Carmack hack, converted to Java.
      // See: http://www.codemaestro.com/reviews/9
      int i;
      float x2, y;

      x2 = n * 0.5F;
      y  = n;
      i  = Float.floatToRawIntBits(y);
      i  = 0x5f3759df - ( i >> 1 );
      y  = Float.intBitsToFloat(i);
      y  = y * ( 1.5F - ( x2 * y * y ) );

      sqrt = (long)(1.0F/y);
    }
    else
    {
      //Carmack hack gives incorrect answer for n >= 410881.
      sqrt = (long)Math.sqrt(n);
    }
    return sqrt*sqrt == n;

  default:
    return false;
  }
}

ملاحظات :

  • وفقًا لاختبارات جون ، فإن الاستخدام or العبارات أسرع في C ++ من استخدام switch ، ولكن في Java و C # لا يبدو أن هناك فرق بين or switch .
  • لقد حاولت أيضًا إنشاء جدول بحث (بصفته مصفوفة ثابتة خاصة تتكون من 64 قيمة منطقية). ثم بدلا من إما تبديل أو بيان ، أود فقط أن أقول if(lookup[(int)(n&0x3F)]) { test } else return false; . لدهشتي ، كان هذا (فقط قليلاً) أبطأ. لست متأكدا لماذا. هذا لأن حدود الصفيف محددة في Java .

أنا أحسب طريقة تعمل ~ 35 ٪ أسرع من كود الخاص بك 6bits + Carmack + ، على الأقل مع وحدة المعالجة المركزية (x86) ولغة البرمجة (C / C ++). قد تختلف نتائجك ، خاصةً لأنني لا أعرف كيف سيعالج عامل Java.

مقاربتي ثلاثية:

  1. أولاً ، تصفية الإجابات الواضحة. يتضمن هذا الأرقام السالبة والنظر إلى آخر 4 بتات. (وجدت أن النظر إلى الستة الأخيرة لم يساعد.) أجب أيضًا بنعم على 0. (في قراءة الكود أدناه ، لاحظ أن int64 x .)
    if( x < 0 || (x&2) || ((x & 7) == 5) || ((x & 11) == 8) )
        return false;
    if( x == 0 )
        return true;
  2. بعد ذلك ، تحقق مما إذا كان هو modulo مربع 255 = 3 * 5 * 17. نظرًا لأن هذا المنتج يتكون من ثلاثة أعداد مميزة ، فإن حوالي 1/8 فقط من وحدات البقايا mod 255 هي مربعات. ومع ذلك ، من خلال خبرتي ، فإن استدعاء مشغل modulo (٪) يكلف أكثر من الفائدة التي يحصل عليها المرء ، لذلك أستخدم حيل صغيرة تتضمن 255 = 2 ^ 8 لحساب المتبقي. (للأفضل أو للأسوأ ، أنا لا أستخدم خدعة قراءة وحدات البايت الفردية من كلمة واحدة ، فقط من البتات والتغييرات.)
    int64 y = x;
    y = (y & 4294967295LL) + (y >> 32); 
    y = (y & 65535) + (y >> 16);
    y = (y & 255) + ((y >> 8) & 255) + (y >> 16);
    // At this point, y is between 0 and 511.  More code can reduce it farther.
    
    للتحقق بالفعل مما إذا كان المتبقي مربعًا ، أقوم بالبحث عن الإجابة في جدول precomputed.
    if( bad255[y] )
        return false;
    // However, I just use a table of size 512
    
  3. أخيرًا ، حاول حساب الجذر التربيعي باستخدام طريقة مشابهة لنجمة "هنسيل" . (لا أعتقد أنها تنطبق بشكل مباشر ، ولكنها تعمل مع بعض التعديلات.) قبل القيام بذلك ، أقوم بتقسيم كل القوى لـ 2 مع بحث ثنائي:
    if((x & 4294967295LL) == 0)
        x >>= 32;
    if((x & 65535) == 0)
        x >>= 16;
    if((x & 255) == 0)
        x >>= 8;
    if((x & 15) == 0)
        x >>= 4;
    if((x & 3) == 0)
        x >>= 2;
    في هذه المرحلة ، لكي يكون رقمنا مربعًا ، يجب أن يكون 1 mod 8.
    if((x & 7) != 1)
        return false;
    الهيكل الأساسي لصة هنسيل هو ما يلي. (ملاحظة: رمز لم يتم اختباره ؛ إذا لم يعمل ، حاول t = 2 أو 8.)
    int64 t = 4, r = 1;
    t <<= 1; r += ((x - r * r) & t) >> 1;
    t <<= 1; r += ((x - r * r) & t) >> 1;
    t <<= 1; r += ((x - r * r) & t) >> 1;
    // Repeat until t is 2^33 or so.  Use a loop if you want.
    الفكرة هي أنه عند كل تكرار ، يمكنك إضافة بت واحد إلى r ، الجذر التربيعي "الحالي" لـ x؛ كل الجذر التربيعي هو modulo دقيقة أكبر وأكبر من قوة 2 ، وهي T / 2. في النهاية ، سيكون r و t / 2-r جذور مربعة من x modulo t / 2. (لاحظ أنه إذا كان r هو الجذر التربيعي لـ x ، فهذا يعني أن -r. هذا صحيح حتى الأرقام modulo ، ولكن حذار ، modulo بعض الأرقام ، يمكن أن يكون للأشياء أكثر من 2 جذور مربعة ؛ ولا سيما ، وهذا يشمل قوى 2. ) لأن الجذر التربيعي الفعلي لدينا هو أقل من 2 ^ 32 ، في هذه المرحلة يمكننا فقط التحقق من أن r أو t / 2-r جذور مربعة حقيقية. في شفرتي الفعلية ، أستخدم الحلقة المعدّلة التالية:
    int64 r, t, z;
    r = start[(x >> 3) & 1023];
    do {
        z = x - r * r;
        if( z == 0 )
            return true;
        if( z < 0 )
            return false;
        t = z & (-z);
        r += (z & t) >> 1;
        if( r > (t >> 1) )
            r = t - r;
    } while( t <= (1LL << 33) );
    يتم الحصول على التسريع هنا بثلاث طرق: قيمة بدء مبدئية (مكافئة لتكرار 10 تكرار للحلقة) ، مخرج سابق للحلقة ، وتخطي بعض قيم t. بالنسبة للجزء الأخير ، ألق نظرة على z = r - x * x ، وضبط t ليكون أكبر قوة لـ 2 div z مع خدعة صغيرة. هذا يسمح لي بتخطي قيم t التي لن تؤثر على قيمة r على أية حال. قيمة البدء المبدئي في حالتي تلتقط صيغة "الجذر الأصغر" لمجموع الجذر 8192.

حتى إذا لم يعمل هذا الرمز بشكل أسرع بالنسبة لك ، آمل أن تستمتع ببعض الأفكار التي يحتوي عليها. يتبع رمز اختبار كاملة ، بما في ذلك الجداول precomputed.

typedef signed long long int int64;

int start[1024] =
{1,3,1769,5,1937,1741,7,1451,479,157,9,91,945,659,1817,11,
1983,707,1321,1211,1071,13,1479,405,415,1501,1609,741,15,339,1703,203,
129,1411,873,1669,17,1715,1145,1835,351,1251,887,1573,975,19,1127,395,
1855,1981,425,453,1105,653,327,21,287,93,713,1691,1935,301,551,587,
257,1277,23,763,1903,1075,1799,1877,223,1437,1783,859,1201,621,25,779,
1727,573,471,1979,815,1293,825,363,159,1315,183,27,241,941,601,971,
385,131,919,901,273,435,647,1493,95,29,1417,805,719,1261,1177,1163,
1599,835,1367,315,1361,1933,1977,747,31,1373,1079,1637,1679,1581,1753,1355,
513,1539,1815,1531,1647,205,505,1109,33,1379,521,1627,1457,1901,1767,1547,
1471,1853,1833,1349,559,1523,967,1131,97,35,1975,795,497,1875,1191,1739,
641,1149,1385,133,529,845,1657,725,161,1309,375,37,463,1555,615,1931,
1343,445,937,1083,1617,883,185,1515,225,1443,1225,869,1423,1235,39,1973,
769,259,489,1797,1391,1485,1287,341,289,99,1271,1701,1713,915,537,1781,
1215,963,41,581,303,243,1337,1899,353,1245,329,1563,753,595,1113,1589,
897,1667,407,635,785,1971,135,43,417,1507,1929,731,207,275,1689,1397,
1087,1725,855,1851,1873,397,1607,1813,481,163,567,101,1167,45,1831,1205,
1025,1021,1303,1029,1135,1331,1017,427,545,1181,1033,933,1969,365,1255,1013,
959,317,1751,187,47,1037,455,1429,609,1571,1463,1765,1009,685,679,821,
1153,387,1897,1403,1041,691,1927,811,673,227,137,1499,49,1005,103,629,
831,1091,1449,1477,1967,1677,697,1045,737,1117,1737,667,911,1325,473,437,
1281,1795,1001,261,879,51,775,1195,801,1635,759,165,1871,1645,1049,245,
703,1597,553,955,209,1779,1849,661,865,291,841,997,1265,1965,1625,53,
1409,893,105,1925,1297,589,377,1579,929,1053,1655,1829,305,1811,1895,139,
575,189,343,709,1711,1139,1095,277,993,1699,55,1435,655,1491,1319,331,
1537,515,791,507,623,1229,1529,1963,1057,355,1545,603,1615,1171,743,523,
447,1219,1239,1723,465,499,57,107,1121,989,951,229,1521,851,167,715,
1665,1923,1687,1157,1553,1869,1415,1749,1185,1763,649,1061,561,531,409,907,
319,1469,1961,59,1455,141,1209,491,1249,419,1847,1893,399,211,985,1099,
1793,765,1513,1275,367,1587,263,1365,1313,925,247,1371,1359,109,1561,1291,
191,61,1065,1605,721,781,1735,875,1377,1827,1353,539,1777,429,1959,1483,
1921,643,617,389,1809,947,889,981,1441,483,1143,293,817,749,1383,1675,
63,1347,169,827,1199,1421,583,1259,1505,861,457,1125,143,1069,807,1867,
2047,2045,279,2043,111,307,2041,597,1569,1891,2039,1957,1103,1389,231,2037,
65,1341,727,837,977,2035,569,1643,1633,547,439,1307,2033,1709,345,1845,
1919,637,1175,379,2031,333,903,213,1697,797,1161,475,1073,2029,921,1653,
193,67,1623,1595,943,1395,1721,2027,1761,1955,1335,357,113,1747,1497,1461,
1791,771,2025,1285,145,973,249,171,1825,611,265,1189,847,1427,2023,1269,
321,1475,1577,69,1233,755,1223,1685,1889,733,1865,2021,1807,1107,1447,1077,
1663,1917,1129,1147,1775,1613,1401,555,1953,2019,631,1243,1329,787,871,885,
449,1213,681,1733,687,115,71,1301,2017,675,969,411,369,467,295,693,
1535,509,233,517,401,1843,1543,939,2015,669,1527,421,591,147,281,501,
577,195,215,699,1489,525,1081,917,1951,2013,73,1253,1551,173,857,309,
1407,899,663,1915,1519,1203,391,1323,1887,739,1673,2011,1585,493,1433,117,
705,1603,1111,965,431,1165,1863,533,1823,605,823,1179,625,813,2009,75,
1279,1789,1559,251,657,563,761,1707,1759,1949,777,347,335,1133,1511,267,
833,1085,2007,1467,1745,1805,711,149,1695,803,1719,485,1295,1453,935,459,
1151,381,1641,1413,1263,77,1913,2005,1631,541,119,1317,1841,1773,359,651,
961,323,1193,197,175,1651,441,235,1567,1885,1481,1947,881,2003,217,843,
1023,1027,745,1019,913,717,1031,1621,1503,867,1015,1115,79,1683,793,1035,
1089,1731,297,1861,2001,1011,1593,619,1439,477,585,283,1039,1363,1369,1227,
895,1661,151,645,1007,1357,121,1237,1375,1821,1911,549,1999,1043,1945,1419,
1217,957,599,571,81,371,1351,1003,1311,931,311,1381,1137,723,1575,1611,
767,253,1047,1787,1169,1997,1273,853,1247,413,1289,1883,177,403,999,1803,
1345,451,1495,1093,1839,269,199,1387,1183,1757,1207,1051,783,83,423,1995,
639,1155,1943,123,751,1459,1671,469,1119,995,393,219,1743,237,153,1909,
1473,1859,1705,1339,337,909,953,1771,1055,349,1993,613,1393,557,729,1717,
511,1533,1257,1541,1425,819,519,85,991,1693,503,1445,433,877,1305,1525,
1601,829,809,325,1583,1549,1991,1941,927,1059,1097,1819,527,1197,1881,1333,
383,125,361,891,495,179,633,299,863,285,1399,987,1487,1517,1639,1141,
1729,579,87,1989,593,1907,839,1557,799,1629,201,155,1649,1837,1063,949,
255,1283,535,773,1681,461,1785,683,735,1123,1801,677,689,1939,487,757,
1857,1987,983,443,1327,1267,313,1173,671,221,695,1509,271,1619,89,565,
127,1405,1431,1659,239,1101,1159,1067,607,1565,905,1755,1231,1299,665,373,
1985,701,1879,1221,849,627,1465,789,543,1187,1591,923,1905,979,1241,181};

bool bad255[512] =
{0,0,1,1,0,1,1,1,1,0,1,1,1,1,1,0,0,1,1,0,1,0,1,1,1,0,1,1,1,1,0,1,
 1,1,0,1,0,1,1,1,1,1,1,1,1,1,1,1,1,0,1,0,1,1,1,0,1,1,1,1,0,1,1,1,
 0,1,0,1,1,0,0,1,1,1,1,1,0,1,1,1,1,0,1,1,0,0,1,1,1,1,1,1,1,1,0,1,
 1,1,1,1,0,1,1,1,1,1,0,1,1,1,1,0,1,1,1,0,1,1,1,1,0,0,1,1,1,1,1,1,
 1,1,1,1,1,1,1,0,0,1,1,1,1,1,1,1,0,0,1,1,1,1,1,0,1,1,0,1,1,1,1,1,
 1,1,1,1,1,1,0,1,1,0,1,0,1,1,0,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,1,1,
 1,1,1,0,0,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,1,1,
 1,0,1,1,1,0,1,1,1,1,0,1,1,1,1,1,0,1,1,1,1,1,0,1,1,1,1,1,1,1,1,
 0,0,1,1,0,1,1,1,1,0,1,1,1,1,1,0,0,1,1,0,1,0,1,1,1,0,1,1,1,1,0,1,
 1,1,0,1,0,1,1,1,1,1,1,1,1,1,1,1,1,0,1,0,1,1,1,0,1,1,1,1,0,1,1,1,
 0,1,0,1,1,0,0,1,1,1,1,1,0,1,1,1,1,0,1,1,0,0,1,1,1,1,1,1,1,1,0,1,
 1,1,1,1,0,1,1,1,1,1,0,1,1,1,1,0,1,1,1,0,1,1,1,1,0,0,1,1,1,1,1,1,
 1,1,1,1,1,1,1,0,0,1,1,1,1,1,1,1,0,0,1,1,1,1,1,0,1,1,0,1,1,1,1,1,
 1,1,1,1,1,1,0,1,1,0,1,0,1,1,0,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,1,1,
 1,1,1,0,0,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,1,1,
 1,0,1,1,1,0,1,1,1,1,0,1,1,1,1,1,0,1,1,1,1,1,0,1,1,1,1,1,1,1,1,
 0,0};

inline bool square( int64 x ) {
    // Quickfail
    if( x < 0 || (x&2) || ((x & 7) == 5) || ((x & 11) == 8) )
        return false;
    if( x == 0 )
        return true;

    // Check mod 255 = 3 * 5 * 17, for fun
    int64 y = x;
    y = (y & 4294967295LL) + (y >> 32);
    y = (y & 65535) + (y >> 16);
    y = (y & 255) + ((y >> 8) & 255) + (y >> 16);
    if( bad255[y] )
        return false;

    // Divide out powers of 4 using binary search
    if((x & 4294967295LL) == 0)
        x >>= 32;
    if((x & 65535) == 0)
        x >>= 16;
    if((x & 255) == 0)
        x >>= 8;
    if((x & 15) == 0)
        x >>= 4;
    if((x & 3) == 0)
        x >>= 2;

    if((x & 7) != 1)
        return false;

    // Compute sqrt using something like Hensel's lemma
    int64 r, t, z;
    r = start[(x >> 3) & 1023];
    do {
        z = x - r * r;
        if( z == 0 )
            return true;
        if( z < 0 )
            return false;
        t = z & (-z);
        r += (z & t) >> 1;
        if( r > (t  >> 1) )
            r = t - r;
    } while( t <= (1LL << 33) );

    return false;
}

An integer problem deserves an integer solution. Thus

Do binary search on the (non-negative) integers to find the greatest integer t such that t**2 <= n . Then test whether r**2 = n exactly. This takes time O(log n).

If you don't know how to binary search the positive integers because the set is unbounded, it's easy. You starting by computing your increasing function f (above f(t) = t**2 - n ) on powers of two. When you see it turn positive, you've found an upper bound. Then you can do standard binary search.


Considering for general bit length (though I have used specific type here), I tried to design simplistic algo as below. Simple and obvious check for 0,1,2 or <0 is required initially. Following is simple in sense that it doesn't try to use any existing maths functions. Most of the operator can be replaced with bit-wise operators. I haven't tested with any bench mark data though. I'm neither expert at maths or computer algorithm design in particular, I would love to see you pointing out problem. I know there is lots of improvement chances there.

int main()
{
    unsigned int c1=0 ,c2 = 0;  
    unsigned int x = 0;  
    unsigned int p = 0;  
    int k1 = 0;  
    scanf("%d",&p);  
    if(p % 2 == 0) {  
        x = p/2; 
    }  
    else {  
        x = (p/2) +1;  
    }  
    while(x) 
    {
        if((x*x) > p) {  
            c1 = x;  
            x = x/2; 
        }else {  
            c2 = x;  
            break;  
        }  
    }  
    if((p%2) != 0)  
        c2++;

    while(c2 < c1) 
    {  
        if((c2 * c2 ) == p) {  
            k1 = 1;  
            break;  
        }  
        c2++; 
    }  
    if(k1)  
        printf("\n Perfect square for %d", c2);  
    else  
        printf("\n Not perfect but nearest to :%d :", c2);  
    return 0;  
}  

For performance, you very often have to do some compromsies. Others have expressed various methods, however, you noted Carmack's hack was faster up to certain values of N. Then, you should check the "n" and if it is less than that number N, use Carmack's hack, else use some other method described in the answers here.


Here is the simplest and most concise way, although I do not know how it compares in terms of CPU cycles. This works great if you only wish to know if the root is a whole number. If you really care if it is an integer, you can also figure that out. Here is a simple (and pure) function:

public static boolean isRootWhole(double number) {
    return Math.sqrt(number) % 1 == 0;
}

If you do not need micro-optimization, this answer is better in terms of simplicity and maintainability. If you will be getting negative numbers, perhaps you will want to use Math.abs() on the number argument as the Math.sqrt() argument.

On my 3.6Ghz Intel i7-4790 CPU, a run of this algorithm on 0 - 10,000,000 took an average of 35 - 37 nanoseconds per calculation. I did 10 sequential runs, printing the average time spent on each of the ten million sqrt calculations. Each total run took just a little over 600 ms to complete.

If you are performing a lesser number of calculations, the earlier calculations take a bit longer.


I like the idea to use an almost correct method on some of the input. Here is a version with a higher "offset". The code seems to work and passes my simple test case.

Just replace your:

if(n < 410881L){...}

code with this one:

if (n < 11043908100L) {
    //John Carmack hack, converted to Java.
    // See: http://www.codemaestro.com/reviews/9
    int i;
    float x2, y;

    x2 = n * 0.5F;
    y = n;
    i = Float.floatToRawIntBits(y);
    //using the magic number from 
    //http://www.lomont.org/Math/Papers/2003/InvSqrt.pdf
    //since it more accurate
    i = 0x5f375a86 - (i >> 1);
    y = Float.intBitsToFloat(i);
    y = y * (1.5F - (x2 * y * y));
    y = y * (1.5F - (x2 * y * y)); //Newton iteration, more accurate

    sqrt = Math.round(1.0F / y);
} else {
    //Carmack hack gives incorrect answer for n >= 11043908100.
    sqrt = (long) Math.sqrt(n);
}

I ran my own analysis of several of the algorithms in this thread and came up with some new results. You can see those old results in the edit history of this answer, but they're not accurate, as I made a mistake, and wasted time analyzing several algorithms which aren't close. However, pulling lessons from several different answers, I now have two algorithms that crush the "winner" of this thread. Here's the core thing I do differently than everyone else:

// This is faster because a number is divisible by 2^4 or more only 6% of the time
// and more than that a vanishingly small percentage.
while((x & 0x3) == 0) x >>= 2;
// This is effectively the same as the switch-case statement used in the original
// answer. 
if((x & 0x7) != 1) return false;

However, this simple line, which most of the time adds one or two very fast instructions, greatly simplifies the switch-case statement into one if statement. However, it can add to the runtime if many of the tested numbers have significant power-of-two factors.

The algorithms below are as follows:

  • Internet - Kip's posted answer
  • Durron - My modified answer using the one-pass answer as a base
  • DurronTwo - My modified answer using the two-pass answer (by @JohnnyHeggheim), with some other slight modifications.

Here is a sample runtime if the numbers are generated using Math.abs(java.util.Random.nextLong())

 0% Scenario{vm=java, trial=0, benchmark=Internet} 39673.40 ns; ?=378.78 ns @ 3 trials
33% Scenario{vm=java, trial=0, benchmark=Durron} 37785.75 ns; ?=478.86 ns @ 10 trials
67% Scenario{vm=java, trial=0, benchmark=DurronTwo} 35978.10 ns; ?=734.10 ns @ 10 trials

benchmark   us linear runtime
 Internet 39.7 ==============================
   Durron 37.8 ============================
DurronTwo 36.0 ===========================

vm: java
trial: 0

And here is a sample runtime if it's run on the first million longs only:

 0% Scenario{vm=java, trial=0, benchmark=Internet} 2933380.84 ns; ?=56939.84 ns @ 10 trials
33% Scenario{vm=java, trial=0, benchmark=Durron} 2243266.81 ns; ?=50537.62 ns @ 10 trials
67% Scenario{vm=java, trial=0, benchmark=DurronTwo} 3159227.68 ns; ?=10766.22 ns @ 3 trials

benchmark   ms linear runtime
 Internet 2.93 ===========================
   Durron 2.24 =====================
DurronTwo 3.16 ==============================

vm: java
trial: 0

As you can see, DurronTwo does better for large inputs, because it gets to use the magic trick very very often, but gets clobbered compared to the first algorithm and Math.sqrt because the numbers are so much smaller. Meanwhile, the simpler Durron is a huge winner because it never has to divide by 4 many many times in the first million numbers.

Here's Durron :

public final static boolean isPerfectSquareDurron(long n) {
    if(n < 0) return false;
    if(n == 0) return true;

    long x = n;
    // This is faster because a number is divisible by 16 only 6% of the time
    // and more than that a vanishingly small percentage.
    while((x & 0x3) == 0) x >>= 2;
    // This is effectively the same as the switch-case statement used in the original
    // answer. 
    if((x & 0x7) == 1) {

        long sqrt;
        if(x < 410881L)
        {
            int i;
            float x2, y;

            x2 = x * 0.5F;
            y  = x;
            i  = Float.floatToRawIntBits(y);
            i  = 0x5f3759df - ( i >> 1 );
            y  = Float.intBitsToFloat(i);
            y  = y * ( 1.5F - ( x2 * y * y ) );

            sqrt = (long)(1.0F/y);
        } else {
            sqrt = (long) Math.sqrt(x);
        }
        return sqrt*sqrt == x;
    }
    return false;
}

And DurronTwo

public final static boolean isPerfectSquareDurronTwo(long n) {
    if(n < 0) return false;
    // Needed to prevent infinite loop
    if(n == 0) return true;

    long x = n;
    while((x & 0x3) == 0) x >>= 2;
    if((x & 0x7) == 1) {
        long sqrt;
        if (x < 41529141369L) {
            int i;
            float x2, y;

            x2 = x * 0.5F;
            y = x;
            i = Float.floatToRawIntBits(y);
            //using the magic number from 
            //http://www.lomont.org/Math/Papers/2003/InvSqrt.pdf
            //since it more accurate
            i = 0x5f375a86 - (i >> 1);
            y = Float.intBitsToFloat(i);
            y = y * (1.5F - (x2 * y * y));
            y = y * (1.5F - (x2 * y * y)); //Newton iteration, more accurate
            sqrt = (long) ((1.0F/y) + 0.2);
        } else {
            //Carmack hack gives incorrect answer for n >= 41529141369.
            sqrt = (long) Math.sqrt(x);
        }
        return sqrt*sqrt == x;
    }
    return false;
}

And my benchmark harness: (Requires Google caliper 0.1-rc5)

public class SquareRootBenchmark {
    public static class Benchmark1 extends SimpleBenchmark {
        private static final int ARRAY_SIZE = 10000;
        long[] trials = new long[ARRAY_SIZE];

        @Override
        protected void setUp() throws Exception {
            Random r = new Random();
            for (int i = 0; i < ARRAY_SIZE; i++) {
                trials[i] = Math.abs(r.nextLong());
            }
        }


        public int timeInternet(int reps) {
            int trues = 0;
            for(int i = 0; i < reps; i++) {
                for(int j = 0; j < ARRAY_SIZE; j++) {
                    if(SquareRootAlgs.isPerfectSquareInternet(trials[j])) trues++;
                }
            }

            return trues;   
        }

        public int timeDurron(int reps) {
            int trues = 0;
            for(int i = 0; i < reps; i++) {
                for(int j = 0; j < ARRAY_SIZE; j++) {
                    if(SquareRootAlgs.isPerfectSquareDurron(trials[j])) trues++;
                }
            }

            return trues;   
        }

        public int timeDurronTwo(int reps) {
            int trues = 0;
            for(int i = 0; i < reps; i++) {
                for(int j = 0; j < ARRAY_SIZE; j++) {
                    if(SquareRootAlgs.isPerfectSquareDurronTwo(trials[j])) trues++;
                }
            }

            return trues;   
        }
    }

    public static void main(String... args) {
        Runner.main(Benchmark1.class, args);
    }
}

UPDATE: I've made a new algorithm that is faster in some scenarios, slower in others, I've gotten different benchmarks based on different inputs. If we calculate modulo 0xFFFFFF = 3 x 3 x 5 x 7 x 13 x 17 x 241 , we can eliminate 97.82% of numbers that cannot be squares. This can be (sort of) done in one line, with 5 bitwise operations:

if (!goodLookupSquares[(int) ((n & 0xFFFFFFl) + ((n >> 24) & 0xFFFFFFl) + (n >> 48))]) return false;

The resulting index is either 1) the residue, 2) the residue + 0xFFFFFF , or 3) the residue + 0x1FFFFFE . Of course, we need to have a lookup table for residues modulo 0xFFFFFF , which is about a 3mb file (in this case stored as ascii text decimal numbers, not optimal but clearly improvable with a ByteBuffer and so forth. But since that is precalculation it doesn't matter so much. You can find the file here (or generate it yourself):

public final static boolean isPerfectSquareDurronThree(long n) {
    if(n < 0) return false;
    if(n == 0) return true;

    long x = n;
    while((x & 0x3) == 0) x >>= 2;
    if((x & 0x7) == 1) {
        if (!goodLookupSquares[(int) ((n & 0xFFFFFFl) + ((n >> 24) & 0xFFFFFFl) + (n >> 48))]) return false;
        long sqrt;
        if(x < 410881L)
        {
            int i;
            float x2, y;

            x2 = x * 0.5F;
            y  = x;
            i  = Float.floatToRawIntBits(y);
            i  = 0x5f3759df - ( i >> 1 );
            y  = Float.intBitsToFloat(i);
            y  = y * ( 1.5F - ( x2 * y * y ) );

            sqrt = (long)(1.0F/y);
        } else {
            sqrt = (long) Math.sqrt(x);
        }
        return sqrt*sqrt == x;
    }
    return false;
}

I load it into a boolean array like this:

private static boolean[] goodLookupSquares = null;

public static void initGoodLookupSquares() throws Exception {
    Scanner s = new Scanner(new File("24residues_squares.txt"));

    goodLookupSquares = new boolean[0x1FFFFFE];

    while(s.hasNextLine()) {
        int residue = Integer.valueOf(s.nextLine());
        goodLookupSquares[residue] = true;
        goodLookupSquares[residue + 0xFFFFFF] = true;
        goodLookupSquares[residue + 0x1FFFFFE] = true;
    }

    s.close();
}

Example runtime. It beat Durron (version one) in every trial I ran.

 0% Scenario{vm=java, trial=0, benchmark=Internet} 40665.77 ns; ?=566.71 ns @ 10 trials
33% Scenario{vm=java, trial=0, benchmark=Durron} 38397.60 ns; ?=784.30 ns @ 10 trials
67% Scenario{vm=java, trial=0, benchmark=DurronThree} 36171.46 ns; ?=693.02 ns @ 10 trials

  benchmark   us linear runtime
   Internet 40.7 ==============================
     Durron 38.4 ============================
DurronThree 36.2 ==========================

vm: java
trial: 0

I'm not sure if it would be faster, or even accurate, but you could use http://www.codemaestro.com/reviews/9 , algorithm to solve the square root faster. You could probably easily test this for all possible 32 bit integers, and validate that you actually got correct results, as it's only an appoximation. However, now that I think about it, using doubles is approximating also, so I'm not sure how that would come into play.


If speed is a concern, why not partition off the most commonly used set of inputs and their values to a lookup table and then do whatever optimized magic algorithm you have come up with for the exceptional cases?


If you want speed, given that your integers are of finite size, I suspect that the quickest way would involve (a) partitioning the parameters by size (eg into categories by largest bit set), then checking the value against an array of perfect squares within that range.


It ought to be possible to pack the 'cannot be a perfect square if the last X digits are N' much more efficiently than that! I'll use java 32 bit ints, and produce enough data to check the last 16 bits of the number - that's 2048 hexadecimal int values.

...

حسنا. Either I have run into some number theory that is a little beyond me, or there is a bug in my code. In any case, here is the code:

public static void main(String[] args) {
    final int BITS = 16;

    BitSet foo = new BitSet();

    for(int i = 0; i< (1<<BITS); i++) {
        int sq = (i*i);
        sq = sq & ((1<<BITS)-1);
        foo.set(sq);
    }

    System.out.println("int[] mayBeASquare = {");

    for(int i = 0; i< 1<<(BITS-5); i++) {
        int kk = 0;
        for(int j = 0; j<32; j++) {
            if(foo.get((i << 5) | j)) {
                kk |= 1<<j;
            }
        }
        System.out.print("0x" + Integer.toHexString(kk) + ", ");
        if(i%8 == 7) System.out.println();
    }
    System.out.println("};");
}

وهنا النتائج:

(ed: elided for poor performance in prettify.js; view revision history to see.)


It's been pointed out that the last d digits of a perfect square can only take on certain values. The last d digits (in base b ) of a number n is the same as the remainder when n is divided by b d , ie. in C notation n % pow(b, d) .

This can be generalized to any modulus m , ie. n % m can be used to rule out some percentage of numbers from being perfect squares. The modulus you are currently using is 64, which allows 12, ie. 19% of remainders, as possible squares. With a little coding I found the modulus 110880, which allows only 2016, ie. 1.8% of remainders as possible squares. So depending on the cost of a modulus operation (ie. division) and a table lookup versus a square root on your machine, using this modulus might be faster.

By the way if Java has a way to store a packed array of bits for the lookup table, don't use it. 110880 32-bit words is not much RAM these days and fetching a machine word is going to be faster than fetching a single bit.


Just for the record, another approach is to use the prime decomposition. If every factor of the decomposition is even, then the number is a perfect square. So what you want is to see if a number can be decomposed as a product of squares of prime numbers. Of course, you don't need to obtain such a decomposition, just to see if it exists.

First build a table of squares of prime numbers which are lower than 2^32. This is far smaller than a table of all integers up to this limit.

A solution would then be like this:

boolean isPerfectSquare(long number)
{
    if (number < 0) return false;
    if (number < 2) return true;

    for (int i = 0; ; i++)
    {
        long square = squareTable[i];
        if (square > number) return false;
        while (number % square == 0)
        {
            number /= square;
        }
        if (number == 1) return true;
    }
}

I guess it's a bit cryptic. What it does is checking in every step that the square of a prime number divide the input number. If it does then it divides the number by the square as long as it is possible, to remove this square from the prime decomposition. If by this process, we came to 1, then the input number was a decomposition of square of prime numbers. If the square becomes larger than the number itself, then there is no way this square, or any larger squares, can divide it, so the number can not be a decomposition of squares of prime numbers.

Given nowadays' sqrt done in hardware and the need to compute prime numbers here, I guess this solution is way slower. But it should give better results than solution with sqrt which won't work over 2^54, as says mrzl in his answer.


Regarding the Carmac method, it seems like it would be quite easy just to iterate once more, which should double the number of digits of accuracy. It is, after all, an extremely truncated iterative method -- Newton's, with a very good first guess.

Regarding your current best, I see two micro-optimizations:

  • move the check vs. 0 after the check using mod255
  • rearrange the dividing out powers of four to skip all the checks for the usual (75%) case.

Ie:

// Divide out powers of 4 using binary search

if((n & 0x3L) == 0) {
  n >>=2;

  if((n & 0xffffffffL) == 0)
    n >>= 32;
  if((n & 0xffffL) == 0)
      n >>= 16;
  if((n & 0xffL) == 0)
      n >>= 8;
  if((n & 0xfL) == 0)
      n >>= 4;
  if((n & 0x3L) == 0)
      n >>= 2;
}

Even better might be a simple

while ((n & 0x03L) == 0) n >>= 2;

Obviously, it would be interesting to know how many numbers get culled at each checkpoint -- I rather doubt the checks are truly independent, which makes things tricky.


The following simplification of maaartinus's solution appears to shave a few percentage points off the runtime, but I'm not good enough at benchmarking to produce a benchmark I can trust:

long goodMask; // 0xC840C04048404040 computed below
{
    for (int i=0; i<64; ++i) goodMask |= Long.MIN_VALUE >>> (i*i);
}

public boolean isSquare(long x) {
    // This tests if the 6 least significant bits are right.
    // Moving the to be tested bit to the highest position saves us masking.
    if (goodMask << x >= 0) return false;
    // Remove an even number of trailing zeros, leaving at most one.
    x >>= (Long.numberOfTrailingZeros(x) & (-2);
    // Repeat the test on the 6 least significant remaining bits.
    if (goodMask << x >= 0 | x <= 0) return x == 0;
    // Do it in the classical way.
    // The correctness is not trivial as the conversion from long to double is lossy!
    final long tst = (long) Math.sqrt(x);
    return tst * tst == x;
}

It would be worth checking how omitting the first test,

if (goodMask << x >= 0) return false;

would affect performance.


This a rework from decimal to binary of the old Marchant calculator algorithm (sorry, I don't have a reference), in Ruby, adapted specifically for this question:

def isexactsqrt(v)
    value = v.abs
    residue = value
    root = 0
    onebit = 1
    onebit <<= 8 while (onebit < residue)
    onebit >>= 2 while (onebit > residue)
    while (onebit > 0)
        x = root + onebit
        if (residue >= x) then
            residue -= x
            root = x + onebit
        end
        root >>= 1
        onebit >>= 2
    end
    return (residue == 0)
end

Here's a workup of something similar (please don't vote me down for coding style/smells or clunky O/O - it's the algorithm that counts, and C++ is not my home language). In this case, we're looking for residue == 0:

#include <iostream>  

using namespace std;  
typedef unsigned long long int llint;

class ISqrt {           // Integer Square Root
    llint value;        // Integer whose square root is required
    llint root;         // Result: floor(sqrt(value))
    llint residue;      // Result: value-root*root
    llint onebit, x;    // Working bit, working value

public:

    ISqrt(llint v = 2) {    // Constructor
        Root(v);            // Take the root 
    };

    llint Root(llint r) {   // Resets and calculates new square root
        value = r;          // Store input
        residue = value;    // Initialise for subtracting down
        root = 0;           // Clear root accumulator

        onebit = 1;                 // Calculate start value of counter
        onebit <<= (8*sizeof(llint)-2);         // Set up counter bit as greatest odd power of 2 
        while (onebit > residue) {onebit >>= 2; };  // Shift down until just < value

        while (onebit > 0) {
            x = root ^ onebit;          // Will check root+1bit (root bit corresponding to onebit is always zero)
            if (residue >= x) {         // Room to subtract?
                residue -= x;           // Yes - deduct from residue
                root = x + onebit;      // and step root
            };
            root >>= 1;
            onebit >>= 2;
        };
        return root;                    
    };
    llint Residue() {           // Returns residue from last calculation
        return residue;                 
    };
};

int main() {
    llint big, i, q, r, v, delta;
    big = 0; big = (big-1);         // Kludge for "big number"
    ISqrt b;                            // Make q sqrt generator
    for ( i = big; i > 0 ; i /= 7 ) {   // for several numbers
        q = b.Root(i);                  // Get the square root
        r = b.Residue();                // Get the residue
        v = q*q+r;                      // Recalc original value
        delta = v-i;                    // And diff, hopefully 0
        cout << i << ": " << q << " ++ " << r << " V: " << v << " Delta: " << delta << "\n";
    };
    return 0;
};

This is the fastest Java implementation I could come up with, using a combination of techniques suggested by others in this thread.

  • Mod-256 test
  • Inexact mod-3465 test (avoids integer division at the cost of some false positives)
  • Floating-point square root, round and compare with input value

I also experimented with these modifications but they did not help performance:

  • Additional mod-255 test
  • Dividing the input value by powers of 4
  • Fast Inverse Square Root (to work for high values of N it needs 3 iterations, enough to make it slower than the hardware square root function.)

public class SquareTester {

    public static boolean isPerfectSquare(long n) {
        if (n < 0) {
            return false;
        } else {
            switch ((byte) n) {
            case -128: case -127: case -124: case -119: case -112:
            case -111: case -103: case  -95: case  -92: case  -87:
            case  -79: case  -71: case  -64: case  -63: case  -60:
            case  -55: case  -47: case  -39: case  -31: case  -28:
            case  -23: case  -15: case   -7: case    0: case    1:
            case    4: case    9: case   16: case   17: case   25:
            case   33: case   36: case   41: case   49: case   57:
            case   64: case   65: case   68: case   73: case   81:
            case   89: case   97: case  100: case  105: case  113:
            case  121:
                long i = (n * INV3465) >>> 52;
                if (! good3465[(int) i]) {
                    return false;
                } else {
                    long r = round(Math.sqrt(n));
                    return r*r == n; 
                }
            default:
                return false;
            }
        }
    }

    private static int round(double x) {
        return (int) Double.doubleToRawLongBits(x + (double) (1L << 52));
    }

    /** 3465<sup>-1</sup> modulo 2<sup>64</sup> */
    private static final long INV3465 = 0x8ffed161732e78b9L;

    private static final boolean[] good3465 =
        new boolean[0x1000];

    static {
        for (int r = 0; r < 3465; ++ r) {
            int i = (int) ((r * r * INV3465) >>> 52);
            good3465[i] = good3465[i+1] = true;
        }
    }

}

You'll have to do some benchmarking. The best algorithm will depend on the distribution of your inputs.

Your algorithm may be nearly optimal, but you might want to do a quick check to rule out some possibilities before calling your square root routine. For example, look at the last digit of your number in hex by doing a bit-wise "and." Perfect squares can only end in 0, 1, 4, or 9 in base 16, So for 75% of your inputs (assuming they are uniformly distributed) you can avoid a call to the square root in exchange for some very fast bit twiddling.

Kip benchmarked the following code implementing the hex trick. When testing numbers 1 through 100,000,000, this code ran twice as fast as the original.

public final static boolean isPerfectSquare(long n)
{
    if (n < 0)
        return false;

    switch((int)(n & 0xF))
    {
    case 0: case 1: case 4: case 9:
        long tst = (long)Math.sqrt(n);
        return tst*tst == n;

    default:
        return false;
    }
}

When I tested the analogous code in C++, it actually ran slower than the original. However, when I eliminated the switch statement, the hex trick once again make the code twice as fast.

int isPerfectSquare(int n)
{
    int h = n & 0xF;  // h is the last hex "digit"
    if (h > 9)
        return 0;
    // Use lazy evaluation to jump out of the if statement as soon as possible
    if (h != 2 && h != 3 && h != 5 && h != 6 && h != 7 && h != 8)
    {
        int t = (int) floor( sqrt((double) n) + 0.5 );
        return t*t == n;
    }
    return 0;
}

Eliminating the switch statement had little effect on the C# code.


أنا متأخرة جدا للحزب ، لكني آمل أن أقدم إجابة أفضل. أقصر و (على افتراض أن benchmark صحيح) أيضا faster بكثير.

long goodMask; // 0xC840C04048404040 computed below
{
    for (int i=0; i<64; ++i) goodMask |= Long.MIN_VALUE >>> (i*i);
}

public boolean isSquare(long x) {
    // This tests if the 6 least significant bits are right.
    // Moving the to be tested bit to the highest position saves us masking.
    if (goodMask << x >= 0) return false;
    final int numberOfTrailingZeros = Long.numberOfTrailingZeros(x);
    // Each square ends with an even number of zeros.
    if ((numberOfTrailingZeros & 1) != 0) return false;
    x >>= numberOfTrailingZeros;
    // Now x is either 0 or odd.
    // In binary each odd square ends with 001.
    // Postpone the sign test until now; handle zero in the branch.
    if ((x&7) != 1 | x <= 0) return x == 0;
    // Do it in the classical way.
    // The correctness is not trivial as the conversion from long to double is lossy!
    final long tst = (long) Math.sqrt(x);
    return tst * tst == x;
}

الاختبار الأول يمسك معظم غير الساحات بسرعة. ويستخدم جدول مكون من 64 عنصرًا معبأًا لفترة طويلة ، لذلك لا توجد تكلفة الوصول إلى الصفيف (الشيكات غير المباشرة والحدود). لفترة long بشكل عشوائي ، هناك احتمال 81.25 ٪ من إنهاء هنا.

يمسك الاختبار الثاني جميع الأرقام التي تحتوي على عدد فردي من الاثنتين في عاملها. الأسلوب Long.numberOfTrailingZeros سريع جدا لأنه يحصل على JIT إد في تعليمات i86 واحدة.

بعد إسقاط الأصفار الزائدة ، يعالج الاختبار الثالث الأرقام المنتهية بـ 011 ، أو 101 ، أو 111 في ثنائي ، وهي ليست مربعات مثالية. كما يهتم أيضًا بالأرقام السالبة وأيضًا مقابض 0.

يعود الاختبار النهائي إلى الحساب double . وبما أن double يحتوي فقط على 53 بتة ، فإن التحويل من long إلى double يشمل التقريب للقيم الكبيرة. ومع ذلك ، فإن الاختبار صحيح (ما لم يكن proof خاطئًا).

محاولة دمج فكرة mod255 لم تكن ناجحة.





perfect-square