/*---------------------------------------------------------------
This is the chi-square goodness-of-fit test for testing the
null hypothesis:

  "the data are a sample of independent f-distributed numbers"

for a list data = [x.1,..,x.n] of random quantitiess.
The distribution f may be discrete or continuous;

Call:
   stats::csGOFT( x1, x2, ..,  [[a1, b1], [a2, b2], ..], X = f)
   stats::csGOFT([x1, x2, ..], [[a1, b1], [a2, b2], ..], X = f)
   stats::csGOFT(   s, <c>,    [[a1, b1], [a2, b2], ..], X = f)

Parameters:

  x1, x2, ..         : the statistical data: real numerical values
  a1, b1, a2, b2, .. : cell boundaries: real numerical data or
                       +/- infinity satisfying
                           a1 < b1 <= a2 < b2 <= a3 < ...
                       This defines cells (semi-open intervals)
                             c.i = (a.i, b.i]
                       Their union must cover the 'possibility space', i.e.,
                       the cell probabilites p.i = Pr(a.i < x <= b.i)
                       must add up to 1 for a random variable with
                       the hypothesized distribution specified by X = f.
 
                       Recommendation:
                       Choose the cells such that n*p.i>=5 for all i, 
                       where n is the size of the data sample x1, x2, ..

                       For continuous distributions, equiprobable cells
                       can be computed by stats::equiprobableCells.

                       At least 3 cells must be specified

    X : CDF, PF, or PDF. 
    f : a procedure of function environment representing either
        - a cumulative distribution function (CDF), or
        - a probability density function (PDF), or 
        - a propability function (PF) of a discrete distribution.
        Alternatively, a univariate arithmetical expression.

Returns: the list [PValue = p, StatValue = s, ExpectedCellFrequencies = m] with
      - the attained significance level p of s,
      - the chi-square statistic s, 
      - the minimal expected cell frequency m = n*min(p.i), 
        where n is the sample size and the p.i are the
        expected cell probabilites p.i = Pr(a.i < x <= b.i)
        under the hypothesized distribution f specified by X=f.

Describtion of the statistic::

  The probability p.i of the cell c.i is 
     p.i:= f(b.i)-f(a.i)  in the CDF case
     p.i:= int(f(t),t=a.i..b.i) in the PDF case;
     p.i:= sum(f(j),j=a.i+1,.., b.i) in the PF case

  Therefore, the expected number of data falling into
  cell c.i is n*p.i;
  Let y.i be the number of observed data falling into
  cell c.i, i.e., y.i= |{i: a.i < x.i <= b.i}|.
  The chi-square statistic for the input data is

       S: =sum((y.i-n*p.i)^2/(n*p.i),i=1..k)

Interpretation:
  under the null hypothesis, S is a random variable.
  For large n, its distribution is approximately
  chi-square with k-1 degrees of freedom.
  Therefore, the attained significance level of S
  is 1 - stats::chisquareCDF(k-1)(S). 

  For further details see: 
     DAgostino: Goodness-of-fit Techniques, 
  or Knuth, Seminumerical Algorithms, vol.2 
------------------------------------------------------------*/

stats::csGOFT:=proc()
local data, n, 
      cells, c, cellSet, 
      eq, FLAG, f, ff, unknowns,
      eCDF, ecp, 
      p, oldp, newp, totalp,
      k0, k1, k2, lastRightBoundary,
      i, j, s, t, m;
begin
   if args(0) < 3  then
      error("expecting at least three arguments")
   end_if:

   //-----------------------------------------------------
   // check the distribution. If must have been specified
   // by an equation 'CDF = f', 'PDF = f' or 'PF = f'.
   //-----------------------------------------------------
   eq:= args(args(0)):

   if type(eq) <> "_equal" then
      error("the last argument must be an equation ".
            "specifying the distribution");
   end_if;

   FLAG:= op(eq,1):

   if FLAG <> PF and FLAG <> CDF and FLAG <> PDF then
      error("unknown distribution flag, expecting 'CDF' ".
            "or 'PDF' or 'PF'");
   end_if:

   f:= op(eq,2):
   if {domtype(f)} minus {DOM_PROC, DOM_FUNC_ENV} = {} then
        // f:=  x -> float(f(float(x)))
        f:= float@f@float;
   else if testtype(f, Type::Arithmetical) then
           // search for the identifiers and the indexed
           // identifiers in the expression f, subtract
           // the numerical constants PI, EULER, CATALAN: 

           // k1 =  all identifiers
           k1:= indets(f) minus Type::ConstantIdents;
 
           // k2 = all indexed identifiers. Presently, we
           // do not have indexed ContantIdents, but
           // maybe in the future?

           k2:= misc::subExpressions(f, "_index") minus
                Type::ConstantIdents;

           // problem: if the expression f has x[1],
           // then k1 = {x} and k2 = {x[1]}.
           // Each x[.] in k2 is also contained in
           // k1 as x. Elminate each such x from k1
           // and unify to the 'unknowns' = set of
           //  all identifiers and indexed identifiers

           unknowns:= k2 union (k1 minus indets(k2));

           // convert f to a function x -> f(x)

           case nops(unknowns) 
           of 0 do // f is a numerical constant
                   ff:= float(f);
                   if domtype(ff) <> DOM_FLOAT then
                      error("cannot identify the distribution function");
                   end_if;

                   f:= y -> ff;
                   break;
           of 1 do // f is a univariate expression

                   unknowns:= op(unknowns);
                   ff:= f;
                   f:= y -> float(eval(subs(ff, unknowns = float(y)))):
                   break;
           otherwise
                   error("expecting a *univariate* expression representing ".
                         "the distribution function. Several variables ". 
                         expr2text(op(unknowns))." found.");
           end_case;
      else // f is neither a procedure nor an arithmetical expression
           error("the distribution function must be a ".
                 "procedure or an arithmetical expression");
      end_if;
   end_if:

   //-------------------------------------------------
   // check the cell partitioning
   //          [[a1, b1], [a2, b2], ...],
   // Set c = number of cells
   //-------------------------------------------------
   cells:= args(args(0)-1):

   if domtype(cells) <> DOM_LIST then
      error("expecting a list of cells")
   end_if:

   c:= nops(cells):

   if c < 3 then
     error("expecting at least three cells")
   end_if:

   // convert the list of cells to a set of cells:
   cellSet:= {op(cells)}:

   // are the cells a list of sublists with 2 elements each?
   if map(cellSet, domtype) <> {DOM_LIST} or
      map(cellSet, nops) <> {2} then
      error("the cells (a.i, b.i] must be specified ".
            "by a list of lists [[a1, b1l, [a2, b2], ...]");
   end_if;

   // are all cell boundaries numerical or +/- infinity?
   cellSet:= map(cellSet, op);  // = {a1, b1, a2, b2, ..}

   if  map(cellSet, domtype@float) minus
       {DOM_FLOAT, stdlib::Infinity} <> {} then
       error("illegal cell boundaries. Cell boundaries must ".
             "be convertible to floats or +/- infinity"):
   end_if:

   //-------------------------------------------------
   // check the data
   //-------------------------------------------------
   data:= stats::getdata(testargs(), "numeric_only",
                         1, args(1..args(0) - 2)):

   if domtype(data) = DOM_STRING then
      // stats::getdata returns a string with an error 
      // message if something went wrong
      error(data);
   end_if:
 
   // now, the data are a list
   n:= nops(data):

   if n = [] then
      error("expecting a non-empty sample of data")
   end_if:

   data:= float(data):
 
  //-------------------------------------------
  //-------------------------------------------
  // Parameter checking done. The work starts.
  //-------------------------------------------
  //-------------------------------------------

  cells:= float(cells):

  //-------------------------------------------
  // compute the empirical cell probability for
  // the given data:
  //     ecp[i] = 1/n * (number of data falling
  //                     into cell i)
  //-------------------------------------------

  //-------------------------------------------
  // Some timings taken in MuPAD 2.1:
  // The call
  //      nops(select(data, x -> (k1 < x and x <= k2)))
  // with sample size n (and some suitable time unit)
  // costs  1.2*n (for n ranging from 10 to 10^5).
  // The call 
  //     eCDF:= stats::empiricalCDF(data) 
  // costs  a(n) * n * ln(n) with 
  //
  //      n =   5    10  10^2  10^3  10^4  10^5
  //   a(n) =  3.0  1.2  0.27  0.17  0.16  0.16
   //
  // (asymptotically, these are the costs for the 
  // internal sorting of the data).
  // Version 1 below costs 1.2*c*n ( c = nuumber of cells),
  // version 2 below costs a(n)*n*ln(n).
  // (The costs of calling eCDF(k2) below is O(ln(n))
  // and can be ignored.)
  // Assume n >> 10, c << n. Set a(n) = 0.3, say.
  // Choose the first version, if 1.2*c < a(n)*ln(n), 
  // i.e. if c  < 0.3 * n * ln(n) / (1.2 * ln(n))
  // If n = 10^4, this means:  c < 2.3.
  // If n = 10^5, this means:  c < 2.8.
  // If n = 10^6, this means:  c < 3.5.
  // Hence, the first version is competitive only
  // for extremely large sample size n and
  // extremely small number of cells c.
  // ---> forget version 1!
  //-------------------------------------------

  /*------- version 1 (not competitive)  ------
     ecp:= [0 $ i = 1..c]:
     // the first cell:
     k2:= cells[1][2]:
     ecp[1]:= 1/n*nops(select(data, x -> (x <= k2))):
     // the intermediate cells:
     for i from 2 to c-1 do
         [k1, k2]:= cells[i]:
         ecp[i] := 1/n*nops(select(data, x -> (k1 < x and x <= k2))):
     end_for:
     // the last cell:
     k1:= cells[c][1]:
     ecp[c]:= 1/n*nops(select(data, x -> (k1 < x))):
  -------------------------------------------*/

  //----------------------------------
  // version 2 of computing the
  // empirical cell probabilities.
  // Creating eCDF costs O(n*ln(n)),
  // each call to eCDF costs O(ln(n)):
  //----------------------------------

  eCDF:= stats::empiricalCDF(data):
  ecp:= [0 $ i = 1..c]:

  // the first cell:
  k2:= cells[1][2]:
  ecp[1] := eCDF(k2);
  lastRightBoundary:= k2;
  oldp:= ecp[1];
  // the intermediate cells:
  for i from 2 to c-1 do
      [k1, k2]:= cells[i]:
      newp:= eCDF(k2);
      if k1 < lastRightBoundary then
         error("illegal cell partitioning: the right boundary ".
               expr2text(lastRightBoundary) .
               " of the ".output::ordinal(i - 1).
               " cell is larger than the left boundary ".
               expr2text(k1) .
               " of the ".output::ordinal(i)." cell");
      end_if;
      if k1 = lastRightBoundary then
           // no need to compute eCDF(k1) again,
           // it is stored in oldp.
           ecp[i] := newp - oldp;
      else ecp[i] := newp - eCDF(k1);
      end_if;
      lastRightBoundary:= k2;
      oldp:= newp;
  end_for:
  // the last cell:
  k1:= cells[c][1]:
  if k1 = lastRightBoundary then
       ecp[c] := 1 - oldp;
  else ecp[c] := 1 - eCDF(k1);
  end_if;

  //-------------------------------------
  // compute the expected cell frequencies
  // for the specified distribution.
  // Also, compute the chisquare statistic
  //-------------------------------------
  
  m:= infinity: // minimum of the expected cell frequencies n*Prob(cell[i])
  s:= 0:        // the chi-square statistic

  case FLAG 
  of CDF do
     // the first cell: ignore the left boundary
     // and use -infinity instead by setting 
     // f(first left boundary) = 0
     k2:= cells[1][2]:
     lastRightBoundary:= k2:
     p:= float(f(float(k2)));
     oldp:= p;
     totalp:= p:
     m:= p:
     if m <= 0 then
        error("the expected cell frequency for the ".
               output::ordinal(1)." cell is zero"):
     end_if:
     s:= s + (ecp[1] - p)^2/p:
     for i from 2 to c - 1 do
        // i-th cell = (k1, k2]
        [k1, k2]:= cells[i]:
        if k1 = lastRightBoundary then
             // no nead to compute f(k1) again:
             // it is stored in oldp:
             newp:= float(f(float(k2)));
             p:= newp - oldp: 
        elif k1 > lastRightBoundary then
             newp:= float(f(float(k2)));
             p:= newp - float(f(float(k1))): 
        else // k1 < lastRightBoundary
             error("something's wrong with the cell partitioning: ".
                   "the left boundary of the ".output::ordinal(i). " cell ".
                   "is smaller than the right boundary of the previous cell");
        end_if;
        if domtype(p) <> DOM_FLOAT then
           error("the distribution function does not ".
                 "produce a real numerical value");
        end_if;
        lastRightBoundary:= k2;
        oldp:= newp;
        totalp:= totalp + p:
        m:= min(p,m):
        if m <= 0 then
           error("the expected cell frequency for the ".
                 output::ordinal(i)." cell is zero"):
        end_if:
        s:= s + (ecp[i] - p)^2/p:
     end_for:
     // the last cell: ignore the right boundary
     // and use infinity instead by setting 
     // f(last right boundary) = 1
     k1:= cells[c][1]:
     if k1 = lastRightBoundary then
          p:= 1 - oldp;
     else p:= 1 - float(f(float(k1)));
     end_if;
     totalp:= totalp + p:
     if specfunc::abs(1 - totalp) > 10^(5 - DIGITS) then
        error("the cell probabilities should add up to 1, but ".
              "they add up to ".expr2text(totalp));
     end_if;
     m:= min(p,m):
     if m <= 0 then
        error("the expected cell frequency for the ".
              output::ordinal(c)." cell is zero"):
     end_if:
     s:= s + (ecp[c] - p)^2/p:
     break;
  of PDF do
     t:= genident():
     totalp:= 0:
     for i from 1 to c do
        [k1, k2]:= cells[i]:
        p:= numeric::quadrature(f(t), t=k1..k2): 
        if domtype(p) <> DOM_FLOAT then
           error("the distribution function does not ".
                 "produce a real numerical value");
        end_if;
        totalp:= totalp + p:
        m:=min(p,m):
        if m <= 0 then
           error("the expected cell frequency for the ".
                 output::ordinal(i). " cell is zero"):
        end_if:
        s:= s + (ecp[i] - p)^2/p:
     end_for:
     if specfunc::abs(1 - totalp) > 10^(5 - DIGITS) then
        error("the cell probabilities should add up to 1, but ".
              "they add up to ".expr2text(totalp));
     end_if;
     break;
  of PF do
     totalp:= 0:
     for i from 1 to c do
        [k1, k2]:= cells[i]:
        if k1 = -infinity then 
           error("with the distribution flag 'PF', -infinity ".
                 "is not allowed as a left cell boundary");
        end_if;
        if k2 = infinity then
             k0:= cells[1][1]; // the leftmost cell boundary
             p:= 1 - _plus(f(float(j)) $ j=floor(k0) .. floor(k1)):
        else p:= _plus(f(float(j)) $ j=floor(k1)+1 .. floor(k2)):
        end_if:
        p:= float(p);
        if domtype(p) <> DOM_FLOAT then
           error("the distribution function does not ".
                 "produce a real numerical value");
        end_if;
        totalp:= totalp + p:
        m:=min(p,m):
        if m <= 0 then
           error("the expected cell frequency for the ".
                 output::ordinal(i)." cell is zero"):
        end_if:
        s:= s + (ecp[i] - p)^2/p:
     end_for:
     if specfunc::abs(1 - totalp) > 10^(5 - DIGITS) then
        error("the cell probabilities should add up to 1, but ".
              "they add up to ".expr2text(totalp));
     end_if;
     break;
  end_case;

  // scale from 'probabilities' to 'frequencies':
  s:= n*s;
  m:= n*m;

  /*-------------------------------------------------------------
  // avoid arithmetical underflow in igamma. This may
  // happen for really large s, because igamma(a, s) involves
  // exp(-s/2). Rough check if s is big enough to return float(0)
  // as the PValue. 
  -------------------------------------------------------------*/
  c:= float(c): 
  if   c-1 <=  10^1 and (c-1)*8*10^4 < s then p:= float(0); 
  elif c-1 <=  10^2 and (c-1)*8*10^3 < s then p:= float(0);
  elif c-1 <=  10^3 and (c-1)*8*10^2 < s then p:= float(0); 
  elif c-1 <=  10^4 and (c-1)*8*10^1 < s then p:= float(0);
  elif c-1 >   10^5 and (c-1)*8*10^0 < s then p:= float(0); 
  else  
       p:= igamma((c - 1)/2, s/2)/gamma((c-1)/2);  // = 1 - stats::chisquareCFD(c-1)(s)
  end_if;
  /*-------------------------------------------------------------
  // This still only leaves underflow problems if
  // c - 1 > 10000  and s <= (c-1)*8.
  // Note that igamma(a, s) yields an underflow for s > 10^7
  // but is ok for s < 10^7,
  // This would yield underflows only for the case
  // c - 1 >= s/1.5 > 7*10^6. I do not think that anybody
  // tries to use that many cells! We should be safe.
  -------------------------------------------------------------*/

  return([PValue = p, StatValue = s, MinimalExpectedCellFrequency = m]);

end_proc:
