/*---------------------------------------------------------------
Info:  The cumulative distribution function 
       of a finite data sample

Call(s): 
       stats::finiteCDF([x1, x2, ..], [p1, p2, ..])
       stats::finiteCDF([[x1, p1], [x2, p2], ..])
       stats::finiteCDF(s <, c1, c2>)

Parameters:
       x1, x2, .. -- arbitrary MuPAD objects. Implicitly, these
                     values are assumed to be ordered: x1 < x2 < ...
                     If some values are numerical, any pair violating
                     x.i < x.j for i < j results in a warning.
       p1, p2, .. -- probability values: symbols or positive 
                     numerical values. In case of numerical values,
                     they  must add up to 1
       s          -- a sample of type stats::sample 
       c1, c2     -- column indices: positive integers.
                     Column c1 of s provides the data x1, x2, ..
                     Column c2 of s provides the data p1, p2, ..
                     There is no need to pass a column index,
                     if s has only two columns

Returns: a procedure

Details: 
   -  The function 
      f:= stats::finiteCDF([x1,x2,..],[p1, p2,..])
      can be called in the form f(x). If x is a symbol
      or x1, x2, .. contains symbols, then a symbolic
      hold(stats::finiteCDF)(...)(x) is returned.
      Otherwise, an appropriate sum of the p.i is returned.

   -  stats::finiteCDF([x1, x2, ..], [p1, p2, ..])(x) is 
      p1 + p2 + ... + p.k, where k is the index
      of the largest value satisfying x.k <= x.

   -  Note: if there are symbolic entries x1, x2, ...,
      the implicit assumption x1 < x2 < ... is used.
      In this case, the results depend on the ordering 
      of the input data!
      If all x.i are numerical, then the entries are
      automatically sorted internally.
      
  -   If x1, x2, ... , x.N  are all numerical, 
      stats::finiteCDF([x1, x2, ..], [1/N, 1/N, .., 1/N])(x)
      coincides with
      stats::empiricalCDF([x1, .. , x.N])(x).

Examples:
   > f:= stats::finiteCDF([3, 2, PI, -1, 5], [0.1, 0.2, 0.3, 0.3, 0.1]):
   > f(-infinity), f(-2), f(1.3), f(3), f(PI), f(3.2), f(5), f(infinity)

                 0, 0, 0.3, 0.6, 0.9, 0.9, 1.0, 1.0
---------------------------------------------------------------*/

stats::finiteCDF := proc()
//------------------------------------------------------
// Builds the cumulative distribution function of a finite
// sample x1, x2, .. with probabilities p1, p2, ..
//
// The arguments are normalized by stats::getdata into the
// two lists [x1, x2, ..], [p1, p2, ..]. Duplicate x values
// are merged (their probabilities are added), purely
// numerical data are sorted, and the partial sums of the
// probabilities are precomputed.
//
// Returns: a procedure f of one argument x. f(x) yields the
//          sum of all p.i with x.i <= x, or a symbolic
//          hold(stats::finiteCDF)(..)(x) if x or some x.i
//          cannot be converted to a float.
// Errors:  "expecting at least one argument", "empty sample",
//          any message string returned by stats::getdata,
//          "the probabilities do not add up to 1".
//------------------------------------------------------
local data, p, CDF, n, i, x, lastx,
      symbolicData, symbolicProb,
      eliminateDuplicates;
option escape;
//------------------------------------------------------
// do use option remember: if you pass
// stats::finiteCDF(data)(x) to a plot function, say,
// stats::finiteCDF(data) is evaluated again and again,
// whenever a float is substituted for x!
//------------------------------------------------------
option remember;
begin

  if args(0) < 1 then
     error("expecting at least one argument")
  end_if:

  //------------------------------------------------------
  data:= stats::getdata(testargs(), "anything", 2, args(1..args(0))):
  if domtype(data) = DOM_STRING then
       // stats::getdata reports problems as a message string
       error(data)
  end_if:

  if testargs() then
     if has([data] , []) then
        error("empty sample"): 
     end_if:
  end_if:

  //----------------------------------------------------
  // Now, data = [x1, x2, ..], [p1, p2, ..];
  //----------------------------------------------------

  // split data into names and probabilities:
  [data, p]:= [data];
  assert(domtype(data) = DOM_LIST):
  assert(nops(data) = nops(p)):

  //----------------------------------------------------
  // eliminateDuplicates(data, p) -- merge duplicate entries.
  // For every entry of data that occurs more than once, only
  // the first occurrence is kept and the probabilities of
  // the later occurrences are added to it.
  // Returns the list [data, p] of the reduced lists.
  //----------------------------------------------------
  eliminateDuplicates:= proc(data, p)
  local m, n, i, j;
  begin
     n:= nops(data):
     m:= nops({op(data)}): // number of distinct entries
     if n = m then
        return([data, p]):
     else // warning("there seem to be duplicate data entries"):
        // walk backwards, so that deleting entry j does not
        // shift the entries that are still to be visited
        for j from n downto 1 do
           // find the *first* occurrence of data[j] in data
           i:= contains(data, data[j]);
           if i = j then
              next; // there is no duplicate of data[j]
           end_if;
           // i <> j: there are duplicates: data[i] = data[j]
           // delete data[j] and add p[j] to p[i]:
           p[i]:= p[i] + p[j];
           delete data[j]; // eliminate duplicate
           delete p[j];    // eliminate duplicate
           n:= n - 1; // current length of data and p
           if n = m then
              // we can be sure there are no more duplicates
              break;
           end_if;
        end_for;
     end_if;
     [data, p];
  end_proc:

  // Do the elimination of the duplicates
  [data, p]:= eliminateDuplicates(data, p);
  assert(nops(data) = nops(p));

  n:= nops(data);

  /* The following does not work: we wish to allow strings in the data! */
  // symbolicData:= bool(indets(data) minus Type::ConstantIdents <> {}):
  symbolicData:= bool(map({op(data)}, domtype@float) <> {DOM_FLOAT});
  symbolicProb:= bool(indets(p) minus Type::ConstantIdents <> {}):

  //----------------------------------------------------
  // Check the implicit assumption x.i <= x.(i+1).
  // Ignore symbolic x.i, just check numerical values.
  // If the user given data are not ascending, then
  // throw a warning
  //----------------------------------------------------
  if symbolicData then
       lastx:= RD_NINF;
       for i from 1 to n do
           x:= float(data[i]);
           if domtype(x) = DOM_FLOAT and
              domtype(lastx) = DOM_FLOAT and
              x <= lastx then
              warning("the sample data are not ascending");
              break; // one warning suffices
           end_if;
           lastx:= x;
       end_for;
  else
     // sort numerical data, keeping each probability with its value
     data:= sort([[data[i],p[i]] $ i = 1..n], (x, y) -> float(x[1]) < float(y[1]));
     [data, p]:= [map(data, op, 1), map(data, op, 2)];
  end_if:

  //---------------------------------------
  // Generate the CDF values. Use exact arithmetic
  // for adding up the p values:
  //---------------------------------------
  CDF:= [0 $ n]; // initialize
  CDF[1]:= p[1]:
  for i from 2 to n do
    CDF[i]:= CDF[i -1] + p[i];
  end_for:

  if not symbolicProb then
     if specfunc::abs(float(CDF[n] - 1)) > 10^(2-DIGITS) then
         error("the probabilities do not add up to 1");
     end_if;
  end_if;

  //---------------------------------------
  // return the following procedure. 
  // It only reads the captured variables data, p, CDF, n,
  // symbolicData, symbolicProb and eliminateDuplicates;
  // all call-time updates go into its own locals.
  //---------------------------------------
  proc(x)
  local _data, _p, _CDF, fx, indexSearch, pairs, resorted, k;
  begin
    if args(0) <> 1 then
       error("expecting one argument"):
    end_if;

    //--------------------------------------------------
    // Probabilities: symbolic values may have an up-date
    //--------------------------------------------------
    if symbolicProb then
       _p:= context(p):
       _CDF:= context(CDF);
       //--------------------------------------------------
       // doublecheck consistency of the updated probability
       // values; same tolerance as at creation time
       //--------------------------------------------------
       if domtype(float(_CDF[n])) = DOM_FLOAT and
          specfunc::abs(float(_CDF[n] - 1)) > 10^(2-DIGITS) then
          error("the probabilities do not add up to 1");
       end_if;
    else
       _p:= p:
       _CDF:= CDF;
    end_if;

    //--------------------------------------------------
    // Data: symbolic values may have an up-date
    //--------------------------------------------------
    if symbolicData then
       _data:= context(data):
       resorted:= FALSE;
       if map({op(_data)}, domtype@float) = {DOM_FLOAT} then
          // By now, all data are numerical: sort them together
          // with their probabilities. The captured p must not
          // be modified here.
          pairs:= sort([[_data[k], _p[k]] $ k = 1..n],
                       (u, v) -> float(u[1]) < float(v[1]));
          [_data, _p]:= [map(pairs, op, 1), map(pairs, op, 2)];
          resorted:= TRUE;
       end_if;
       // the updated values may have introduced duplicates
       [_data, _p]:= eliminateDuplicates(_data, _p);
       if resorted or nops(_p) <> n then
          // order or length changed: recompute _CDF
          _CDF:= [0 $ nops(_p)]; // initialize
          _CDF[1]:= _p[1]:
          for k from 2 to nops(_p) do
            _CDF[k]:= _CDF[k-1] + _p[k];
          end_for:
       end_if;
    else
       _data:= data;
    end_if;

    if x = -infinity then return(0) end_if;
    if x =  infinity then return(1) end_if;
    if x = RD_NINF then return(float(0)) end_if;
    if x = RD_INF then return(float(1)) end_if;

    //------------------------------------------------
    // Nothing can be done if there are symbolic data:
    //------------------------------------------------
    if map({op(_data)}, domtype@float) <> {DOM_FLOAT} then
       return(hold(stats::finiteCDF)(_data, _p)(args()));
    end_if:

    //---------------------------------------------------------------
    // Here we know that all entries in data can be converted to floats
    //---------------------------------------------------------------

    // ----------- check x ----------

    fx:= float(x):

    if domtype(fx) <> DOM_FLOAT then
       // x is symbolic, nothing can be done
       return(hold(stats::finiteCDF)(_data, _p)(args()));
    end_if;
    //---------------------------------------------
    // now we are sure that x is numerical
    //---------------------------------------------

    //--------------------------------------------------------
    // Find the largest data index i satisfying
    // _data[i] <= z by a binary search with runtime O(log(n)).
    // It returns an index i with k <= i <= j.
    // Invariant of the search: _data[k] <= z, and the
    // index searched for lies in k .. j.
    // (The case z < _data[1] is treated independently)
    //--------------------------------------------------------
    indexSearch:= proc(k, j, z)
    local m;
    option noDebug;
    begin
      while k < j - 1 do
        m:= (k + j) div 2;
        if float(z - _data[m]) < 0 then
           j:= m - 1;
        else
           k:= m;
        end_if;
      end_while;
      // here j = k or j = k + 1
      if float(z - _data[j]) < 0 then
         return(k)
      else
         return(j);
      end_if;
    end_proc;
    //--------------------------------------------------
    // Finally: go!
    // Search in 1 .. nops(_data): the call-time elimination
    // of duplicates may have made _data shorter than n.
    // A float argument x yields a float result.
    //--------------------------------------------------
    if float(x - _data[1]) < 0 then
       if domtype(x) = DOM_FLOAT then
          return(float(0));
       else
          return(0);
       end_if;
    else
       if domtype(x) = DOM_FLOAT then
          return(float(_CDF[indexSearch(1, nops(_data), x)]));
       else
          return(_CDF[indexSearch(1, nops(_data), x)]);
       end_if;
    end_if;
  end_proc:
end_proc:
