/*---------------------------------------------------------------
Info:  The quantile function of a finite data sample

Call(s): 
       stats::finiteQuantile([x1, x2, ..], [p1, p2, ..])
       stats::finiteQuantile([[x1, p1], [x2, p2], ..])
       stats::finiteQuantile(s <, c1, c2>)

Parameters:
       x1, x2, .. -- arbitrary MuPAD objects. Implicitly, these
                     values are assumed to be ordered: x1 < x2 < ...
                     If some values are numerical, any pair violating
                     x.i < x.j for i < j results in a warning.
                     If all values are numerical, internal sorting
                     without a warning is done.
       p1, p2, .. -- probability values: symbols or positive 
                     numerical values. In case of numerical values,
                     they  must add up to 1 (within working precision
                     set by DIGITS, if p1, p2 etc are floats).
       s          -- a sample of type stats::sample 
       c1, c2     -- a column index: a positive integer.
                     Column c1 of s provides the data x1, x2, ..
                     Column c2 of s provides the data p1, p2, ..
                     There is no need to pass a column index,
                     if s has only two columns

Returns: a procedure

Details: 
   -  The function 
      f:= stats::finiteQuantile([x1,x2,..],[p1, p2,..])
      can be called in the form f(x). If x is a symbol
      or p1, p2, .. contains symbols, then a symbolic
      hold(stats::finiteQuantile)(...)(x) is returned.
      Otherwise, one of the elements x1, x2, ... is
      returned.

   -  Note: due to the implicit assumption x1 < x2 < ...,
      the results depend on the ordering of the input data,
      if some of these values are symbols!
      If all x1, x2, .. are numerical, these values are
      sorted automatically.
      
   -  y:= stats::finiteQuantile([x1, x2, ..], [p1, p2, ..])(x)
      is the smallest element of the data x1, x2, ..
      such that stats::finiteCDF([x1, x2, ..])(y) >= x.

  -   If x1, x2, ...  are all numerical, 
      stats::finiteQuantile([x1, x2, .., x.N], [1/N, 1/N, ..])(x)
      coincides with
      stats::empiricalQuantile([x1, x2, .. , x.N])(x).

Examples:
   > f:= stats::finiteQuantile([3, 2, PI, -1, 5], [0.1, 0.2, 0.3, 0.3, 0.1]):
   > f(0), f(0.1), f(0.3), f(5/10), f(1)

                        -1, -1, -1, 3, 5

   > f:= stats::finiteQuantile([[3, 0.1], [2, 0.2], [PI, 0.3], 
                                [-1, 0.3], [5, 0.1] ]):
   > f(0), f(0.1), f(0.3), f(5/10), f(1)

                        -1, -1, -1, 3, 5
---------------------------------------------------------------*/

stats::finiteQuantile := proc()
local data, p, CDF, n, i, x, lastx, 
      symbolicData, symbolicProb, 
      eliminateDuplicates; 
option escape;
//------------------------------------------------------
// do use option remember: if you pass
// stats::finiteQuantile(data)(x) to a plot function, say,
// stats::finiteQuantile(data) is evaluated again and again,
// whenever a float is substituted for x!
//------------------------------------------------------
option remember;
begin

  if args(0) < 1 then
     error("expecting at least one argument")
  end_if:

  //------------------------------------------------------------
  data:= stats::getdata(testargs(), "anything", 2, args(1..args(0))):
  if domtype(data) = DOM_STRING then
       error(data)
  end_if:

  if testargs() then
     if has([data] , []) then
        error("empty sample"): 
     end_if:
  end_if:

  //----------------------------------------------------
  // Now, data = [x1, x2, ..], [p1, p2, ..];
  //----------------------------------------------------

  // split data into names and probabilities:
  [data, p]:= [data];
  assert(domtype(data) = DOM_LIST):
  assert(nops(data) = nops(p)):

  // There may be duplicates in the data. Eliminate them:
  eliminateDuplicates:= proc(data, p)
  local m, n, i, j;
  begin
     n:= nops(data):
     m:= nops({op(data)}):
     if n = m then
        return([data, p]):
     else // warning("there seem to be duplicate data entries"):
        for j from n downto 1 do
        // find the *first* occurrence of data[j] in data
        i:= contains(data, data[j]);
        if i = j then
           next; // there is no duplicate of data[j]
        end_if;
        // i <> j: there are duplicates: data[i] = data[j]
        // delete data[j] and add p[j] to p[i]:
        p[i]:= p[i] + p[j];
        delete data[j]; // eliminate duplicate
        delete p[j];    // eliminate duplicate
        n:= n - 1; // current length of data and p
        if n = m then
           // we can be sure there are no more duplicates
           break;
        end_if;
      end_for;
    end_if;
    [data, p];
  end_proc:

  // Do the elimination of the duplicates
  [data, p]:= eliminateDuplicates(data, p);
  assert(nops(data) = nops(p));

  n:= nops(data):

  /* The following does not work: we wish to allow strings in the data! */
  // symbolicData:= bool(indets(data) minus Type::ConstantIdents <> {}):
  symbolicData:= bool(map({op(data)}, domtype@float) <> {DOM_FLOAT});
  symbolicProb:= bool(indets(p) minus Type::ConstantIdents <> {}):

  //----------------------------------------------------
  // Check the implicit assumption x.i <= x.(i+1).
  // Ignore symbolic x.i, just check numerical values.
  // If the user given data are not ascending, then
  // throw a warning
  //----------------------------------------------------
  if symbolicData then
       lastx:= RD_NINF;
       for i from 1 to n do
           x:= float(data[i]);
           if domtype(x) = DOM_FLOAT and
              domtype(lastx) = DOM_FLOAT and
              x <= lastx then
              warning("the sample data are not ascending");
              break; // one warning suffices
           end_if;
           lastx:= x;
       end_for;
  else
     // sort numerical data
     data:= sort([[data[i],p[i]] $ i = 1..n], (x, y) -> float(x[1]) < float(y[1]));
     [data, p]:= [map(data, op, 1), map(data, op, 2)];
  end_if:

  //----------------------------------------------------
  // Generate the CDF values. Use exact arithmetic
  // for adding up the p values:
  //----------------------------------------------------
  CDF:= [0 $ n]; // initialize
  CDF[1]:= p[1]:
  for i from 2 to n do
    CDF[i]:= CDF[i -1] + p[i];
  end_for:

  if not symbolicProb then
     if specfunc::abs(float(CDF[n] - 1)) > 10^(2-DIGITS) then
         error("the probabilities do not add up to 1");
     end_if;
  end_if;

  //---------------------------------------
  // return the following procedure. 
  //---------------------------------------
  proc(x)
  local _data, _p, _CDF, fx, i, indexSearch;
  begin
    if args(0) <> 1 then
       error("expecting one argument"):
    end_if;

    if symbolicData then
       // some of the symbolic values may have an up-date:
       _data:= context(data):
       if indets(data) minus Type::ConstantIdents = {} then
         // sort numerical data
         _data:= sort([[_data[i],p[i]] $ i = 1..n], (x, y) -> float(x[1]) < float(y[1]));
         [_data, p]:= [map(_data, op, 1), map(data, op, 2)];
       end_if;
    else
       _data:= data;
    end_if;

    if symbolicProb then
       _p:= context(p):
       _CDF:= context(CDF);
       //--------------------------------------------------
       // doublecheck consistency of the updated probability values
       //--------------------------------------------------
       if domtype(float(_CDF[n])) = DOM_FLOAT and
          specfunc::abs(float(_CDF[n] - 1)) > 10^(-DIGITS) then
          error("the probabilities do not add up to 1");
       end_if;
    else
       _p:= p:
       _CDF:= CDF;
    end_if;

    if symbolicData then
       [_data, _p]:= eliminateDuplicates(_data, _p);
       if nops(_p) <> nops(p) then
          // we need to recompute _CDF
          _CDF:= [0 $ nops(_p)]; // initialize
          _CDF[1]:= _p[1]:
          for i from 2 to nops(_p) do
            _CDF[i]:= _CDF[i-1] + _p[i];
          end_for:
       end_if;
    end_if;

    //------------------------------------------------
    // Nothing can be done if there are symbolic 
    // probability values:
    //------------------------------------------------
    if domtype(float(_CDF[n])) <> DOM_FLOAT then
       return(hold(stats::finiteQuantile)(_data, _p)(args()));
    end_if:

    //---------------------------------------------------------------
    // Here we know that all probability values can be converted to floats
    //---------------------------------------------------------------

    // ----------- check x ----------
    fx:= float(x):
    if domtype(fx) <> DOM_FLOAT then
       // x is symbolic, nothing can be done
       return(hold(stats::finiteQuantile)(data, p)(args()));
    end_if;
    //---------------------------------------------
    // now we are sure that x is numerical
    //---------------------------------------------
    if fx < 0 then
       error("expecting an argument 0 <= x <= 1"):
    end_if;
    if iszero(x) then
       return(_data[1])
    end_if;
/*
    //beware of p[n] = 0 !
    if iszero(1 - x) then
       return(_data[n])
    end_if;
*/
    if fx > 1 then
       error("expecting an argument 0 <= x <= 1"):
    end_if;
    //--------------------------------------------------
    // Now we are sure that x is numerical and 0 < x < 1
    //--------------------------------------------------

    //--------------------------------------------------------
    // Recursive utility for the quantile function:
    // Find the data index i satisfying
    // _CDF[i-1] < x <= _CDF[i] by a binary search 
    // with runtime O(log(n)). 
    // It returns an index i with 1 <= i <= n.
    // Invariant of the search: _CDF[k] < x <= _CDF[j].
    // (The case x <= _CDF[1] is treated independently)
    //--------------------------------------------------------
    indexSearch:= proc(k, j, x)
    // search for i in k .. j such that _CDF[i-1] < x <= _CDF[i]
    local m;
    option noDebug;
    begin
       if j <= k + 1 then 
          return(j) 
       end_if;
       m:= (k+j) div 2;
       if float(x - _CDF[m]) <= 0 then 
            indexSearch(k, m, x) // _CDF[k] < x <= _CDF[m]
       else indexSearch(m, j, x) // _CDF[m] < x <= _CDF[j]
       end_if;
    end_proc;

    //--------------------------------------------------
    // Finally: go!
    //--------------------------------------------------
    if float(x - _CDF[1]) <= 0 then
       return(_data[1]);
    else
       return(_data[indexSearch(1, n, x)]);
    end_if;
  end_proc:
end_proc:
