/*---------------------------------------------------------------
Info:  The quantile function of the empirical distribution
       of a finite data sample

Call(s): 
       stats::empiricalQuantile(x1, x2, ..)
       stats::empiricalQuantile([x1, x2, ..])
       stats::empiricalQuantile(s <, c>)

Parameters:
       x1, x2, .. -- statistical data: real numbers
       s          -- a sample of type stats::sample 
       c          -- a column index: a positive integer.
                     Column c of s provides the data x1, x2, ..
                     There is no need to pass a column index
                     if s has only one non-string column

Returns: a procedure

Details: 
   -  The function f:= stats::empiricalQuantile([x1, x2, ..])
      can be called in the form f(x) or f(x, Averaged).
   -  y:= stats::empiricalQuantile([x1, x2, ..])(x)
      is the smallest real value satisfying
      stats::empiricalCDF([x1, x2, ..])(y) >= x.
      In fact, y is the smallest element of the data x1, x2, ..
      such that stats::empiricalCDF([x1, x2, ..])(y) >= x.
  -   Assume x1 <= x2 <= ...
      Let y = x.k, then stats::empiricalCDF([x1, x2, ..](x.k)
      =  k/n,  i.e., k is the smallest index such that 
      k >= n*x, i.e., k = ceil(n*x).
   -  y:= stats::empiricalQuantile([x1, x2, ..])(x, Averaged)
      returns: 
         (x.k + x.(k+1)) if k = n*x is an integer 1 <= k <= n-1
               x.ceil(k) if k = n*x is not an integer 1 <= k <= n-1.

      Here, the x.i are the original data rearranged in
      non-descending order.

Examples:
  2.1.0 > f:= stats::empiricalQuantile([3, 2, PI, -1, 5]):
  2.1.0 > f(0), f(0.1), f(0.3), f(5/10), f(1)

                        -1, -1, 2, 3, 5

  2.1.0 > f(0.3) = f(0.3, Averaged)

                             2 = 2

  2.1.0 > f(2/5) <> f(2/5, Averaged)

                           2 <> 5/2

---------------------------------------------------------------*/

stats::empiricalQuantile := proc()
local data, n;
option escape;
//------------------------------------------------------
// do use option remember: if you pass
// stats::empiricalQuantile(data)(x) to a plot function, say,
// stats::empiricalQuantile(data) is evaluated again and again,
// whenever a float is substituted for x!
//------------------------------------------------------
option remember;
begin

  if args(0) < 1 then
     error("expecting at least one argument")
  end_if:

  //------------------------------------------------------------
  // stats::getdata accepts "all_data" and "numeric_only".
  // Use "all_data", because with "numeric_only" exact numerical
  // expressions such as sqrt(2), PI etc. would be converted to
  // floats. However, we do need the original input data.
  //------------------------------------------------------------

  data:= stats::getdata(testargs(), "all_data", 1, args(1..args(0))):
  if domtype(data) = DOM_STRING then
       error(data)
  end_if:

  n:= nops(data):

  if testargs() then
     if data = [] then
        error("empty sample"): 
     end_if:
     if nops(select(map(data, float), testtype, DOM_FLOAT)) <> n then
        error("some data could not be converted to floats")
     end_if:
  end_if:

  //----------------------------------------------------
  // sort the data in ascending order. However, we still
  // need the original data, so do not replace the data
  // by a list of float data before sorting. 
  // Note the following timings for some suitable list of
  // numerical data with many multiple elements:
  //   sort(data)                     needs   1 sec
  //   sort(data, (x, y) -> x <  y )  needs  14 sec
  //   sort(data, (x, y) -> x <= y )  needs 300 sec
  // (due to library callbacks, passing a sorting criterion
  //  slows down sort by a factor of 14. Oh, dear! ;-()
  //----------------------------------------------------

  if map({op(data)}, domtype) minus {DOM_FLOAT, DOM_INT, DOM_RAT} = {} then
       data:= sort(data);
  else data:= sort(data, (x, y) -> (float(x) < float(y)));
  end_if:

  //---------------------------------------
  // return the following procedure. It can
  // be called as f(x) or as f(x, Averaged)
  //---------------------------------------
  proc(x)
  local  averaged, fx, k, kk;
  begin
    if args(0) < 1 then
       error("expecting one argument"):
    end_if;
    if args(0) > 2 then
       error("expecting no more than two arguments"):
    end_if;
    averaged:= FALSE;
    if args(0) = 2 then
       if args(2) = Averaged then
         averaged:= TRUE;
       else 
          error("expecting the option 'Averaged' as second argument"):
       end_if;
    end_if;
 
    // ----------- check x ----------
    fx:= float(x):
    if domtype(fx) <> DOM_FLOAT then
       // x is symbolic, nothing can be done
       return(hold(stats::empiricalQuantile)(data)(args()));
    end_if;

    //---------------------------------------------
    // now we are sure that x is numeric
    //---------------------------------------------

    if fx < 0 then
       error("expecting an argument 0 <= x <= 1"):
    end_if;
    if iszero(x) then
       return(data[1])
    end_if;
    if iszero(1 - x) then
       return(data[n])
    end_if;
    if fx > 1 then
       error("expecting an argument 0 <= x <= 1"):
    end_if;

    //---------------------------------------------
    // now we are sure that x is numerical and 0 < x < 1
    //---------------------------------------------

    // trunc and ceil use floats. Clip k to 0<= k <= n via
    // max(.., 0), min(.., n) (just in case, trunc undershoots
    // or ceil overshoots due to float instabilities)

    k:= max(trunc(n*x), 0);
    kk:= min(ceil(n*x), n):

    if not averaged then
       return(data[kk])           
    else
       // S.Huckemann, Feb. 2006: proportional averaging as, e.g., in the R-statistics package
       // seems to be more appropriate than returning (data[k] + data[k + 1])/2:
       // return( (1-x)*data[k] + x*data[k + 1]);
       if k < kk then
            // k = n*x is not an integer  
            return(data[kk])           
       else // n*x is an integer
            // with k = trunc(n*x) = kk = ceil(n*x)
            // and  0 < x < 1 we know that
            // 1 <= k = kk <= n - 1, i.e.,
            // averaging is ok:
            return( (data[k] + data[k + 1])/2);
       end_if
    end_if;
  end_proc;
end_proc:
