/*---------------------------------------------------------------
Info: The empirical (discrete) cumulative distribution function
      of a finite data sample

Call(s): stats::empiricalCDF(x1, x2, ..)
         stats::empiricalCDF([x1, x2, ..])
         stats::empiricalCDF(s <, c>)

Parameters:  x1, x2, .. -- statistical data: real numbers
             s          -- a sample of type stats::sample
             c          -- a column index: a positive integer.
                           Column c of s provides the data x1, x2, ..
                           There is no need to pass a column index
                           if s has only one non-string column


Returns: a procedure

Details:  f:= stats::empiricalCDF([x1, x2, ..]).
          For any real number x, f(x) returns the relative
          frequency of data elements not larger than x, i.e.:

             f(x) = 1/n * | { x.i; x.i <= x} |

          where n is the sample size. 
          y = f(x) is a rational number with 0 <= y <= 1.

Examples: 
  2.1.0 > f:= stats::empiricalCDF([3, 2, PI, -1, 5]):
  2.1.0 > f(-2), f(-1), f(-0.5), f(3), f(PI), f(infinity)

                   0, 1/5, 1/5, 3/5, 4/5, 1

---------------------------------------------------------------*/

stats::empiricalCDF:= proc()
local raw, numeric, size, rawSorted;
option escape;
//------------------------------------------------------
// Keep option remember: when stats::empiricalCDF(data)(x)
// is handed to a plot function, say, the call
// stats::empiricalCDF(data) is re-evaluated each time a
// float is substituted for x. Remembering the result
// avoids rebuilding the distribution function every time.
//------------------------------------------------------
option remember;

begin
  if args(0) = 0 then
     error("expecting at least one argument")
  end_if:

  // The exact input data are not ordered yet. The returned
  // procedure sorts them lazily, the first time it has to
  // produce a symbolic result, and then sets this flag.
  rawSorted:= FALSE:

  //----------------------------------------------------
  // Collect the data via stats::getdata in "all_data" mode.
  // Per the original author's note, the alternative mode
  // "numeric_only" would convert exact numerical expressions
  // such as sqrt(2), PI etc. to floats.
  //----------------------------------------------------
  raw:= stats::getdata(testargs(), "all_data", 1, args(1..args(0))):
  if domtype(raw) = DOM_STRING then
     // a string result is raised as the error message
     error(raw)
  end_if:

  size:= nops(raw):
  numeric:= map(raw, float):

  // These checks run only when testargs() yields TRUE
  if testargs() then
     if raw = [] then
        error("empty sample"):
     elif nops(select(numeric, testtype, DOM_FLOAT)) <> size then
        error("some data could not be converted to floats")
     end_if:
  end_if:

  // ascending float copy of the data, used for the numerical lookup
  numeric:= sort(numeric):

  //-------------------------------------------------------
  // The result: a procedure x -> (number of data <= x)/size.
  // It reads raw, numeric, size and rawSorted from the
  // environment of this generating call (option escape).
  //-------------------------------------------------------
  proc(x)
  local xf, left, right, mid;
  begin

    if args(0) <> 1 then
       error("expecting one argument"):
    end_if;

    // the limits need no search
    if x = -infinity then
       return(0)
    elif x = infinity then
       return(1)
    end_if;

    xf:= float(x):

    if domtype(xf) <> DOM_FLOAT then
       // x has no float value: answer with an unevaluated call.
       // For consistency of such symbolic returns with
       // stats::empiricalQuantile, the exact data are shown in
       // sorted order. Sorting is done once only; the sorted list
       // and the flag are kept in the environment of the
       // generating procedure stats::empiricalCDF.
       if not rawSorted then
          raw:= sort(raw, (a, b) -> float(a) < float(b)):
          rawSorted:= TRUE;
       end_if;
       return(hold(stats::empiricalCDF)(raw)(x));
    end_if:

    // x below the smallest / not below the largest data element
    if xf < numeric[1] then return(0) end_if:
    if xf >= numeric[size] then return(1) end_if:

    // Binary search for the largest index i, 1 <= i < size, with
    //     numeric[i] <= xf < numeric[i + 1].
    // Throughout, numeric[left] <= xf holds and i lies in the
    // index range left .. right.
    left:= 1;
    right:= size;
    while right - left > 1 do
      // (left + right) div 2 = floor((left + right)/2);
      // div is about twice as fast as floor
      mid:= (left + right) div 2:
      if xf < numeric[mid] then
         right:= mid - 1:
      else
         left:= mid:
      end_if;
    end_while;

    // at most two candidates remain: take right if it still qualifies
    if numeric[right] <= xf then
       left:= right:
    end_if:
    return(left/size)
  end_proc:
end_proc:
