/* ------------------------------------------------------------------------
Calls:   
          stats::frequency(data, <option>)
          stats::frequency(data, n, <option>)
          stats::frequency(data, [n], <option>)
          stats::frequency(data, [a1..b1, a2..b2, ..], <option>)
          stats::frequency(data, [[a1, b1], [a2, b2], ..], <option>)
          stats::frequency(data, Classes = n, <option>)
          stats::frequency(data, Classes = [n], <option>)
          stats::frequency(data, Classes = [a1..b1, a2..b2, ..], <option>)
          stats::frequency(data, Classes = [[a1, b1], [a2, b2], ..], <option>)
          stats::frequency(data, Cells = n, <option>)
          stats::frequency(data, Cells = [n], <option>)
          stats::frequency(data, Cells = [a1..b1, a2..b2, ..], <option>)
          stats::frequency(data, Cells = [[a1, b1], [a2, b2], ..], <option>)

Parameter:

  data - a list, a set, a table, a matrix or an array
         of numerical real data (may be exact expressions
         such as PI, sin(2), 3 + sqrt(5) etc).
  n    - the number of cells (classes): a positive integer.
         create n cells of equal length covering the range
         from min(data) to max(data). For technical reasons,
         however, the left most cell extends to -infinity.
         If no cell specification is given, n = 10 is
         used by default. 
  a1, b1, a2, b2, ...
       - cell (class) boundaries: real numerical values (may
         be exact numerical values such as PI, sqrt(2) etc.). 
         Must satisfy
           a1 <= b1 <= a2 <= b2 <= a3 <= ....
         The cell (class) given by a.i, b.i is the semi-open 
         interval (a.i, b.i]. With CellsClosedLeft, the cells
         with boundaries a.i, b.i represend the semi-open interval
         [a.i, b.i)
         
         Data outside the union of these cells are ignored
         (not tallied into the classes).

Options:
         CellsClosed = Left/Right : 
             The class a.i..b.i is interpreted as [a.i, b.i) with 
             CellsClosed = Left and as (a.i, b.i] with
             CellsClosed = Right
             The default is CellsClosed = Right.
         ClassesClosed = Left/Right is identical with CellsClosed = Left/Right

Return Value:

 table(
   1 = [[a1, b1], NumberOfDataElementsInThisClass, ListOfDataElementsInThisClass],
   2 = [[a2, b2], NumberOfDataElementsInThisClass, ListOfDataElementsInThisClass],
   3 = [[a3, b3], NumberOfDataElementsInThisClass, ListOfDataElementsInThisClass],
  ):

------------------------------------------------------------------------ */

stats::frequency:= proc(data, cells)
local numericaldata, minval, maxval, i, closed_left,
      fcells, classdata, bucketsort1, bucketsort2;
begin
    //-----------------------------
    // check the data
    //-----------------------------
    if args(0) < 1 or 
       ( domtype(data) <> DOM_LIST and
         domtype(data) <> DOM_SET and
         domtype(data) <> DOM_TABLE and
         domtype(data) <> DOM_ARRAY and
         domtype(data) <> stats::sample and
         not data::dom::hasProp(Cat::Matrix) 
       )then
       error("expecting numerical data as a list, a set, a table, ".
             "an array, a matrix, or a stats::sample"):
    end_if;
    case domtype(data) 
    of DOM_LIST do break;
    of DOM_SET do
    of DOM_ARRAY do
         data:= [op(data)];
         break;
    of DOM_TABLE do
         data:= map([op(data)], op, 2);
         break;
    of stats::sample do
         data:= map(data::dom::convert_to(data, DOM_LIST), op);
         break;
    otherwise // data = matrix
         data:= [op(data)];
    end_case;

    if {op(map(data, domtype))} subset {DOM_INT, DOM_RAT, DOM_FLOAT} then
       numericaldata:= TRUE;
    else // there are expressions such as sqrt(2), PI, sin(5) etc.
       numericaldata:= FALSE;
       //-----------------------------------------------------
       // check that all data can be converted to real floats:
       //-----------------------------------------------------
       if {op(map(data, domtype@float))} <> {DOM_FLOAT} then
          error("cannot convert all data elements to real floats"):
       end_if;
    end_if;

    //--------------------------------
    // check for the option CellsClosed = Left
    //--------------------------------
    closed_left:= FALSE;
    if has([args(2..args(0))], CellsClosed = Left) or
       has([args(2..args(0))], ClassesClosed = Left) then
       closed_left:= TRUE;
    end_if;
    if has([args(2..args(0))], CellsClosed = Right) or
       has([args(2..args(0))], ClassesClosed = Right) then
       closed_left:= FALSE;
    end_if;
    
    //-----------------------------
    // check the cell specification
    //-----------------------------
    if args(0) = 1 then
       cells:= 10:  // default
    end_if;

    if type(cells) = "_equal" then
       if lhs(cells) <> Cells and lhs(cells) <> Classes then
          error("2nd argument: expecting an equation 'Classes = integer' ".
                "or 'Classes = [a1..b1, a2..b2, ...]' ".
                "or 'Classes = [[a1, b1], [a2, b2], ..]'");
       end_if;
       cells:= rhs(cells);
    end_if;

    case domtype(cells) 
    of DOM_INT do
       if cells <= 0 then
          error("2nd argument: expecting a positive number of classes");
       end_if;
       break;
    of DOM_LIST do
       if nops(cells) = 0 then
          error("expecting a specification of the classes. Got an empty list.");
       end_if;
       if nops(cells) = 1 and domtype(cells[1]) = DOM_INT then
          cells:= cells[1];
          if cells <= 0 then
             error("2nd argument: expecting a positive number of classes");
          end_if;
          break;
       end_if;
       // replace +- infinity by RD_INF/RD_NINF for speed:
       cells:= subs(cells, [-infinity = RD_NINF, infinity = RD_INF]):
       for i from 1 to nops(cells) do
         if type(cells[i]) = "_range" then
            cells[i]:= [op(cells[i])];
         end_if;
         if type(cells[i]) <> DOM_LIST or
            nops(cells[i]) <> 2 then
            error("2nd argument: wrong specification of class ".expr2text(i).
                  "; expecting a range a..b or a list [a, b]");
         end_if;
         if domtype(float(cells[i][1])) <> DOM_FLOAT and
            cells[i][1] <> -infinity then
            error("2nd argument: wrong specification of class ".expr2text(i).
                  "; the left border ".expr2text(cells[i][1]). 
                  " is not a real numerical value or -infinity");
         end_if;
         if type(float(cells[i][2])) <> DOM_FLOAT and
            cells[i][1] <> -infinity then
            error("2nd argument: wrong specification of class ".expr2text(i).
                  "; the right border ".expr2text(cells[i][1]). 
                  " is not a real numerical value or infinity");
         end_if;
       end_for:
    end_case;

    //-------------------------------------------------------
    // automatic generation of equidistant cells
    //-------------------------------------------------------
    if domtype(cells) = DOM_INT then
       // a value is tallied into cell[i] by bucketsort if
       //      cell[i][1] < value <= cell[i][2].
       // Unfortunately, this does not count the leftmost value.
       // Hence, decrease the minimal value a tiny bit to the left.
       minval := min(data);
       maxval:= max(data);
       if closed_left then
          cells:= [((cells-i)/cells*minval+i/cells*maxval)$i=0..cells-1, RD_INF];
       else
          cells:= [RD_NINF, ((cells-i)/cells*minval+i/cells*maxval)$i=1..cells];
       end_if;
       cells:= zip(cells, cells[2..-1], DOM_LIST);
       fcells:= float(cells);
    else // check the cells
       fcells:= float(cells):
       if {op(map(fcells, cell -> (domtype(cell[1]), domtype(cell[2]))))} <> {DOM_FLOAT}
          then
          error("cannot convert all class boundaries to real floats"):
       end_if;
    end_if;

    // check ordering of cells
    for i from 1 to nops(fcells) do
        if fcells[i][1] > fcells[i][2] then
           error("the left boundary of class ".expr2text(i). 
                 " is bigger than the right boundary"):
        end_if;
    end_for;
    for i from 1 to nops(fcells) - 1 do
        if fcells[i][2] > fcells[i + 1][1] then
           error("the right boundary of class ".expr2text(i).
                 " is bigger than the left boundary of the next class");
        end_if;
    end_for:

    //-------------------------------------------------------
    // tally data into cells
    //-------------------------------------------------------
    // bucketsort1 is faster than bucketsort2 for small nops(cells) 
    if closed_left then
       bucketsort1:= proc(data, buckets)
         local i, dummy;
       begin
            for i from 1 to nops(buckets) do
               if (i = 1 and buckets[i][1] <> RD_NINF) or
                  (i > 1 and buckets[i - 1][2] <= buckets[i][1]) then
                  // throw away all data below the current bucket
                  [dummy, data, dummy]:= split(data, _less, buckets[i][1]);
               end_if;
               [classdata[i], data, dummy]:= split(data, _less, buckets[i][2]);
            end_for:
         end_proc:
    else
       bucketsort1:= proc(data, buckets)
         local i, dummy;
       begin
         for i from 1 to nops(buckets) do
            if (i = 1 and buckets[i][1] <> RD_NINF) or
               (i > 1 and buckets[i - 1][2] < buckets[i][1]) then
               // throw away all data below the current bucket
               [dummy, data, dummy]:= split(data, _leequal, buckets[i][1]);
            end_if;
            [classdata[i], data, dummy]:= split(data, _leequal, buckets[i][2]);
         end_for:
       end_proc:
    end_if;

    // bucketsort2 is faster than bucketsort1 for large nops(cells) 
    if closed_left then
       bucketsort2:= proc(data, locellindex, hicellindex)
         local midcellindex, midvalue, lodata, dummy;
       begin
         if nops(data) = 0 then
           return()
         end_if;
           if locellindex = hicellindex then
             data:= select(data, _less, op(fcells[locellindex], 2));
             classdata[hicellindex]:= 
               select(data, _not@_less, op(fcells[locellindex], 1));
             return():
           end_if;
           midcellindex:= (locellindex + hicellindex) div 2;
           midvalue:= op(cells[midcellindex], 2):
           [lodata, data, dummy]:= split(data, _less, midvalue);
           bucketsort2(lodata, locellindex, midcellindex);
           bucketsort2(data, midcellindex+1, hicellindex);
       end_proc:
    else
       bucketsort2:= proc(data, locellindex, hicellindex)
         local midcellindex, midvalue, lodata, dummy;
       begin
           if nops(data) = 0 then
             return()
           end_if;
           if locellindex = hicellindex then
             data:= select(data, _leequal, op(fcells[locellindex], 2));
             classdata[hicellindex]:= 
               select(data, _not@_leequal, op(fcells[locellindex], 1));
             return():
           end_if;
           midcellindex:= (locellindex + hicellindex) div 2;
           midvalue:= op(cells[midcellindex], 2):
           [lodata, data, dummy]:= split(data, _leequal, midvalue);
           bucketsort2(lodata, locellindex, midcellindex);
           bucketsort2(data, midcellindex+1, hicellindex);
       end_proc:
    end_if;
 
    //-------------------------------------------------------
    // do the bucketsort:
    //-------------------------------------------------------
    classdata:= [[] $ nops(cells)]; // initialize empty buckets
    if numericaldata then
         if nops(cells) < 16 then
            bucketsort1(data, fcells); // sort into buckets
         else
            bucketsort2(data, 1, nops(cells)); // sort into buckets
         end_if;
    else // exaxt data
         if nops(cells) < 1000 then
            bucketsort1(data, fcells); // sort into buckets
         else
            bucketsort2(data, 1, nops(cells)); // sort into buckets
         end_if;
    end_if;

    if numericaldata then
         classdata:= map(classdata, sort);
    else classdata:= map(classdata, prog::sort, float):
    end_if;
    //-------------------------------------------------------
    // return a table
    //-------------------------------------------------------
    cells:= subs(cells, [RD_NINF = -infinity, RD_INF = infinity]):
    table((i = [cells[i], nops(classdata[i]), classdata[i]]) 
          $ i = 1..nops(cells));
end_proc:
