///
/// This file is part of Rheolef.
///
/// Copyright (C) 2000-2009 Pierre Saramito <Pierre.Saramito@imag.fr>
///
/// Rheolef is free software; you can redistribute it and/or modify
/// it under the terms of the GNU General Public License as published by
/// the Free Software Foundation; either version 2 of the License, or
/// (at your option) any later version.
///
/// Rheolef is distributed in the hope that it will be useful,
/// but WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
/// GNU General Public License for more details.
///
/// You should have received a copy of the GNU General Public License
/// along with Rheolef; if not, write to the Free Software
/// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
/// 
/// =========================================================================

#ifdef _RHEOLEF_HAVE_MPI
// Note: this file is recursively included by "polymorphic_array_mpi.cc" 
// with growing "N" and in the namespace "rheolef"

/*F:
NAME: mpi_polymorph_assembly_begin -- for polymorph_array (@PACKAGE@ @VERSION@)
DESCRIPTION:
  Start a dense polymorph array matrix assembly.
COMPLEXITY: 
  Assume that stash has indexes in increasing order.
  cpu complexity : O(stash.size + nproc)
  memory complexity : O(stash.size + nproc)
  msg size complexity : O(stash.size + nproc)
  
  When assembling a finite element matrix, the stash.size is at boundaries
  of the mesh partition, and stash.size = O(nrow^alpha), where alpha=(d-1)/d
  and d=2,3. Using nproc <= O(nrow^alpha) is performant.
NOTE:
  The stash is expected to be sorted by increasing rows and columns.

  Inspired by PETSc (petsc/src/vec/vec/impls/mpi/pdvec.c), here with
  a pre-sorted stash, thanks to the STL map data structure.
AUTHORS:
    LMC-IMAG, 38041 Grenoble cedex 9, France
    | Pierre.Saramito@imag.fr
DATE:   23 march 1999
END:
*/

//<mpi_polymorphic_assembly_begin:
// Partial specialization, for the current value of N, of the functor that
// starts the assembly of a distributed polymorphic array.
// N is not a template parameter of this specialization: it is supplied by the
// including file (this file is included repeatedly with growing N) and, since
// it is also passed to BOOST_PP_REPEAT further down, it has to be a
// preprocessor macro expanding to an integer literal.
// Stash   : container of the pending entries; the implementation reads its
//           per-variant members _stack_0 ... _stack_{N-1}.
// Message : buffer type exposing a "data" member and a per-variant "waits"
//           member, both filled by operator().
template <class Stash, class Message>
struct mpi_polymorphic_assembly_begin_t<Stash,Message,N> {
  void operator() (
     // input:
        const Stash&                       stash,
        const distributor&                 ownership,
     // output:
        Message&                           receive,		// buffer
        Message&                           send, 	// buffer
        boost::array<typename Stash::size_type,N>& receive_max_size) const;
};


// Start the assembly of a distributed polymorphic array: every non-local
// entry stored in the stash is posted (non-blocking) to the process that
// owns its global index, and the matching non-blocking receives are posted.
//
// input:
//   stash      pending entries, one map "_stack_k" per variant k in [0,N);
//              the key of each map entry is the global index of the entry.
//              The keys of each map are assumed to be visited in increasing
//              order (the owner search below never goes backward).
//   ownership  distribution of the global indexes: ownership[p] and
//              ownership[p+1] bound the index range owned by process p.
// output:
//   receive    receive.data is resized to receive_nproc*receive_max_size
//              slots per variant; one request per expected message is
//              appended to receive.waits[k].
//   send       send.data is built from the stash; one request per posted
//              message is stored in send.waits[k].
//   receive_max_size[k]
//              largest size, over all senders, of a message of variant k
//              addressed to this process.
//
// The function returns while the requests are still pending.
// NOTE(review): completion is presumably performed by a matching
// "assembly end" step that waits on receive.waits / send.waits -- confirm.
template <class Stash, class Message>
void
mpi_polymorphic_assembly_begin_t<Stash,Message,N>::operator() (
     // input:
        const Stash&                       stash,
        const distributor&                 ownership,
     // output:
        Message&                           receive,		    // buffer
        Message&                           send,		    // buffer
        boost::array<typename Stash::size_type,N>& receive_max_size) const
{
trace_macro ("assembly_begin...");
    typedef typename Stash::size_type      size_type;
    const size_type       _n_variant = Stash::_n_variant; // should be = N

    mpi::communicator     comm    = ownership.comm();
    size_type             my_proc = ownership.process();
    size_type             nproc   = ownership.n_process();
    // one fresh tag per variant, so that messages of different variants
    // cannot be matched with each other
    boost::array<distributor::tag_type,_n_variant> tag;
    for  (size_type k = 0; k < _n_variant; k++) {
       tag[k] = distributor::get_new_tag();
    }
    // ----------------------------------------------------------------
    // 1) count the messages contained in stash by process id
    // ----------------------------------------------------------------
    // assume that stash elements are sorted by increasing stash_idx (e.g. stash = stl::map)
    // cpu complexity = O(stash.size + nproc)
    // mem complexity = O(stash.size + nproc)
    // msg_size[k][p] : number of entries of variant k owned by process p
    // msg_mark[k][p] : 1 when at least one such entry exists, 0 otherwise
    // send_nproc[k]  : number of processes p with msg_mark[k][p] == 1
    boost::array<std::vector<size_type>,_n_variant>  msg_size;
    boost::array<std::vector<size_type>,_n_variant>  msg_mark;
    for  (size_type k = 0; k < _n_variant; k++) {
      msg_size[k].resize (nproc);
      msg_mark[k].resize (nproc);
      std::fill (msg_size[k].begin(), msg_size[k].end(), 0);
      std::fill (msg_mark[k].begin(), msg_mark[k].end(), 0);
    }
    boost::array<size_type,_n_variant>  send_nproc;
    std::fill (send_nproc.begin(), send_nproc.end(), 0);
    // The owner search resumes from the owner of the previous entry (iproc is
    // not reset between entries): this is what requires increasing keys.
    // Only block comments may be used inside the macro bodies below, because
    // of the line continuations.
#define _RHEOLEF_counting(z,k,unused)							\
    {											\
      size_type iproc = 0;								\
      for (typename Stash::map##k##_type::const_iterator iter = stash._stack_##k.begin(),\
                                                         last = stash._stack_##k.end();	\
                 iter != last; ++iter) {						\
        const size_type par_index = (*iter).first;					\
        for (; iproc < nproc; iproc++) {						\
          if (par_index >= ownership[iproc] && par_index < ownership[iproc+1]) {	\
            msg_size [k][iproc]++;							\
            if (!msg_mark[k][iproc]) {							\
                 msg_mark[k][iproc] = 1;						\
                 send_nproc[k]++;							\
            }										\
            break;									\
          }										\
        }										\
        assert_macro (iproc != nproc, "bad stash data: processor range error (0:nproc-1)."); \
      }											\
    } /* end block */
    BOOST_PP_REPEAT(N, _RHEOLEF_counting, ~)
#undef _RHEOLEF_counting
    // ----------------------------------------------------------------
    // 2) avoid to send message to my-proc in counting
    // ----------------------------------------------------------------
    // NOTE(review): entries owned by my_proc are only removed from the
    // counters; the send offsets computed in step 7 are correct only if
    // send.data holds no such entries -- confirm that the stash never
    // contains locally owned indexes.
    for  (size_type k = 0; k < _n_variant; k++) {
      if (msg_size [k][my_proc] != 0) {
	  msg_size [k][my_proc] = 0;
	  msg_mark [k][my_proc] = 0;
	  send_nproc[k]--;
      }
    }
    // ----------------------------------------------------------------
    // 3) compute number of messages to be sent to my_proc
    // ----------------------------------------------------------------
    // msg complexity : O(nproc) or O(log(nproc)), depending on reduce implementation 
    // summing the 0/1 marks over all processes gives, at index my_proc,
    // the number of processes that have something to send to my_proc
    boost::array<size_type,_n_variant>  receive_nproc;
    std::vector<size_type> work (nproc);
    for  (size_type k = 0; k < _n_variant; k++) {
      std::fill (work.begin(), work.end(), 0);
      mpi::all_reduce (
	comm, 
        msg_mark[k].begin().operator->(),
	nproc,
	work.begin().operator->(),
	std::plus<size_type>());
      receive_nproc[k] = work [my_proc];
trace_macro ("assembly_begin: receive_nproc[T"<<k<<"] = " << receive_nproc[k]);
    }
    // ----------------------------------------------------------------
    // 4) compute messages max size to be sent to my_proc
    // ----------------------------------------------------------------
    // msg complexity : O(nproc) or O(log(nproc)), depending on reduce implementation 
    for  (size_type k = 0; k < _n_variant; k++) {
      std::fill (work.begin(), work.end(), 0);
      mpi::all_reduce (
        comm,
        msg_size[k].begin().operator->(),
        nproc,
	work.begin().operator->(),
        mpi::maximum<size_type>());
      receive_max_size[k] = work [my_proc];
trace_macro ("assembly_begin: receive_max_size[T"<<k<<"] = " << receive_max_size[k]);
    }
    // ----------------------------------------------------------------
    // 5) post receive: exchange the buffer addresses between processes
    // ----------------------------------------------------------------
    // Each message will consist of ordered pairs (global index,value).
    // since we don't know how long each individual message is,
    // we allocate the largest : receive_nproc*receive_max_size
    // potentially, this is a lot of wasted space
    // TODO: how to optimize the receive.data buffer ?
    // cpu complexity : O(nproc)
    // mem complexity : O(nproc*(stash.size/nproc)) = O(stash.size), worst case ?
    // msg complexity : O(nproc) 
    boost::array<size_type,_n_variant> data_size;
    for  (size_type k = 0; k < _n_variant; k++) {
      data_size[k] = receive_nproc[k]*receive_max_size[k];
    }
    receive.data.resize (data_size);
    // the ik_receive-th message of variant k lands in its own slot of
    // receive_max_size[k] elements, whatever its sender is (any_source)
    // NOTE(review): receive.waits[k] is appended to without being cleared;
    // presumably the caller provides it empty -- confirm.
#define _RHEOLEF_irecv(z,k,unused)							\
    for (size_type ik_receive = 0; ik_receive < receive_nproc[k]; ik_receive++) {	\
      mpi::request ik_req = comm.irecv (						\
	  mpi::any_source,								\
	  tag[k],									\
          receive.data._stack_##k.begin().operator->() + ik_receive*receive_max_size[k],\
	  receive_max_size[k]);								\
      receive.waits[k].push_back (std::make_pair(ik_receive, ik_req));			\
    }
    BOOST_PP_REPEAT(N, _RHEOLEF_irecv, ~)
#undef _RHEOLEF_irecv
    // ----------------------------------------------------------------
    // 6) copy stash in send buffer
    // ----------------------------------------------------------------
    // since the stash is sorted by increasing order => simple copy
    // cpu complexity : O(stash.size)
    // mem complexity : O(stash.size)
    send.data.build (stash);
    // ---------------------------------------------------------------------------
    // 7) do send
    // ---------------------------------------------------------------------------
    // cpu complexity : O(nproc)
    // mem complexity : O(send_nproc) \approx O(nproc), worst case
    // msg complexity : O(stash.size)
    // The requests are appended with push_back below, so the lists have to
    // start empty: resizing them to send_nproc[k] beforehand would leave
    // send_nproc[k] default-constructed entries in front of the real requests.
    for  (size_type k = 0; k < _n_variant; k++) {
      send.waits[k].clear ();
    }
    {
      // i_send[k]  : number of messages of variant k posted so far
      // i_start[k] : offset in send.data._stack_k of the next message
      boost::array<size_type,_n_variant> i_send;
      boost::array<size_type,_n_variant> i_start;
      std::fill (i_send.begin(),  i_send.end(),  0);
      std::fill (i_start.begin(), i_start.end(), 0);
#define _RHEOLEF_isend(z,k,unused)						\
        size_type i##k##_msg_size = msg_size [k][iproc];			\
        if (i##k##_msg_size != 0) {						\
	  trace_macro ("isend[T"<<k<<"]: msg_size[proc="<<iproc<<"]="<<i##k##_msg_size); \
          mpi::request i##k##_req = comm.isend (				\
	    iproc,								\
	    tag[k],								\
	    send.data._stack_##k.begin().operator->() + i_start[k],		\
	    i##k##_msg_size);							\
          send.waits[k].push_back (std::make_pair(i_send[k],i##k##_req));	\
          i_send [k]++;								\
          i_start[k] += i##k##_msg_size;					\
	}
      for (size_type iproc = 0; iproc < nproc; iproc++) {
        BOOST_PP_REPEAT(N, _RHEOLEF_isend, ~)
      }
#undef _RHEOLEF_isend
      // sanity check: one request per destination process counted in steps 1-2
      for (size_type k = 0; k < _n_variant; k++) {
        assert_macro (i_send[k] == send_nproc[k], "unexpected number of posted send requests.");
      }
    } // end block
trace_macro ("assembly_begin done");
}
#endif // _RHEOLEF_HAVE_MPI
