/*
 * Written by Bastien Chevreux (BaCh)
 *
 * Copyright (C) 2006 and later by Bastien Chevreux
 *
 * All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the 
 * Free Software Foundation, Inc., 
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
 * 
 */

// 	$Id$	

// Version identification string (expanded by the revision control system).
// It is left out of lint runs. The functions hasEditableOvercallData() and
// hasPacBioData() below reference it through "(void) vcid3;", presumably to
// keep the compiler from warning about an unused static variable.
#ifndef lint
static char vcid3[] = "$Id$";
#endif /* lint */

#include "contig.H"

// Uncomment the following line to switch on the CEBUG / CEBUGF debug output
// macros defined right below.
//#define CEBUGFLAG

// CEBUG / CEBUGF: with CEBUGFLAG defined, the argument is streamed to cout
// and cout is flushed; without it, both macros expand to nothing (so the
// argument is not even compiled).
#ifdef CEBUGFLAG
#define CEBUG(bla)   {cout << bla; cout.flush();}
#define CEBUGF(bla)  {cout << bla; cout.flush();}
#else
#define CEBUG(bla)
#define CEBUGF(bla)
#endif


// paranoiaBUGSTAT / paranoiaBUGIF: extra consistency checks which are only
// compiled in when PARANOIABUGTRACKFLAG is defined.
//  paranoiaBUGSTAT(statement)       executes the statement
//  paranoiaBUGIF(ifcond, statement) executes the statement if ifcond is true
// Without the flag both expand to nothing.
#ifdef PARANOIABUGTRACKFLAG
#define paranoiaBUGSTAT(statement) { statement;}
#define paranoiaBUGIF(ifcond, statement) { if(ifcond) {statement;}}
#else
#define paranoiaBUGSTAT(statement)
#define paranoiaBUGIF(ifcond, statement)
#endif




bool Contig::hasEditableOvercallData() const
{
  (void) vcid3;

#if CPP_READ_SEQTYPE_END != 6
#error "This code is made for 6 sequencing types, adapt!"
#endif

  // TODO: PacBio???
  for(uint32 readnr=0; readnr<CON_reads.size(); readnr++){
    if(CON_reads[readnr].read.isSequencingType(Read::SEQTYPE_454GS20)) return true;
    if(CON_reads[readnr].read.isSequencingType(Read::SEQTYPE_IONTORRENT)) return true;
    //if(CON_reads[readnr].read.isSequencingType(Read::SEQTYPE_SOLEXA)) return true;
  }

  return false;
}

bool Contig::hasPacBioData() const
{
  (void) vcid3;

  for(uint32 readnr=0; readnr<CON_reads.size(); readnr++){
    if(CON_reads[readnr].read.isSequencingType(Read::SEQTYPE_PACBIO)) return true;
  }

  return false;
}




/*************************************************************************
 *
 *
 *
 *
 *************************************************************************/

// Debug output for the following function is switched off by the empty
// definition; swap the two lines to switch it on.
// NOTE(review): if CEBUGFLAG was defined further up, CEBUG already has a
//  non-empty definition and this line redefines it -- confirm that this is
//  acceptable for the compilers in use.
#define CEBUG(bla)
//#define CEBUG(bla)   {cout << bla; cout.flush();}
/*
 * Looks at all PacBio reads of the contig which have a readpool id
 * (orpid >= 0) and searches them for runs consisting only of gap ('*') and
 * 'N'/'n' characters. For each such run, the read is compared column by
 * column with the consensus to estimate by how many bases the run is too
 * short or too long:
 *   - gap in read, no gap in consensus:  estimate +1
 *   - N in read, gap in consensus:       estimate -1
 * Runs containing at least 5 Ns and having an estimate != 0 are stored
 * (with a damped estimate) as pbdse_t in pbdsev.
 *
 * pbdsev   output; cleared first, then filled with one entry per accepted
 *          run: rid (orpid of the read), rposs / rpose (first / last
 *          position of the run in the read, as computed from
 *          contigPos2RawReadPos()) and changeestim (damped estimate).
 *
 * Returns the largest absolute damped estimate that was stored (0 if
 * nothing was stored).
 *
 * NOTE(review): a run is only evaluated when a base other than '*'/'N'/'n'
 *  follows it. A run which reaches the end of the clipped read is silently
 *  dropped -- confirm that this is intended.
 */
uint32 Contig::createPacBioDarkStrobeEdits(list<pbdse_t> & pbdsev)
{
  FUNCSTART("uint32 Contig::createPacBioDarkStrobeEdits(list<pbdse_t> & pbdse)");

  pbdsev.clear();

  // largest absolute (damped) change estimate seen, this is the return value
  int32 maxce=0;

  string consseq;
  vector<base_quality_t> consqual;

  // the consensus the reads get compared to (consqual is fetched but not
  //  used in this function)
  newConsensusGet(consseq, consqual);

  vector<contigread_t>::const_iterator crI=CON_reads.begin();

    // Counters which only feed the debug output at the end of the function.
    // An estimate counts as good / medium / bad if it brings the number of
    //  Ns closer to / equally close to / farther away from 100 (see below).
    int32 dbg_goodestim=0;
    int32 dbg_medestim=0;
    int32 dbg_badestim=0;
    int32 dbg_sumgoodestim=0;
    int32 dbg_summedestim=0;
    int32 dbg_sumbadestim=0;

  for(; crI != CON_reads.end(); crI++){
    // only PacBio reads which have a readpool id
    if(crI->orpid<0
       || !crI->read.isSequencingType(Read::SEQTYPE_PACBIO)) continue;

    const Read & actread = crI->read;

    BUGIFTHROW(actread.getLenClippedSeq() == 0, "Read: " << actread.getName() << " has 0 length?");

    // The read is always walked with increasing read position (arpos).
    // For reads in reverse direction the contig position therefore starts
    //  at the last column covered by the read and runs backwards.
    int32 cpos=crI->offset;
    int32 cposincr=1;
    if(crI->direction < 0) {
      cposincr=-1;
      cpos+=actread.getLenClippedSeq()-1;
    }
    int32 arpos=crI->contigPos2RawReadPos(cpos);

    // ccI is kept in sync with cpos
    cccontainer_t::const_iterator ccI=CON_counts.begin();
    advance(ccI,cpos);

    char rbase;                // base of the read at arpos
    char cbase;                // consensus base at cpos
    int32 runcounter=0;        // length of the current */N run (0: no run)
    int32 ncounter=0;          // number of Ns in the current run
    int32 gapcounter=0;        // number of gaps in the current run
    int32 changeestim=0;       // estimated length change for the current run

    // set when the current position terminates a run
    bool runend=false;

    // start/stop of run in read (unclipped read pos)
    int32 arposs=0;
    int32 arpose=0;
    for(int32 ri=0; ri<actread.getLenClippedSeq(); ri++, arpos++, cpos+=cposincr, ccI+=cposincr){
      // both branches currently read the base from the forward sequence;
      //  only gaps and Ns are looked at below
      if(crI->direction < 0) {
	//rbase=actread.getBaseInComplementSequence(arpos);
	rbase=actread.getBaseInSequence(arpos);
      }else{
	rbase=actread.getBaseInSequence(arpos);
      }
      //cout << actread.getName() << " " << arpos << " " << rbase << "\t" << cpos << '\t' << consseq[cpos] << '\t' << changeestim << endl;

      // a consensus gap is treated as 'N' if the column has more Ns
      //  than gaps
      cbase=consseq[cpos];
      if(cbase == '*'){
	if(ccI->N > ccI->star) cbase ='N';
      }

      if(runcounter>0){
	// inside a run: extend it or mark its end
	if(rbase=='*'){
	  runcounter++;
	  gapcounter++;
	  // read has a gap where the consensus has none: read too short
	  if(cbase!='*'){
	    changeestim++;
	  }
	}else if(rbase=='N' ||rbase=='n'){
	  runcounter++;
	  ncounter++;
	  // read has an N where the consensus has a gap: read too long
	  if(cbase=='*'){
	    changeestim--;
	  }
	}else{
	  // any other base terminates the run, last run position is the
	  //  one before
	  runend=true;
	  arpose=arpos-1;
	}
      }else{
	// not in a run: a gap or an N starts a new one
	if(rbase=='*'){
	  arposs=arpos;
	  runcounter++;
	  gapcounter++;
	  if(cbase!='*'){
	    changeestim++;
	  }
	}else if(rbase=='N' ||rbase=='n'){
	  arposs=arpos;
	  runcounter++;
	  ncounter++;
	  if(cbase=='*'){
	    changeestim--;
	  }
	}
      }
      if(runend){
	CEBUG("runend. rc: " << runcounter << "\tgc: " << gapcounter << "\tnc: " << ncounter << "\tce: " << changeestim << "\ts: " << arposs << "\te: " << arpose << endl);
	// only runs with at least 5 Ns and a non-zero estimate are kept
	if(ncounter>=5 && changeestim !=0){
	  // on 100/100 data
	  //direction good in 80% of the time when using 2/3 of change estimate
	  // on 100/100 data
	  // looks like 3/5 is sweet spot

	  //if(abs(changeestim)>2) {
	  //  changeestim=changeestim*3/5;
	  //} else if(abs(changeestim)==2) {
	  //  changeestim/=2;
	  //}

	  // damp the estimate: |estimate| > 3 is scaled by 3/5,
	  //  |estimate| of 2 or 3 is halved (giving +/-1),
	  //  |estimate| of 1 is kept (integer arithmetic throughout)
	  if(abs(changeestim)>3) {
	    changeestim=changeestim*3/5;
	  } else if(abs(changeestim)!=1) {
	    changeestim/=2;
	  }

	  {
	    pbdse_t ptmp;
	    ptmp.rid=crI->orpid;
	    ptmp.rposs=arposs;
	    ptmp.rpose=arpose;
	    ptmp.changeestim=changeestim;
	    pbdsev.push_back(ptmp);
	  }
	  CEBUG("Stored " << actread.getName() << "\tce: " << changeestim << endl);

	  maxce=max(abs(changeestim),maxce);
	  // Debug statistics only: classify the estimate by whether it
	  //  moves the N count towards 100.
	  // NOTE(review): the 100 presumably is the true dark stretch
	  //  length of the "100/100" test data mentioned above; it has no
	  //  influence on the stored edits.
	  int32 newestim=ncounter+changeestim;
	  if(abs(100-ncounter) > abs(100-newestim)){
	    dbg_goodestim++;
	    dbg_sumgoodestim+=changeestim;
	  }else if(abs(100-ncounter) == abs(100-newestim)){
	    dbg_medestim++;
	    dbg_summedestim+=changeestim;
	  }else{
	    dbg_badestim++;
	    dbg_sumbadestim+=changeestim;
	  }
	}
	// reset the run state; the base which ended the run is neither gap
	//  nor N and therefore cannot start a new run itself
	runcounter=0;
	gapcounter=0;
	ncounter=0;
	changeestim=0;
	arposs=0;
	arpose=0;
	runend=false;
      }
    }
  }

  CEBUG("Good: " << dbg_goodestim << "\tSum: " << dbg_sumgoodestim
	<< "\nMed : " << dbg_medestim << "\tSum: " << dbg_summedestim
	<< "\nBad : " << dbg_badestim << "\tSum: " << dbg_sumbadestim
	<< "\nMax : " << maxce
	<< endl);

  // result list is left in the order the runs were found
  //pbdsev.sort(Contig::pbdse_t_comparator);

  FUNCEND();
  return static_cast<uint32>(maxce);
}
//#define CEBUG(bla)



/*************************************************************************
 *
 * check stars only / N only columns and remove them
 * readjust the offsets if needed
 *
 *************************************************************************/

/*
 * Deletes contig columns which consist only of gaps (and, on request,
 * columns judged to consist only of Ns) from all reads covering them.
 *
 * from, to         contig column range to check; the columns
 *                  from ... to-1 are looked at, shortened if the range
 *                  extends beyond the end of CON_counts. Throws (via
 *                  BUGIFTHROW) if to < from.
 * alsononly        if true, columns where the A, C, G and T counters all
 *                  equal the N counter and the X counter is 0 are deleted,
 *                  too (presumably this is how an N-only column shows up
 *                  in the counters -- confirm against the counting code).
 * mincov           only columns with total_cov >= mincov are candidates.
 * updateconcounts  if true, the column is also erased from CON_counts;
 *                  if false, CON_counts is left as it is and does not fit
 *                  the reads anymore afterwards.
 *
 * Only columns whose backbone character is '*' or '@' are candidates.
 *
 * Side effects: the cached fixed consensus is dropped, the contig is
 * definalised, the offsets of reads starting behind a deleted column are
 * decremented and consensus tags are shifted via updateTagBaseDeleted().
 *
 * The range is worked on from its end towards its start, so that a
 * deletion does not shift the columns which still need to be checked.
 */
void Contig::deleteStarOnlyColumns(int32 from, int32 to, bool alsononly, uint32 mincov, bool updateconcounts)
{
  FUNCSTART("void Contig::deleteStarOnlyColumns(int32 from, int32 to, bool alsononly, uint32 mincov, bool updateconcounts)");

  paranoiaBUGSTAT(checkContig());

  BUGIFTHROW(to<from, "to < from ?");

  if(CON_fixedconsseq.size()){
    nukeSTLContainer(CON_fixedconsseq);
    nukeSTLContainer(CON_fixedconsqual);
  }
  definalise();

  int32 checklen=to-from;

  if(checklen>0){
    // Clamp the range to the end of the contig. The comparison is done in
    //  signed arithmetic: comparing against the unsigned size() directly
    //  converts a negative end position into a huge value and "clamps" it
    //  to a nonsensical length.
    const int32 numcolumns=static_cast<int32>(CON_counts.size());
    if(from+checklen>numcolumns) checklen=numcolumns-from;


    CEBUG("Starcheck from: " << from << " to " << from+checklen << "Also N only" << alsononly <<endl);

    //{
    //  // give out the reads for debuging purposes
    //  vector<contigread_t>::iterator J=CON_reads.begin();
    //  for(uint32 j=0; j<CON_reads.size(); j++, J++){
    //	CEBUG("J->id "<< J->id << "      J->offset " << J->offset << "\n");
    //  }
    //}

    // I always points to the column from+actcontigpos; start at the last
    //  column of the range
    cccontainer_t::iterator I= CON_counts.begin();
    BOUNDCHECK(from+checklen-1, 0, CON_counts.size());
    advance(I, from+checklen-1);
    for(int32 actcontigpos=checklen-1; actcontigpos >= 0; --actcontigpos, --I){
      bool todelete=false;
      CEBUG("dsoc acp: " << actcontigpos << endl);// << *I << endl;
      if(I->total_cov >= mincov
	 && (I->backbonechar == '*'
	     || I->backbonechar == '@')){
	if(I->star==I->total_cov){
	  CEBUG("Complete star row. Deleting star in consensus and in other reads.\n");
	  todelete=true;
	} else if(alsononly){
	  if(I->A == I->N
	     && I->C == I->N
	     && I->G == I->N
	     && I->T == I->N
	     && I->X == 0){
	    CEBUG("Complete N row. Deleting in consensus and in other reads.\n");
	    todelete=true;
	  }
	}
      }

      if(todelete){
	vector<contigread_t>::iterator J=CON_reads.begin();

	// Deleting stars / Ns in other reads
	for(; J != CON_reads.end(); J++){
	  // (this debug line used to reference an undeclared variable "i"
	  //  and broke compilation as soon as CEBUG was switched on)
	  CEBUG("i " << from+actcontigpos << "\tJ->id "<< J->id << "\tJ->offset " << J->offset << "\tJ->offset+J->read.getLenClippedSeq() " << J->offset+J->read.getLenClippedSeq());
	  // TODO: >, >=, <, <=, +1, -1?
	  if(from+actcontigpos >= J->offset&&
	     from+actcontigpos < J->offset+J->read.getLenClippedSeq()){
	    CEBUG("In range, deleting.");
	    // internaloffset is the offset of the * / N in this read
	    int32 internaloffset= (from-J->offset)+actcontigpos;
	    CEBUG("\tInternal offset: " << internaloffset);

	    if(J->direction > 0){
	      J->read.deleteBaseFromClippedSequence(internaloffset);
	    }else{
	      J->read.deleteBaseFromClippedComplementSequence(internaloffset);
	    }

	    //		vector<char>::iterator K=J->corrected.begin();
	    //		advance(K, internaloffset);
	    //		J->corrected.erase(K);
	  }
	  // Adjust start of reads beginning after the pos.
	  if(J->offset > from+actcontigpos) {
	    CEBUG("\tComes after. Down 1.");
	    J->offset-=1;
	  }
	  CEBUG(endl);
	}

	// push down consensus tags
	updateTagBaseDeleted(from+actcontigpos);

	if(updateconcounts){
	  // erase the place in CON_counts
	  CEBUG("counts_offset: " << (I-CON_counts.begin()) << endl);
	  CON_counts.erase(I);
	  // erase() invalidated I: re-seat it on the same index (now the
	  //  column which followed the erased one, or end()), the loop
	  //  header then steps back to the next column to check
	  I= CON_counts.begin();
	  BOUNDCHECK(from+1, 0, static_cast<int32>(CON_counts.size()));
	  advance(I, from+actcontigpos);
	}
      }
    }
  }

  // if CON_counts wasn't updated, this contig is pretty much invalid anyway
  if(updateconcounts) {paranoiaBUGSTAT(checkContig());}

  FUNCEND();

}




/*************************************************************************
 *
 *
 *
 *
 *************************************************************************/


// Pairs a base character with the number of times it was seen in the
// contig column currently being looked at. Used by the editing routines
// below as element type of small per-column tally vectors.
struct simplebasecounter_t {
  char base;       // the character counted (e.g. one of "ACGT*")
  uint32 counter;  // number of reads showing that character in the column
};


//#define CEBUG(bla)   {cout << bla; cout.flush();}

/*
 * Walks through the contig column by column and looks for "tricky"
 * columns: columns in which reads show a gap ('*') and at least one of
 * A/C/G/T. If the two most frequent symbols of such a column are a gap
 * and a base, edit454checkTrickies() decides whether the base is an
 * overcall in a homopolymer run and collects edit commands for the reads
 * concerned. After the scan, the collected commands are sorted and
 * executed.
 *
 * onlysetPSHPtags    if true, reads are not edited; the base runs in
 *                    question only get a "PSHP" tag.
 *                    if false, the weakest base of the run is deleted in
 *                    each read concerned and an "R454" tag is set.
 * noSRMreadsallowed  passed on to edit454checkTrickies().
 * readsmarkedsrm     flags, indexed by readpool id (orpid). Columns which
 *                    contain a read with its flag set are skipped. If the
 *                    vector does not have the size of the readpool, it is
 *                    resized and all flags are set to false.
 *
 * Returns the number of edit commands generated (and executed).
 *
 * Exceptions thrown while tagging are swallowed on purpose (best effort).
 * Writes progress and summary information to cout.
 */
uint32 Contig::editTrickyOvercalls(const bool onlysetPSHPtags, const bool noSRMreadsallowed, vector<bool> & readsmarkedsrm)
{
  FUNCSTART("uint32 Contig::editTrickyOvercalls(const bool onlysetPSHPtags, const bool noSRMreadsallowed, vector<bool> & readsmarkedsrm)");

  cout << "\nSearching for tricky 454 overcalls:\n";

  // if we were called with an incorrectly sized readsmarkedsrm vector
  //  (empty or whatever), assume no SRM freshly set
  if(readsmarkedsrm.size() != CON_readpool->size()){
    readsmarkedsrm.clear();
    readsmarkedsrm.resize(CON_readpool->size(),false);
  }

  // reserve enough space for one edit every 10th base in the consensus
  vector<edit454command_t> all454editcommands;
  all454editcommands.reserve(CON_counts.size()/10);

  // initialise the iterator for getting through the contig
  rcci_t rcci;
  {
    vector<int32> allowedstrainids; // empty would be all ids
    vector<uint8> allowedreadtypes; // left empty, too (no restriction set)
    //allowedreadtypes.push_back(Read::SEQTYPE_454GS20);
    readColContigIteratorInit(rcci, 
			      allowedstrainids, 
			      allowedreadtypes, 
			      false,            // don't take rails
			      false,           // nor backbones
			      false);   // nor reads without readpool-reads
  }

  // ccI (column counters) and rcci (reads in column) are advanced together
  //  with actcontigpos in the loop header below
  cccontainer_t::iterator ccI=CON_counts.begin();
  
  ProgressIndicator<int32> P(0, static_cast<int32>(CON_counts.size())-1);


  // per column tally of A, C, G, T and gap as seen in the reads.
  // Index 4 is the gap, this is relied upon further down.
  vector<simplebasecounter_t> basecounter(5);
  {
    string bases="ACGT*";
    for(uint32 i=0;i<basecounter.size(); i++){
      basecounter[i].base=bases[i];
      basecounter[i].counter=0;
    }
  }


  // number of columns to skip after an edit was decided upon (so that the
  //  same base run is not handled again a few columns later)
  uint32 editlock=0;

  for(uint32 actcontigpos=0; actcontigpos<CON_counts.size() ;actcontigpos++, ccI++, readColContigIteratorAdvance(rcci)){

    P.progress(actcontigpos);

    if(editlock>0) {
      editlock--;
      continue;
    }

    {
      // have a look at all reads at this position: if one of them was 
      //  marked in this pass with a SRM, we don't do the 454 edit here
      //  (because some "corrections" might be due to misassemblies and 
      //  would be plain wrong)
      vector<int32>::const_iterator I=rcci.read_ids_in_col.begin();
      bool readfreshlymarkedsrm=false;
      for(;I!=rcci.read_ids_in_col.end();I++){
	if(readsmarkedsrm[CON_reads[*I].orpid]){
	  readfreshlymarkedsrm=true;
	  break;
	}
      }
      if(readfreshlymarkedsrm) {
	continue;
      }
    }

    // check for disagreement in this column
    uint32 setcount=0;

    if(ccI->A > 0) setcount++;
    if(ccI->C > 0) setcount++;
    if(ccI->G > 0) setcount++;
    if(ccI->T > 0) setcount++;
    if(ccI->star > 0) setcount++;

    // if more than 1, then there are bases in this column which disagree
    //  (caution: this might be also just an "N", or some other IUPACs)
    // tricky == those with * in column
    if(setcount>1 && ccI->star > 0) {
      CEBUG("cpos: " << actcontigpos << "\t" << ccI->A << " " << ccI->C << " " << ccI->G << " " << ccI->T << " " << ccI->star << endl);

      // count the symbols of the reads in this column (in contig
      //  direction, upper case); anything but A, C, G, T, * is ignored
      for(uint32 i=0;i<basecounter.size(); i++){
	basecounter[i].counter=0;
      }
      for(uint32 readnr=0; readnr<rcci.read_ids_in_col.size(); readnr++){
	char           base;
	int32          actreadid=rcci.read_ids_in_col[readnr];
	contigread_t & ric =CON_reads[actreadid];
	
	int32 readpos=contigPosToUnclippedReadPos(actcontigpos, ric);
	
	CEBUG("read: " << ric.read.getName() << '\n');
	
	//int32 realreadpos;
	if(ric.direction>0){
	  base=static_cast<char>(toupper(ric.read.getBaseInSequence(readpos)));
	  //realreadpos=readpos;
	}else{
	  base=static_cast<char>(toupper(ric.read.getBaseInComplementSequence(readpos)));
	  //realreadpos=ric.read.calcComplPos(readpos);
	}
	
	for(uint32 i=0;i<basecounter.size(); i++){
	  if(basecounter[i].base==base) basecounter[i].counter++;
	}
      }


      {
	// Search the most frequent symbol (maxsize, maxsize_i) and the
	//  second most frequent one (runnerup, runnerup_i).
	// Both comparisons use >=, so that on equal counts the entry coming
	//  later in the "ACGT*" order takes precedence.
	size_t maxsize=0;
	int32 maxsize_i=-1;
	size_t runnerup=0;
	int32 runnerup_i=-1;
	
	for(uint32 i=0; i<basecounter.size(); i++){
	  CEBUG("ACGT*"[i] << "\t" << basecounter[i].counter << " ");
	  if(basecounter[i].counter>=maxsize){
	    runnerup=maxsize;
	    runnerup_i=maxsize_i;
	    maxsize=basecounter[i].counter;
	    maxsize_i=i;
	  }else if(basecounter[i].counter>=runnerup){
	    runnerup=basecounter[i].counter;
	    runnerup_i=i;
	  }
	}

	CEBUG("\nmaxsize: " << maxsize << "\ti: " << maxsize_i << "\n");
	CEBUG("runnerup: " << runnerup << "\ti: " << runnerup_i << "\n");
	

	// go on only if the two top symbols are a gap and a base
	// index==4 is gap
	if((maxsize_i==4 && runnerup_i>=0)
	   || (maxsize_i>=0 && runnerup_i==4)){
	  if(maxsize+runnerup == 0){
	    // none of the reads showed A, C, G, T or *: nothing to do
	    //addTagToConsensus(actcontigpos, actcontigpos,'=',"T454","DOH?");
	  }else{

	    // runner-up has 40% or more of the two top symbols: set a
	    //  consensus tag which documents the counts
	    if(100*runnerup/(maxsize+runnerup) >= 40){
	      ostringstream ostr;
	      ostr << static_cast<char>(basecounter[maxsize_i].base) << ": " << maxsize;
	      ostr << " " << static_cast<char>(basecounter[runnerup_i].base) << ": " << runnerup;
	      ostr << "  -  " << 100*runnerup/(maxsize+runnerup) << "%";
	      addTagToConsensus(actcontigpos, 
				actcontigpos,
				'=',
				multitag_t::getIdentifierStr(CON_tagentry_idDGPc).c_str(),
				ostr.str().c_str(),
				true);

	      CEBUG(ostr.str() << '\n');
	    }

	    // the hypothesis handed to edit454checkTrickies() is always the
	    //  base (never the gap) of the two top symbols
	    if(basecounter[maxsize_i].base=='*'){
	      editlock=edit454checkTrickies(basecounter[runnerup_i].base,
					    actcontigpos,
					    rcci.read_ids_in_col,
					    all454editcommands,
					    onlysetPSHPtags,
					    noSRMreadsallowed);
	    }else{
	      editlock=edit454checkTrickies(basecounter[maxsize_i].base,
					    actcontigpos,
					    rcci.read_ids_in_col,
					    all454editcommands,
					    onlysetPSHPtags,
					    noSRMreadsallowed);
	    }
	    // an edit was decided upon: skip the columns of the run
	    //  (value returned) plus 2 more
	    if(editlock>0) editlock+=2;
	  }
	}
      }
    }
  }
  P.finishAtOnce();

  // second phase: execute the commands collected during the scan
  CEBUG("Generated " << all454editcommands.size() << " 454 edit commands.\n");
  if(all454editcommands.size()) {
    if(onlysetPSHPtags){
      cout << "Marking tricky 454 runs in " << all454editcommands.size() << " cases.\n\n";
    }else{
      cout << "Performing 454 edits in " << all454editcommands.size() << " cases.\n\n";
    }
    // NOTE(review): the order is defined by edit454command_t_comparator,
    //  which is not visible here. Presumably it makes sure that deleting
    //  a base does not shift the position of commands still to be
    //  executed on the same read -- confirm.
    sort(all454editcommands.begin(),
	 all454editcommands.end(), Contig::edit454command_t_comparator);

    for(uint32 i=0; i<all454editcommands.size(); i++){
      contigread_t & ric =CON_reads[all454editcommands[i].conreadid];
    
      if(onlysetPSHPtags){
	CEBUG("Mark read: " << ric.read.getName());
      }else{
	CEBUG("EDIT read: " << ric.read.getName());
      }
      CEBUG("\tbase: " << all454editcommands[i].base);
      CEBUG("\tpos: " << all454editcommands[i].readpos << endl);

      if(onlysetPSHPtags){
	// only tag the run, extended by one position on each side where
	//  this is possible
	uint32 zeroqualcounts=0;
	uint32 runfrom=0;
	uint32 runto=0;
	try{
	  uint32 runlength=getBaseRunLength(ric.read,
					    all454editcommands[i].readpos,
					    all454editcommands[i].base,
					    runfrom,
					    runto,
					    zeroqualcounts,
					    true);
	  if(runlength){
	    if(runfrom>0) runfrom--;
	    if(runto<ric.read.getLenSeq()-1) runto++;
	    ric.read.addTag(runfrom,runto, "PSHP", "");
	  }
	}
	catch(...){
	  // best effort: a read which cannot be tagged is left as is
	}
      }else{
	ric.read.deleteWeakestBaseInRun(all454editcommands[i].base,
					all454editcommands[i].readpos,
					true);
	
	// just in case addTag decided to throw ... 
	// too lazy to do it right at the moment
	// (e.g. readpos-1 / readpos+1 may be outside of the read)
	try{
	  ric.read.addTag(all454editcommands[i].readpos-1,
			  all454editcommands[i].readpos+1,
			  "R454","");
	}
	catch(...){
	}
      }

    }

  }

  cout << "\n";


  FUNCEND();

  return static_cast<uint32>(all454editcommands.size());
}



// One bin of the run length histogram built by edit454checkTrickies():
// bin i collects the reads of a column whose base run has length i.
// The three vectors are filled in parallel, one entry per read.
struct trickyrunshistogram_t {
  vector<uint32> seen_ids;            // ids of the reads (index into CON_reads)
  vector<uint32> realreadpos_of_ids;  // position looked at, in forward direction of the read
  vector<char> realbasehypo_of_ids;   // base hypothesis, in forward direction of the read
  uint32 count_zeroqualadjusted;      // reads expected to have this run length after
                                      //  removal of one zero quality base
  uint32 count_havezeroqual;          // reads in this bin with at least one zero
                                      //  quality base in the run
};



/*
 * Checks for one contig column whether "basehypo" looks like an overcall
 * in a homopolymer run and, if so, appends edit commands for the reads
 * concerned to "editcommands". Nothing is edited here, the caller executes
 * the commands later.
 *
 * How it works: for every read in the column (backbones and rails are
 * skipped) the length of the run of basehypo at that position is
 * determined and entered into a histogram indexed by run length. The most
 * probable run length is the histogram bin with the highest number of
 * reads after "zero quality adjustment" (reads having a zero quality base
 * in the run are counted one bin lower, as that base would be removed).
 * An edit is triggered if
 *   - reads with longer runs exist and the reads at the probable length
 *     outnumber them at least 10:1 ("Smoking gun"), or
 *   - no longer runs exist, the probable length is 1, all non-gap bases
 *     in the column equal basehypo and the column has at least 10 times
 *     as many reads as there are reads in that bin ("Lone ranger"; the
 *     probable length is then set to 0).
 * A histogram reaching more than one bin above the probable length is
 * regarded as too strange to edit ("Funny histogram").
 * In any case a "H454" tag describing the decision is added to the
 * consensus at actcontigpos.
 *
 * basehypo           base (in contig direction) suspected to be overcalled
 * actcontigpos       contig column looked at
 * read_ids_in_col    ids (index into CON_reads) of the reads in the column
 * editcommands       output; one command is appended for each read with a
 *                    run longer than the probable length (for all analysed
 *                    reads if onlysetPSHPtags is true)
 * onlysetPSHPtags    see editcommands
 * noSRMreadsallowed  if true and any read of the column carries an SRM
 *                    tag, nothing is done and 0 is returned
 *
 * Returns 0 if no edit was decided upon; otherwise the largest span
 * (runto - runfrom as delivered by getBaseRunLength()) of the runs
 * analysed in this column.
 */
uint32 Contig::edit454checkTrickies(const char basehypo, const uint32 actcontigpos, const vector<int32> & read_ids_in_col,vector<edit454command_t> & editcommands, const bool onlysetPSHPtags, const bool noSRMreadsallowed)
{
  FUNCSTART("uint32 Contig::edit454checkTrickies(const char basehypo, const uint32 actcontigpos, const vector<int32> & read_ids_in_col,vector<edit454command_t> & editcommands, const bool onlysetPSHPtags, const bool noSRMreadsallowed)");

  CEBUG("Checking tricky hypothesis: " << basehypo << " " << actcontigpos << endl);

  if(noSRMreadsallowed){
    for(uint32 readnr=0; readnr<read_ids_in_col.size(); readnr++){
      if(CON_reads[read_ids_in_col[readnr]].read.hasTag(Read::REA_tagentry_idSRMr)){
	CEBUG("Read with SRM found, but this is presently not allowed at this stage.\n");
	FUNCEND();
	return 0;
      }
    }
  }


  // histogram of run lengths: trh[i] collects the reads with run length i
  vector<trickyrunshistogram_t> trh;
  trh.reserve(100);

  // After stepping through all reads, the following variable shows
  //  whether additional bases in this column are all equal to "basehypo"
  bool cleanbasehypocolumn=true;

  // largest span of a run seen, returned if an edit is made
  uint32 maxspan=0;

  for(uint32 readnr=0; readnr<read_ids_in_col.size(); readnr++){
    int32          actreadid=read_ids_in_col[readnr];
    contigread_t & ric =CON_reads[actreadid];

    // do not analyse backbones and rails
    if(ric.read.isBackbone()
       || ric.read.isRail()) continue;
    // or reads that are not from 454
    //|| !ric.read.isSequencingType(Read::SEQTYPE_454GS20)) continue;

    int32 readpos=contigPosToUnclippedReadPos(actcontigpos, ric);

    // (an unconditional cout.flush() used to follow this debug statement;
    //  it was executed for every read even with debugging switched off)
    CEBUG("read: " << ric.read.getName());

    // to simplify life, were going to have this routine
    //  only work with the forward direction of reads
    // for reads in reverse direction in contig, calculate the "real" values
    int32 realreadpos=readpos;
    char realbasehypo=basehypo;
    if(ric.direction<0){
      realbasehypo=dptools::getComplementBase(realbasehypo);
      realreadpos=ric.read.calcComplPos(readpos);
    }

    CEBUG(" " << ric.read.getBaseInSequence(realreadpos) << endl);

    // a base which is neither gap nor the hypothesis makes the column
    //  "unclean"
    if(ric.read.getBaseInSequence(realreadpos) != '*'
       && ric.read.getBaseInSequence(realreadpos) != realbasehypo){
      cleanbasehypocolumn=false;
    }

    uint32 zeroqualcounts=0;
    uint32 runfrom=0;
    uint32 runto=0;
    CEBUG("search runlength" << endl);
    uint32 runlength=getBaseRunLength(ric.read,
				      realreadpos,
				      realbasehypo,
				      runfrom,
				      runto,
				      zeroqualcounts,
				      true);
    CEBUG("RL " << ric.read.getName() << ": " << runlength << " " << zeroqualcounts << endl);

    uint32 span=runto-runfrom;
    if(span>maxspan) maxspan=span;

    // make sure the histogram has a bin for this run length; the counters
    //  of new bins must be zeroed by hand as the struct has no constructor
    if(runlength>=trh.size()){
      CEBUG("extend histogramm" << endl);
      size_t initstart=trh.size();
      trh.resize(runlength+1);
      for(;initstart<trh.size(); initstart++){
	trh[initstart].count_zeroqualadjusted=0;
	trh[initstart].count_havezeroqual=0;
      }
    }
    CEBUG("rl in histogramm" << endl);
    trh[runlength].seen_ids.push_back(actreadid);
    trh[runlength].realreadpos_of_ids.push_back(realreadpos);
    trh[runlength].realbasehypo_of_ids.push_back(realbasehypo);
    if(zeroqualcounts>0) trh[runlength].count_havezeroqual++;

    // we will delete only one zeroqual base
    if(zeroqualcounts>0 && runlength>0){
      trh[runlength-1].count_zeroqualadjusted++;
    } else {
      trh[runlength].count_zeroqualadjusted++;
    }
    CEBUG("done" << endl);
  }

  // No read was analysed (empty column, or only backbones / rails in it):
  //  there is no histogram bin which could be looked at, and indexing
  //  trh[0] further down would access an empty vector. Nothing to edit.
  if(trh.empty()){
    CEBUG("No read analysed, no edit.\n");
    FUNCEND();
    return 0;
  }

  CEBUG("Cleanbasehypocolumn: " << cleanbasehypocolumn);
  CEBUG("\nsearch zqa\n");

  // search for maximum count_zeroqualadjusted in trh
  //  (including the zero-length bin: the scan starts at index 0)
  uint32 maxcountzqa=0;
  uint32 maxcountzqa_index=0;
  for(uint32 i=0; i<trh.size(); i++){
    CEBUG(i << "\t" << trh[i].seen_ids.size() << "\t" << trh[i].count_zeroqualadjusted<< "\t" << trh[i].count_havezeroqual << endl);
    // changing > to >= favours longer runs with the exact same prediction
    //  of adjusted reads
    if(trh[i].count_zeroqualadjusted>=maxcountzqa){
      maxcountzqa=trh[i].count_zeroqualadjusted;
      maxcountzqa_index=i;
    }
  }

  CEBUG("search higher" << endl);

  // statistics on the probable bin and on all bins above it
  uint32 runsatmczqa=static_cast<uint32>(trh[maxcountzqa_index].seen_ids.size());
  uint32 runslargermczqa=0;
  uint32 runswillingtoshorten=0;
  uint32 expectedlarger=0;
  for(uint32 i=maxcountzqa_index+1; i<trh.size(); i++){
    runslargermczqa+=static_cast<uint32>(trh[i].seen_ids.size());
    expectedlarger+=trh[i].count_zeroqualadjusted;
    runswillingtoshorten+=trh[i].count_havezeroqual;
  }

  bool edit=false;
  ostringstream ostr;   // text of the H454 consensus tag


  // General routines destined to get out most annoying things
  // start them if edit not triggered previously
  // (currently there are no earlier rules, so this is always entered)
  if(!edit){
    CEBUG("General rules.\n");
    CEBUG("trh.size(): " << trh.size());
    CEBUG("\nmaxcountzqa_index: " << maxcountzqa_index);
    CEBUG("\ncleanbasehypocolumn: " << cleanbasehypocolumn);
    CEBUG("\nread_ids_in_col.size(): " << read_ids_in_col.size());
    CEBUG("\nrunsatmczqa: " << runsatmczqa);
    CEBUG('\n');

    if(trh.size()>maxcountzqa_index+2) {
      CEBUG("Funny looking histogram, no edit!\n");
      ostr << "Funny histogram ";
    }else{
      if(runslargermczqa>0){
	// we have some kind of overcalls
	if(runsatmczqa >= runslargermczqa*10){
	  /* if number of runs of probable length > 10x of the higher runs,
	     edit. E.g.
	     .....aAAA.....
	     .....*AAA.....
	     .....*AAA.....
	     .....*AAA.....
	     .....*AAA.....
	     etc.
	  */
	  edit=true;
	  ostr << "Smoking gun  ";
	}
      } else if(maxcountzqa_index==1 && cleanbasehypocolumn
		&& read_ids_in_col.size() >= runsatmczqa*10){
	/* if it's a some unmotivated bases hanging around, being all
	   the same and less than 10% of the column. E.g.

	   Edit             Edit               No edit
	   ....TaGGG.....   ....TaGGG..... 	....TaGGG.....
	   ....T*GGG.....   ....TaGGG..... 	....TcGGG.....
	   ....T*GGG.....   ....T*GGG..... 	....T*GGG.....
	   ....T*GGG.....   ....T*GGG..... 	....T*GGG.....
	   ....T*GGG.....   ....T*GGG..... 	....T*GGG.....
	   etc.             etc.               etc.
	*/
	edit=true;
	ostr << "Lone ranger  ";
	// the lone bases are to be removed: probable length is 0
	maxcountzqa_index=0;
      }
    }
  }
  if(edit) {
    CEBUG("Editing!\n");
    ostr << "Edit: " << basehypo;
    if(trh.size()>maxcountzqa_index+2) {
      /* ??? change the strategy slightly to: if there are also edits at
	 maxcountzqa_index+2, then handle those first */
      ostr << " Caution!";
      CEBUG("CAUTION!\n");
      maxcountzqa_index++;
    }
  }else{
    ostr << "No edit.";
  }
  ostr << " PL: " << maxcountzqa_index;      // probable length
  ostr << " RS: " << runsatmczqa;            // reads that are this length
  ostr << " RL: " << runslargermczqa;        // reads that are larger
  ostr << " RWTS: " << runswillingtoshorten; // reads willing to shorten
  ostr << " EL: " << expectedlarger;         // reads expected to be larger
                                             //  after editing

  addTagToConsensus(actcontigpos, actcontigpos,'=',"H454",ostr.str().c_str(),true);

  CEBUG(ostr.str() << '\n');

  if(edit){
    // current strategy:
    // now that we know that we need to shorten a few reads at this
    //  position, shorten *all* reads larger than the length we determined
    //  (maxcountzqa_index) by one base (the weakest base). However, the
    //  base quality of the weakest base now does not need to be 0.

    uint32 acttrh=maxcountzqa_index+1;

    // addition: if only marking PSHP runs, mark all reads at that position
    if(onlysetPSHPtags){
      acttrh=0;
    }

    for(; acttrh<trh.size(); acttrh++){
      for(uint32 readnr=0; readnr<trh[acttrh].seen_ids.size(); readnr++){
	CEBUG("Saving EDIT: read: " << CON_reads[trh[acttrh].seen_ids[readnr]].read.getName() << " " << CON_reads[trh[acttrh].seen_ids[readnr]].read.getBaseInSequence(trh[acttrh].realreadpos_of_ids[readnr]) << endl; cout.flush());
	//ric.read.deleteWeakestBaseInRun(realbasehypo, realreadpos);

	editcommands.resize(editcommands.size()+1);
	editcommands.back().conreadid=trh[acttrh].seen_ids[readnr]; //actreadid;
	editcommands.back().base=trh[acttrh].realbasehypo_of_ids[readnr]; //realbasehypo;
	editcommands.back().readpos=trh[acttrh].realreadpos_of_ids[readnr]; //realreadpos
      }
    }
  }

  FUNCEND();
  if(edit) return maxspan;
  return 0;
}















/*************************************************************************
 *
 *
 *
 *
 *************************************************************************/


//#define CEBUG(bla)   {cout << bla; cout.flush();}

/*
 * Searches the contig for columns in which exactly one read disagrees with
 * an otherwise unambiguous majority and changes the base of that read to
 * the majority base, provided the read has no HAF2 ... HAF7 tag at that
 * position.
 *
 * Conditions for a column to be edited:
 *   - the contig contains at least one read (not backbone, not rail, with
 *     readpool id) carrying a HAF2 ... HAF7 tag at all; otherwise the
 *     function returns immediately
 *   - coverage (total_cov) of the column is at least 5
 *   - none of the reads in the column has its flag set in readsmarkedsrm
 *   - counting A, C, G, T, * and N in the reads of the column, exactly one
 *     symbol has the highest count and exactly one symbol has a count of 1
 *
 * The changed base is written in lower case (complemented for reads in
 * reverse direction) with 0 passed as second argument of
 * changeBaseInSequence() (presumably the quality -- confirm). An "ESDN"
 * tag is added to the consensus at each edited column.
 *
 * readsmarkedsrm  flags, indexed by readpool id (orpid). If the vector does
 *                 not have the size of the readpool, it is resized and all
 *                 flags are set to false.
 *
 * Returns the number of edits made. If there was at least one, the column
 * counters are rebuilt via rebuildConCounts() before returning.
 * Writes a header line and progress information to cout.
 */
uint32 Contig::editSingleDiscrepancyNoHAFTag(vector<bool> & readsmarkedsrm)
{
  FUNCSTART("uint32 Contig::editSingleDiscrepancyNoHAFTag(vector<bool> & readsmarkedsrm)");

  cout << "\nSearching for single discr. without HAF tags:\n";

  uint32 numedits=0;

  // see whether we have some HAF tags at all (if not -> no edit!)
  {
    bool foundhaf=false;
    vector<contigread_t>::const_iterator crI=CON_reads.begin();
    for(; crI != CON_reads.end() && !foundhaf; ++crI){
      if(crI->orpid<0
	 || crI->read.isBackbone()
	 || crI->read.isRail()) continue;

      foundhaf=crI->read.hasTag(Read::REA_tagentry_idHAF3);
      if(!foundhaf) foundhaf=crI->read.hasTag(Read::REA_tagentry_idHAF5);
      if(!foundhaf) foundhaf=crI->read.hasTag(Read::REA_tagentry_idHAF4);
      if(!foundhaf) foundhaf=crI->read.hasTag(Read::REA_tagentry_idHAF6);
      if(!foundhaf) foundhaf=crI->read.hasTag(Read::REA_tagentry_idHAF7);
      if(!foundhaf) foundhaf=crI->read.hasTag(Read::REA_tagentry_idHAF2);
    }
    if(!foundhaf){
      FUNCEND();
      return 0;
    }
  }

  // if we were called with an incorrectly sized readsmarkedsrm vector
  //  (empty or whatever), assume no SRM freshly set
  if(readsmarkedsrm.size() != CON_readpool->size()){
    readsmarkedsrm.clear();
    readsmarkedsrm.resize(CON_readpool->size(),false);
  }

  // initialise the iterator for getting through the contig
  rcci_t rcci;
  {
    vector<int32> allowedstrainids; // empty would be all ids
    vector<uint8> allowedreadtypes; // left empty, too (no restriction set)
    //allowedreadtypes.push_back(Read::SEQTYPE_454GS20);
    readColContigIteratorInit(rcci, 
			      allowedstrainids, 
			      allowedreadtypes, 
			      false,            // don't take rails
			      false,           // nor backbones
			      false);   // nor reads without readpool-reads
  }

  // ccI (column counters) and rcci (reads in column) are advanced together
  //  with actcontigpos in the loop header below
  cccontainer_t::iterator ccI=CON_counts.begin();
  
  ProgressIndicator<int32> P(0, static_cast<int32>(CON_counts.size())-1);


  // per column tally of A, C, G, T, gap and N as seen in the reads
  vector<simplebasecounter_t> basecounter(6);
  {
    static const string bases="ACGT*N";
    for(uint32 i=0;i<basecounter.size(); i++){
      basecounter[i].base=bases[i];
      basecounter[i].counter=0;
    }
  }


  for(uint32 actcontigpos=0; actcontigpos<CON_counts.size() ;actcontigpos++, ccI++, readColContigIteratorAdvance(rcci)){

    P.progress(actcontigpos);

    // must be at least at coverage 5
    if(ccI->total_cov<5) continue;

    {
      // have a look at all reads at this position: if one of them was 
      //  marked in this pass with a SRM, we don't do the edit here
      //  (because some "corrections" might be due to misassemblies and 
      //  would be plain wrong)
      vector<int32>::const_iterator I=rcci.read_ids_in_col.begin();
      bool readfreshlymarkedsrm=false;
      for(;I!=rcci.read_ids_in_col.end();I++){
	if(readsmarkedsrm[CON_reads[*I].orpid]){
	  readfreshlymarkedsrm=true;
	  break;
	}
      }
      if(readfreshlymarkedsrm) {
	continue;
      }
    }

    // check for disagreement in this column
    uint32 setcount=0;

    if(ccI->A > 0) setcount++;
    if(ccI->C > 0) setcount++;
    if(ccI->G > 0) setcount++;
    if(ccI->T > 0) setcount++;
    if(ccI->N > 0) setcount++;
    if(ccI->star > 0) setcount++;

    // if more than 1, then there are bases in this column which disagree
    //  (caution: this might be also just an "N", or some other IUPACs)
    if(setcount>1) {
      CEBUG("cpos: " << actcontigpos << "\t" << ccI->A << " " << ccI->C << " " << ccI->G << " " << ccI->T << " " << ccI->N << " " << ccI->star << endl);

      // count the symbols of the reads in this column (in contig
      //  direction, upper case); anything but A, C, G, T, *, N is ignored
      for(uint32 i=0;i<basecounter.size(); i++){
	basecounter[i].counter=0;
      }
      for(uint32 readnr=0; readnr<rcci.read_ids_in_col.size(); readnr++){
	char           base;
	int32          actreadid=rcci.read_ids_in_col[readnr];
	contigread_t & ric =CON_reads[actreadid];
	
	int32 readpos=contigPosToUnclippedReadPos(actcontigpos, ric);
	
	CEBUG("read: " << ric.read.getName() << '\n');
	
	// NOTE(review): realreadpos is computed but not used in this
	//  counting loop (it is needed only in the editing loop below)
	int32 realreadpos;
	if(ric.direction>0){
	  base=static_cast<char>(toupper(ric.read.getBaseInSequence(readpos)));
	  realreadpos=readpos;
	}else{
	  base=static_cast<char>(toupper(ric.read.getBaseInComplementSequence(readpos)));
	  realreadpos=ric.read.calcComplPos(readpos);
	}
	
	for(uint32 i=0;i<basecounter.size(); i++){
	  if(basecounter[i].base==base) basecounter[i].counter++;
	}
      }

      // highest count of any symbol in this column
      uint32 maxcount=0;
      for(uint32 i=0;i<basecounter.size(); i++){
	maxcount=max(maxcount,basecounter[i].counter);
	CEBUG("bc["<<i<<"]: " << basecounter[i].base << " " << basecounter[i].counter << endl);
      }

      // Both indices below must be unique: as soon as a second symbol with
      //  the maximum count or a second symbol with count 1 is found, the
      //  respective index is reset to -1 and the search is stopped (the
      //  column will not be edited then).
      // NOTE(review): if maxcount is 1 and only one symbol has that count,
      //  maxi and singlei end up being the same index and the single base
      //  would be "replaced" by itself -- confirm whether this can occur
      //  and whether it is wanted.
      // maxi = index of consensus base if >=0
      int32 maxi=-1;
      // singlei = index of single base if >=0
      int32 singlei=-1;
      for(uint32 i=0;i<basecounter.size(); i++){
	if(basecounter[i].counter==maxcount) {
	  if(maxi>=0){
	    maxi=-1;
	    break;
	  }
	  maxi=i;
	}
	if(basecounter[i].counter==1) {
	  if(singlei>=0){
	    singlei=-1;
	    break;
	  }
	  singlei=i;
	}
      }

      CEBUG("mc: " << maxcount << "\tsi: " << singlei << "\tmi: " << maxi << endl);

      if(maxi>=0 && singlei>=0){
	// replacement base is written in lower case
	char repbase=static_cast<char>(tolower(basecounter[maxi].base));
	
	// search the read which has the single base
	for(uint32 readnr=0; readnr<rcci.read_ids_in_col.size(); readnr++){
	  char           base;
	  int32          actreadid=rcci.read_ids_in_col[readnr];
	  contigread_t & ric =CON_reads[actreadid];
	  
	  int32 readpos=contigPosToUnclippedReadPos(actcontigpos, ric);
	  
	  CEBUG("read: " << ric.read.getName() << '\n');
	  
	  // base as seen in contig direction, realreadpos is the position
	  //  in the forward direction of the read
	  int32 realreadpos;
	  if(ric.direction>0){
	    base=static_cast<char>(toupper(ric.read.getBaseInSequence(readpos)));
	    realreadpos=readpos;
	  }else{
	    base=static_cast<char>(toupper(ric.read.getBaseInComplementSequence(readpos)));
	    realreadpos=ric.read.calcComplPos(readpos);
	  }

	  if(base==basecounter[singlei].base){
	    // a HAF2 ... HAF7 tag at this position protects the base
	    // not optimal regarding speed, but let's not care atm
	    bool foundtag=ric.read.hasTag(Read::REA_tagentry_idHAF3,realreadpos);
	    if(!foundtag) foundtag=ric.read.hasTag(Read::REA_tagentry_idHAF4,realreadpos);
	    if(!foundtag) foundtag=ric.read.hasTag(Read::REA_tagentry_idHAF5,realreadpos);
	    if(!foundtag) foundtag=ric.read.hasTag(Read::REA_tagentry_idHAF6,realreadpos);
	    if(!foundtag) foundtag=ric.read.hasTag(Read::REA_tagentry_idHAF7,realreadpos);

	    if(!foundtag) foundtag=ric.read.hasTag(Read::REA_tagentry_idHAF2,realreadpos);

	    if(!foundtag){
	      // the read stores its forward sequence, so reads in reverse
	      //  direction get the complement of the replacement base
	      if(ric.direction>0){
		ric.read.changeBaseInSequence(repbase,0,realreadpos);
	      }else{
		ric.read.changeBaseInSequence(dptools::getComplementIUPACBase(repbase),0,realreadpos);
	      }
	      // replacing with a gap? technically, it's a deletion followed by an insertion
	      // therefore, the adjustment must be ... errr ... adjusted
	      if(repbase=='*' && ric.read.usesAdjustments()){
		ric.read.changeAdjustment(realreadpos,-1);
	      }
	      addTagToConsensus(actcontigpos, actcontigpos,'=',"ESDN","",true);
	      ++numedits;
	      // only one read has the single base, no need to look further
	      break;
	    }
	  }
	}
      }
    }
  }

  P.finishAtOnce();

  // reads were changed: bring the column counters up to date
  if(numedits) rebuildConCounts();

  FUNCEND();
  return numedits;
}
