// Copyright (C)  2000 Intel Corporation.  All rights reserved.
//
// $Header: /usr/development/orp/orp/arch/ia32/ia32_o3_jit/code_emitter.cpp,v 1.8 2002/01/08 07:18:03 xhshi Exp $
//

#include "defines.h"
#include "ir.h"
#include "flow_graph.h"
#include "expression.h"
#include "x86_emitter.h"
#include "code_emitter.h"
#include "jit_intf.h"
#include "data_emitter.h"
#include "gc_eh_support.h"
#include "bit_vector_group.h"
#include "pldi.h"
#if defined(TRACE_O3) || defined(DUMP_JIT)
#include "dumpjit.h"
#endif // TRACE_O3
#ifdef STAT_INDIRECT_CALL
#include "jit_runtime_support.h"
#endif
#include "overridden.h"

#ifdef O3_VTune_Support
#ifndef STAT_INDIRECT_CALL
#include "jit_runtime_support.h"
#endif
#include "..\\ia32_o1_jit\\vtune.h"
#endif // VTune_Support

// Selects the in-edge based padding heuristic compiled into
// add_full_padding() and add_partial_padding() below (the alternative,
// LOOP_PADDING_1, keys off the node mark instead).
#define LOOP_PADDING_2

// JIT handle defined elsewhere; in this file it is passed to the
// method_allocate_*_block() and method_set_*_handler*() calls.
extern JIT_Handle O3_Jit_Handle;

#ifdef CAFFEINE_MARK
// When true, emit_block() prefixes every non-prolog block with 150 nops.
// code_emission() sets it from its do_block_padding argument.
bool block_padding = false;
#endif

//
// Look up the pooled register operand for the physical register assigned
// to opnd.  The operand must have a physical register.
//
R_Opnd *X86_Opnd_Pool::get_r_opnd(Operand *opnd) {
    X86_Reg_No preg = opnd->assigned_preg();
    assert(preg != n_reg);
    return r_opnd[preg];
}

//
// Fill in and return the x86 memory operand that addresses opnd.  The result
// is one of the pool's own operand objects (m_indx, m_base or m_opnd), which
// this routine overwrites on every call.  Only Array, Field, GCTrack and
// Static operands are supported; anything else trips the assert.
//
M_Opnd *X86_Opnd_Pool::get_m_opnd(Operand *opnd) {
    // array element: [base + index*shift + offset]
    if (opnd->kind == Operand::Array) {
        Array_Operand *array = (Array_Operand*)opnd;

        Operand *base_opnd = opnd->base();
        assert(base_opnd != NULL && base_opnd->assigned_preg() != n_reg);
        m_indx->base_reg = base_opnd->assigned_preg();

        Operand *index_opnd = opnd->index();
        assert(index_opnd != NULL && index_opnd->assigned_preg() != n_reg);
        m_indx->index_reg = index_opnd->assigned_preg();

        m_indx->shift_amount = array->shift();
        m_indx->disp.value = array->offset();
        return m_indx;
    }

    // object field: [base + offset]
    if (opnd->kind == Operand::Field) {
        Operand *base_opnd = opnd->base();
        assert(base_opnd != NULL && base_opnd->assigned_preg() != n_reg);
        m_base->base_reg = base_opnd->assigned_preg();
        m_base->disp.value = ((Field_Operand*)opnd)->offset();
        return m_base;
    }

    // operand living in its home location: [frame base + slot offset]
    if (opnd->kind == Operand::GCTrack) {
        Reg_Operand *homed = (Reg_Operand*)opnd;
        unsigned home_location = homed->home_location();
        assert(opnd->assigned_preg() == n_reg && home_location != -1);
        m_base->base_reg = frame->base_reg;
        if (homed->use_arg_home_loc()) {
            // home location is a variable/argument slot
            m_base->disp.value = frame->var_offset(home_location);
        } else {
            // home location is a spill slot, indexed from the other end
            m_base->disp.value = frame->spill_offset(frame->n_spill - home_location - 1);
        }
        return m_base;
    }

    // static: absolute address, no base register
    if (opnd->kind == Operand::Static) {
        m_opnd->disp.value = (int)((Static_Operand*)opnd)->addr;
        return m_opnd;
    }

    assert(0);
    return NULL;
}


//
// Return a register operand when opnd has a physical register assigned,
// otherwise the memory operand that addresses it.
//
RM_Opnd *X86_Opnd_Pool::get_rm_opnd(Operand *opnd) {
    if (opnd->assigned_preg() == n_reg)
        return get_m_opnd(opnd);
    return get_r_opnd(opnd);
}

//
// Build the memory operand of an LEA that computes src1 + src2.
//   - both sources in registers:  [src1 + src2]
//   - src1 in a register:         [src1 + imm(src2)], with the immediate
//                                 negated when negate_src2 is set
//   - otherwise:                  [src2 + imm(src1)]; negate_src2 is not
//                                 consulted on this path
// A source without a register is treated as an Imm_Operand.
//
M_Opnd *X86_Opnd_Pool::get_m_opnd_lea(Operand *src1, Operand *src2, bool negate_src2) {
    const X86_Reg_No reg1 = src1->assigned_preg();
    const X86_Reg_No reg2 = src2->assigned_preg();

    if (reg1 != n_reg && reg2 != n_reg) {
        // register + register: use the indexed form with no scaling
        m_indx->base_reg = reg1;
        m_indx->index_reg = reg2;
        m_indx->shift_amount = 0;
        m_indx->disp.value = 0;
        return m_indx;
    }

    if (reg1 == n_reg) {
        // immediate + register
        m_base->base_reg = reg2;
        m_base->disp.value = ((Imm_Operand *)src1)->imm();
        return m_base;
    }

    // register + immediate (or register - immediate)
    m_base->base_reg = reg1;
    m_base->disp.value = ((Imm_Operand *)src2)->imm();
    if (negate_src2)
        m_base->disp.value = -m_base->disp.value;
    return m_base;
}

//
// generate push/pop src
//
// Create a "push src" instruction, passing i to the Push_Inst constructor as
// its list position.  The instruction is allocated from exprs.mem.
static Inst *create_push_inst(Expressions& exprs, Operand_Exp *src,Inst *i)
{
    Inst_Exp *push_exp = exprs.lookup_inst_exp(Exp::Push, src, NULL, JIT_TYPE_INT);
    Inst *push_inst = new (exprs.mem) Push_Inst(src->opnd, push_exp, i);
    return push_inst;
}
// Create a "pop dst" instruction, passing i to the Pop_Inst constructor as
// its list position.  The instruction is allocated from exprs.mem.
static Inst *create_pop_inst(Expressions& exprs, Operand_Exp *dst,Inst *i)
{
    Inst_Exp *pop_exp = exprs.lookup_inst_exp(Exp::Pop, NULL, NULL, JIT_TYPE_INT);
    Inst *pop_inst = new (exprs.mem) Pop_Inst(dst->opnd, pop_exp, i);
    return pop_inst;
}

//
//	push	ebp			-- EBP frame only
//	mov		ebp,esp		-- EBP frame only
//	sub		esp,n_extra+n_spill words
//	push	ebx
//	push	ebp			-- save callee-save registers
//	push	esi
//	push	edi
//
// Returns the first instruction of the push sequence, for use in the GC map
// data structure.  This part assumes an esp-based frame, but it shouldn't be
// hard to do the appropriate thing for ebp-based frames.  However, it should
// only be done after the corresponding change has been made to the stack
// unwinding code.
//
// Parameters:
//   c_handle, m_handle -- not used by the visible body
//   exprs              -- expression table; its memory manager allocates the
//                         new instructions
//   frame              -- supplies base_reg, n_extra and n_spill
//   prolog             -- CFG node that receives the instructions
//   callee_saved_regs  -- bit mask tested against the callee_saved_*_mask bits
// Every instruction created here is handed the node's current first
// instruction as its position argument (presumably "insert before" -- that
// matches the order in the listing above, but confirm against the Inst
// constructors).
// Result is the first callee-save push created, or NULL if none was needed.
//
static Inst *insert_prolog(Class_Handle c_handle,
                           Method_Handle m_handle,
                           Expressions& exprs, Frame& frame, 
                           Cfg_Node *prolog, unsigned callee_saved_regs) {
    
	Mem_Manager& mem = exprs.mem;    

#ifdef O3_VTune_Support_CALLGRAPH
	// Wrap a VTune method-entry call in a save/restore of eax, ecx and edx.
	// Each instruction is positioned relative to the one created just before it.
	Inst *inst_head = prolog->IR_instruction_list()->next();
	Inst *pop_eax = create_pop_inst(exprs,exprs.lookup_reg_exp(eax_reg,JIT_TYPE_INT,0),prolog->IR_instruction_list()->next());
    Inst *pop_ecx = create_pop_inst(exprs,exprs.lookup_reg_exp(ecx_reg,JIT_TYPE_INT,0),pop_eax);
	Inst *pop_edx = create_pop_inst(exprs,exprs.lookup_reg_exp(edx_reg,JIT_TYPE_INT,0),pop_ecx);    
	
	Exp *ib_exp = exprs.lookup_inst_exp(Exp::VTuneMethodCall, inst_head->exp, NULL,JIT_TYPE_VOID);
	Inst* cll = new (mem) VTune_Call_Inst(ib_exp,pop_edx, true) ;
	
	Inst *push_edx = create_push_inst(exprs,exprs.lookup_reg_exp(edx_reg,JIT_TYPE_INT,0),cll);
	Inst *push_ecx = create_push_inst(exprs,exprs.lookup_reg_exp(ecx_reg,JIT_TYPE_INT,0),push_edx);
	Inst *push_eax = create_push_inst(exprs,exprs.lookup_reg_exp(eax_reg,JIT_TYPE_INT,0),push_ecx);	

//	Inst* cll = new (mem) VTune_Call_Inst(ib_exp,inst_head, true) ;




	cll->set_gc_unsafe() ;
#endif

	Inst *result = NULL;
    // Position argument shared by everything created below.
    Inst *first = prolog->IR_instruction_list()->next();

    Operand_Exp *esp = exprs.lookup_reg_exp(esp_reg,JIT_TYPE_INT,0);
    // push ebp if frame is ebp based
    if (frame.base_reg == ebp_reg) {
        Operand_Exp *ebp = exprs.lookup_reg_exp(ebp_reg,JIT_TYPE_INT,0);
        create_push_inst(exprs,ebp,first); // push ebp
        Inst_Exp *mv = exprs.lookup_inst_exp(Exp::Assign,ebp,esp,JIT_TYPE_INT);
        new (mem) Assign_Inst(ebp->opnd,esp->opnd,mv,first);
    }
    //
    // adjust frame
    //
    // Reserve (n_extra + n_spill) words; no instruction is created when that is zero.
    unsigned size = (frame.n_extra + frame.n_spill) * sizeof(int);
    if (size > 0) {
        Operand_Exp *imm = exprs.lookup_imm_exp(size,JIT_TYPE_INT);
        Inst_Exp *sub = exprs.lookup_inst_exp(Exp::Sub,esp,imm,JIT_TYPE_INT);
        Inst *i = new (mem) Sub_Inst(Sub_Inst::sub,esp->opnd,imm->opnd,sub,first);
        i->set_dst(esp->opnd);
    }

    //
    // push callee-save registers
    //
    // ebp cannot be both the frame base and a saved callee-save register.
    assert(!(callee_saved_regs & callee_saved_ebp_mask) || frame.base_reg != ebp_reg);
    // Creation order is ebx, ebp, esi, edi; `result` latches the first push created.
	if (callee_saved_regs & callee_saved_ebx_mask)  // push ebx
    {
        Inst *t = create_push_inst(exprs,exprs.lookup_reg_exp(ebx_reg,JIT_TYPE_INT,0),first);
        if (result == NULL) result = t;
    }
	if (callee_saved_regs & callee_saved_ebp_mask)  // push ebp
    {
        Inst *t = create_push_inst(exprs,exprs.lookup_reg_exp(ebp_reg,JIT_TYPE_INT,0),first);
        if (result == NULL) result = t;
    }
	if (callee_saved_regs & callee_saved_esi_mask)  // push esi
    {
        Inst *t = create_push_inst(exprs,exprs.lookup_reg_exp(esi_reg,JIT_TYPE_INT,0),first);
        if (result == NULL) result = t;
    }
	if (callee_saved_regs & callee_saved_edi_mask)  // push edi
    {
        Inst *t = create_push_inst(exprs,exprs.lookup_reg_exp(edi_reg,JIT_TYPE_INT,0),first);
        if (result == NULL) result = t;
    }

    // Checked only after the instructions have been built: in assert-enabled
    // builds an ebp-based frame fails here even though the code above handles it.
    assert(frame.base_reg == esp_reg);  // see comment above
    return result;
}

//  < ... call MonExit ...>         -- for synchronized method
//	pop edi
//	pop esi
//	pop ebp							-- ESP frame only
//	pop ebx
//	mov esp, ebp					-- EBP frame only
//	pop ebp							-- EBP frame only
//	add	esp, n_locals+n_spill_words	-- ESP frame only
//
// Returns the first instruction of the pop sequence, for use in the GC map
// data structure.  See the comments for insert_prolog().
//
// Parameters:
//   c_handle, m_handle -- not used by the visible body
//   exprs              -- expression table; its memory manager allocates the
//                         new instructions
//   frame              -- supplies base_reg, n_extra and n_spill
//   epilog             -- CFG node that receives the instructions; may be NULL
//   callee_saved_regs  -- bit mask tested against the callee_saved_*_mask bits
// Every instruction created here is handed the node's current last
// instruction as its position argument (presumably "insert before" -- confirm
// against the Inst constructors).
// Result is the first callee-save pop created; NULL if there is no epilog
// node or no callee-saved register to restore.
//
static Inst *insert_epilog(Class_Handle c_handle, 
                           Method_Handle m_handle,
                           Expressions& exprs, Frame& frame, 
                           Cfg_Node *epilog, unsigned callee_saved_regs) {
    // Nothing to do when the flow graph has no epilog node.
    if (epilog == NULL)
        return NULL;

    Mem_Manager& mem = exprs.mem;

#ifdef O3_VTune_Support_CALLGRAPH
	// Wrap a VTune method-exit call in a save/restore of eax, ecx and edx.
	// Each instruction is positioned relative to the one created just before it.
	Inst *inst_head = epilog->IR_instruction_list()->next();
//    Inst *t1 = create_push_inst(exprs,exprs.lookup_reg_exp(eax_reg,JIT_TYPE_INT,0),inst_head);
	Inst *pop_eax = create_pop_inst(exprs,exprs.lookup_reg_exp(eax_reg,JIT_TYPE_INT,0),epilog->IR_instruction_list()->prev());
    Inst *pop_ecx = create_pop_inst(exprs,exprs.lookup_reg_exp(ecx_reg,JIT_TYPE_INT,0),pop_eax);
	Inst *pop_edx = create_pop_inst(exprs,exprs.lookup_reg_exp(edx_reg,JIT_TYPE_INT,0),pop_ecx);    

	Exp *ib_exp = exprs.lookup_inst_exp(Exp::VTuneMethodCall, inst_head->exp, NULL,JIT_TYPE_VOID);
//	Inst* cll = new (mem) VTune_Call_Inst(ib_exp,inst_head, false) ;
	Inst* cll = new (mem) VTune_Call_Inst(ib_exp, pop_edx, false) ;
//    Inst *t2 = create_pop_inst(exprs,exprs.lookup_reg_exp(eax_reg,JIT_TYPE_INT,0),epilog->IR_instruction_list()->prev());

	Inst *push_edx = create_push_inst(exprs,exprs.lookup_reg_exp(edx_reg,JIT_TYPE_INT,0),cll);
	Inst *push_ecx = create_push_inst(exprs,exprs.lookup_reg_exp(ecx_reg,JIT_TYPE_INT,0),push_edx);
	Inst *push_eax = create_push_inst(exprs,exprs.lookup_reg_exp(eax_reg,JIT_TYPE_INT,0),push_ecx);	

	cll->set_gc_unsafe() ;
#endif

    Inst *result = NULL;
    // Position argument shared by everything created below.
    Inst *last = epilog->IR_instruction_list()->prev();

    Operand_Exp *esp = exprs.lookup_reg_exp(esp_reg,JIT_TYPE_INT,0);
    Operand_Exp *ebp = exprs.lookup_reg_exp(ebp_reg,JIT_TYPE_INT,0);
    //
    // pop callee-save registers
    //
    // ebp cannot be both the frame base and a saved callee-save register.
    assert(!(callee_saved_regs & callee_saved_ebp_mask) || frame.base_reg != ebp_reg);
    // Creation order is edi, esi, ebp, ebx -- the reverse of the prolog's
    // pushes; `result` latches the first pop created.
	if (callee_saved_regs & callee_saved_edi_mask)  // pop edi
    {
        Inst *t = create_pop_inst(exprs,exprs.lookup_reg_exp(edi_reg,JIT_TYPE_INT,0),last);
        if (result == NULL) result = t;
    }
	if (callee_saved_regs & callee_saved_esi_mask)  // pop esi
    {
        Inst *t = create_pop_inst(exprs,exprs.lookup_reg_exp(esi_reg,JIT_TYPE_INT,0),last);
        if (result == NULL) result = t;
    }
	if (callee_saved_regs & callee_saved_ebp_mask)  // pop ebp
    {
        Inst *t = create_pop_inst(exprs,ebp,last);
        if (result == NULL) result = t;
    }
	if (callee_saved_regs & callee_saved_ebx_mask)  // pop ebx
    {
        Inst *t = create_pop_inst(exprs,exprs.lookup_reg_exp(ebx_reg,JIT_TYPE_INT,0),last);
        if (result == NULL) result = t;
    }
    //
    // adjust frame 
    //
    if (frame.base_reg == ebp_reg) { // EBP frame
        Inst_Exp *mv = exprs.lookup_inst_exp(Exp::Assign,esp,ebp,JIT_TYPE_INT);
        new (mem) Assign_Inst(esp->opnd,ebp->opnd,mv,last);
        create_pop_inst(exprs,ebp,last); // pop ebp
    } else {  // ESP frame
        // Release the (n_extra + n_spill) words reserved by the prolog;
        // no instruction is created when that is zero.
        unsigned size = (frame.n_extra + frame.n_spill) * sizeof(int);
        if (size > 0) {
            Operand_Exp *imm = exprs.lookup_imm_exp(size,JIT_TYPE_INT);
            Inst_Exp *add = exprs.lookup_inst_exp(Exp::Add,esp,imm,JIT_TYPE_INT);
            Inst *i = new (mem) Add_Inst(Add_Inst::add,esp->opnd,imm->opnd,add,last);
            i->set_dst(esp->opnd);
        }
    }
    //
    // ret instruction
    //
    // No ret is created here.  The assert is checked only after the
    // instructions have been built: in assert-enabled builds an ebp-based
    // frame fails here even though the code above handles it.
    assert(frame.base_reg == esp_reg);  // see comment above
    return result;
}

//
// Closure that carries references to the code emitter and to the x86
// operand pool.
//
class Emitter_Closure : public Closure {
public:
    O3_Emitter&    emitter;
    X86_Opnd_Pool& x86_opnds;

    Emitter_Closure(O3_Emitter& em, X86_Opnd_Pool& xp)
        : emitter(em),
          x86_opnds(xp)
    {
    }
};

//
// Decide whether an explicit jump has to be appended to node so that control
// reaches its first out-edge.
//   - no out-edges, or the last instruction exits the method: no jump.
//   - node carries extra_info that is not a tableswitch: a jump is needed
//     unless the first out-edge (the default block) directly follows node in
//     the linearization.
//   - otherwise: a jump is needed unless the first out-edge is the node
//     emitted next.
//
static bool need_a_fallthrough_branch(Cfg_Node *node, Cfg_Node *next_node_emitted)
{
    if (node->out_edge_size() == 0)
        return false;

    // An empty list is recognised by prev() pointing back at the list head.
    Inst *insts = node->IR_instruction_list();
    Inst *tail = insts->prev();
    if (tail != insts && tail->exits_method())
        return false;

    if (node->extra_info != NULL && !node->extra_info->is_tableswitch())
        return node->linearization_node()->next() != node->out_edges(0)->linearization_node();

    return node->out_edges(0) != next_node_emitted;
}

//
// Should node be aligned with full padding?
// With LOOP_PADDING_1 the decision is the node mark 'N'.  With LOOP_PADDING_2
// (which overrides it when both are defined) the node qualifies when it has
// at least one incoming edge and none of its predecessors has been emitted
// yet, i.e. all of them still report a code offset of -1.
//
static bool add_full_padding(Cfg_Node *node)
{
    bool pad = false;
#ifdef LOOP_PADDING_1
    pad = (node->mark() == 'N');
#endif // LOOP_PADDING_1

#ifdef LOOP_PADDING_2
    // Count the leading run of predecessors that have not been emitted.
    Cfg_Int unemitted = 0;
    while (unemitted < node->in_edge_size() &&
           node->in_edges(unemitted)->code_offset() == -1)
    {
        unemitted++;
    }
    // The run has to be non-empty and cover every incoming edge.
    pad = (unemitted > 0 && unemitted >= node->in_edge_size());
#endif // LOOP_PADDING_2
    return pad;
}

//
// Should node be aligned with partial padding?
// With LOOP_PADDING_1 the decision is the node mark 'N'.  With LOOP_PADDING_2
// the node also qualifies as soon as one predecessor is found that has not
// been emitted yet (code offset -1) and is not cold.
//
static bool add_partial_padding(Cfg_Node *node)
{
    bool pad = false;
#ifdef LOOP_PADDING_1
    pad = (node->mark() == 'N');
#endif // LOOP_PADDING_1

#ifdef LOOP_PADDING_2
    Cfg_Int edge = 0;
    while (edge < node->in_edge_size())
    {
        const bool not_emitted = (node->in_edges(edge)->code_offset() == -1);
        if (not_emitted && !node->in_edges(edge)->is_cold())
        {
            pad = true;
            break;
        }
        edge++;
    }
#endif // LOOP_PADDING_2
    return pad;
}

#if 0
// Disabled (compiled out) helper.
// Scans backwards from inst towards head looking for an instruction that
// sets the zero flag for the operand inst tests (inst->src(0)):
//   - returns true on a zero-flag-setting, non-LEA instruction whose
//     destination is the tested operand;
//   - returns false as soon as some other instruction affects the flags, or
//     when head is reached;
//   - follows plain assignments, so a copy into the tested operand continues
//     the search with the copy's source.
static bool prev_inst_sets_flags(Inst *inst, Inst *head)
{
    Operand *tested = inst->src(0);
    Inst *prev = inst->prev();
    while (prev != head)
    {
        if (prev->sets_zero_flag() && !prev->must_use_lea() && prev->dst() == tested)
            return true;
        if (prev->affects_flags())
            return false;
        if (prev->is_assignment() && prev->dst() == tested)
            tested = prev->src(0);
        prev = prev->prev();
    }
    return false;
}
#endif // 0

//
// Emit native code for one basic block.
//
//   fg                  -- owning flow graph; consulted for first_push_inst,
//                          first_pop_inst and return_inst
//   node                -- block to emit
//   traversal_number    -- only used by the sanity assert on entry
//   mm                  -- allocates the synthetic fallthrough jump, if needed
//   xe, xp              -- code emitter and x86 operand pool
//   frame               -- method frame; a private ESP/EBP frame is built from it
//   is_prolog           -- true for the first block emitted
//   gcmap               -- receives start_bb/add_inst/end_bb notifications
//   next_node_emitted   -- node that will be emitted right after this one
//   should_start_new_bb -- in: whether to open a new GC-map basic block here;
//                          out: whether the following block has to open one
//
static void emit_block(Flow_Graph *fg, Cfg_Node *node, unsigned short traversal_number, 
                       Mem_Manager &mm, O3_Emitter& xe, X86_Opnd_Pool& xp,
                       Frame& frame, bool is_prolog, GC_Map *gcmap,
                       Cfg_Node *next_node_emitted, bool &should_start_new_bb)
{
    assert(node->latest_traversal <= traversal_number);
    //
    // create a frame for tracking esp (push/pop)
    //
    // Both flavours are constructed; fm points at the one matching frame.base_reg.
    Frame *fm;
    ESP_Frame esp_fm(frame.n_args,frame.n_vars,frame.n_extra,frame.n_spill,frame.n_callee);
    EBP_Frame ebp_fm(frame.n_args,frame.n_vars,frame.n_extra,frame.n_spill,frame.n_callee);
    if (frame.base_reg == esp_reg)
         fm = &esp_fm;
    else fm = &ebp_fm;
    //
    // if node is the prolog, then we set n_stack = -n_callee so that
    // subsequent pushes set n_stack back to zero.
    //
    int num_words_pushed;
    if (is_prolog) {
        fm->pop(frame.n_callee);
        xe.n_words_pushed = -(int)frame.n_callee;
        // The value handed to the GC map additionally discounts the
        // extra and spill words.
        num_words_pushed = xe.n_words_pushed - frame.n_extra - frame.n_spill;
    } else {
        xe.n_words_pushed = 0;
        num_words_pushed = 0;
    }
    // The operand pool resolves home locations against this per-block frame.
    xp.frame = fm;

    // Alignment padding: pad to the next 16-byte boundary, either
    // unconditionally (full) or only when fewer than 8 bytes are needed (partial).
    if (add_full_padding(node))
    {
        unsigned cur_offset = xe.get_offset();
        unsigned alignment = (cur_offset & 0xf);
        unsigned padding = (0x10 - alignment) & 0xf;
#if 0
        if (padding >= 8)
            padding = 0;
#endif // 0
#if 0
        if (true || padding > 0)
            cout << "Adding " << padding << " bytes of loop padding in"
            << class_get_name(fg->c_handle()) << "." << method_get_name(fg->m_handle()) << endl;
#endif // 0
        unsigned i;
        for (i=0; i<padding; i++)
            xe.emit_nop();
    }
    else if (add_partial_padding(node))
    {
        unsigned cur_offset = xe.get_offset();
        unsigned alignment = (cur_offset & 0xf);
        unsigned padding = (0x10 - alignment) & 0xf;
        if (padding >= 8)
            padding = 0;
#if 0
        if (true || padding > 0)
            cout << "Adding " << padding << " bytes of loop padding in"
            << class_get_name(fg->c_handle()) << "." << method_get_name(fg->m_handle()) << endl;
#endif // 0
        xe.emit_padding(padding);
    }

#ifdef CAFFEINE_MARK
    if (!is_prolog && block_padding)
    {   // cheating by adding nops
        unsigned i;
        for (i=0; i<150; i++)
            xe.emit_nop();
    }
#endif

    // Emit the existing code for the basic block.
    xe.curr_node = node;
    // The block's code offset is recorded after any padding has been emitted.
    node->set_code_offset(xe.get_offset());
    if (should_start_new_bb)
        gcmap->start_bb(node, xe.get_offset(), num_words_pushed, frame);
    bool need_fallthrough_branch = need_a_fallthrough_branch(node, next_node_emitted);
    Inst *head = node->IR_instruction_list();

    // Decide whether we're better off commuting the branch condition.
    // Applies when the block ends in a two-way branch whose taken target is
    // the node emitted next: inverting the condition and swapping the edges
    // makes the explicit fallthrough jump unnecessary.
    if (need_fallthrough_branch &&
        node->out_edge_size() == 2 &&
        head->next() != head &&
        head->prev()->is_branch() &&
        node->get_branch_target() == next_node_emitted &&
        ((Branch_Inst*)head->prev())->can_commute())
    {
        ((Branch_Inst*)head->prev())->commute_condition();
        node->swap_edges();
        need_fallthrough_branch = false;
    }

#ifdef PLDI
    // See whether the start of the BB would be a traditional GC-safe
    // point; i.e., whether it is the target of a backward branch.
    bool target_of_backward_branch = false;
    Cfg_Int tmp_edge;
    for (tmp_edge=0; tmp_edge<node->in_edge_size(); tmp_edge++)
    {
        if (node->in_edges(tmp_edge)->code_offset() == -1)
        {
            target_of_backward_branch = true;
            break;
        }
    }
    if (target_of_backward_branch)
        pldi_std_gc_safe ++;
#endif // PLDI
    // Walk the instruction list.  `next` is captured up front because the
    // current instruction may be unlinked below.
    Inst *i = head->next(); 
    while (i != head) {
        Inst *next = i->next();

#ifdef STAT_INDIRECT_CALL
//		if(i->is_call() && ((Call_Inst*)i)->kind == Call_Inst::stat_indirect_call){
		if(i->is_stat_call()){
/*			void* call_addr = orp_get_rt_support_addr(ORP_RT_STAT_INDIRECT_CALL) ;
			assert(call_addr) ;
			xe.emit_call((char*)call_addr);
			unsigned patch_offset = xe.get_offset() - 4;
			xe.code_patch = new (xe.mem) 
				Call_Patch(xe.code_patch, patch_offset, (char*)call_addr);
			i->unlink() ;
			i = next;
			continue ;
*/
		}
#endif

#ifdef PLDI
        // Count it if it's a call site, but not if it's the first instruction
        // of the BB and the BB was already counted as a target of a backward
        // branch.
        if (i->is_call() && !(i == head->next() && target_of_backward_branch))
            pldi_std_gc_safe ++;
#endif // PLDI
        // Resolve a pending one-byte displacement recorded in
        // xe.patch_switch_br: it is set to the distance from the byte after
        // the patch location to the current emission point.
        if (i->is_target_of_optimized_sw_br()) {
            assert(xe.patch_switch_br != NULL);
            *xe.patch_switch_br = xe.get_next() - xe.patch_switch_br - 1;
            xe.patch_switch_br = NULL;
        }
        //
        // detect if inst i is redundant (e.g., eax = eax)
        //
        Inst *prev;
        // Second source operand, when the instruction has exactly two sources.
        Operand *src1 = (i->n_srcs == 2) ? i->src(1) : NULL;
        if (i->is_same_reg_copying() && !i->is_fp_pop() && i->can_eliminate())
            i->unlink();
        else if (i->is_compare() && ((Compare_Inst*)i)->kind == Compare_Inst::test &&
            (prev = i->prev()) != head && prev->dst() == i->src(0) &&
            prev->sets_zero_flag() && !prev->must_use_lea()) // eliminate   eax = eax - 1
            i->unlink();                                     //             test eax,eax
        else if (src1 != NULL && src1->kind == Operand::Immediate && // eax = eax + 0 or eax = eax * 1 
             ((i->is_add() && ((Add_Inst*)i)->kind == Add_Inst::add && ((Imm_Operand*)src1)->imm() == 0) ||
			 (i->is_mul() && ((Mul_Inst*)i)->kind == Mul_Inst::mul && ((Imm_Operand*)src1)->imm() == 1)) &&
#ifdef TURN_OFF_FOR_DEBUG
			 i->dst()->bv_position() < n_reg && i->dst()->bv_position() == i->src(0)->bv_position())
#else
			  ((i->dst()->is_reg() && i->dst()->bv_position() < n_reg && 
			  i->src(0)->is_reg() && i->dst()->bv_position() == i->src(0)->bv_position() ) ||(i->dst()->is_mem() && i->dst() == i->src(0) )))
#endif
				i->unlink() ;
        else {
            //
            // emit x86 instruction
            //
            // Record the code offsets of the instructions the GC map tracks
            // specially: first push, first pop and the return.
            if (i == fg->first_push_inst)
                gcmap->set_first_push_offset(xe.get_offset());
            else if (i == fg->first_pop_inst)
                gcmap->set_first_pop_offset(xe.get_offset());
            else if (i == fg->return_inst)
                gcmap->set_return_offset(xe.get_offset());
            unsigned pre_offset = xe.get_offset();
            i->emit_inst(xe, xp);

            // NOTE(review): `info` is not used afterwards in this function.
			const Inst::Info* info = i->info();

            gcmap->add_inst(i, pre_offset, xe.get_offset(), frame);

#ifdef PLDI_OVERRIDDEN
            // For a vtable compare guarding an inlined method that is not
            // currently overridden, and that is followed by a branch, register
            // an Overridden_Patch covering the bytes just emitted and record
            // the inlining assumption with the VM.
            static int n_overridden = 0;
            if (i->is_compare() && ((Compare_Inst*)i)->is_vtable_compare() &&
                !method_is_overridden(((Compare_Inst*)i)->method_inlined())) {
                n_overridden++;
                // if (n_overridden > 4 && n_overridden < 7) {
                if (i->next()->is_branch()) {
                    Method_Handle callee_mh = ((Compare_Inst*)i)->method_inlined();
                    xe.code_patch = new (xe.mem) 
                    Overridden_Patch(xe.code_patch,
                                    pre_offset,
                                    xe.get_offset() - pre_offset,
                                    xe.curr_node->get_fallthrough(),
                                    callee_mh);
                    method_set_inline_assumption(fg->cmpl_handle(),
                                                 fg->m_handle(),
                                                 callee_mh);
                    //cout << "overridden" << endl;
                }
            }
#endif

        }
        i = next;
    }
    
	// Now check whether we need to emit a branch at the end.
    if (need_fallthrough_branch)
    {
        Jump_Inst *new_inst = new(mm) Jump_Inst(NULL, head);
        new_inst->set_fallthrough();
        unsigned pre_offset = xe.get_offset();
        new_inst->emit_inst(xe, xp);
        gcmap->add_inst(new_inst, pre_offset, xe.get_offset(), frame);
    }
    // Code length excludes any padding emitted before the block's offset was recorded.
    node->set_code_length(xe.get_offset() - node->code_offset());
    // Keep the GC-map basic block open across the boundary when this block
    // falls straight into the next one: single out-edge to the node emitted
    // next, no exception edge, same enclosing subroutine.
    if (true &&
        node->out_edge_size() == 1 &&
        node->out_edges(0) == next_node_emitted &&
        node->eh_out_edge() == NULL &&
        node->get_enclosing_subr() == next_node_emitted->get_enclosing_subr())
        should_start_new_bb = false;
    else
    {
        should_start_new_bb = true;
        gcmap->end_bb(node, xe.get_offset());
    }
}

//
// Emit native code for every node of linear_node_ordering, copy the result
// into a code block allocated for the method, and write the GC map into the
// method's info block.
//
void Flow_Graph::emit_code(O3_Emitter& xe, X86_Opnd_Pool& xp, Frame& frame, Expressions &exprs)
{
    // XXX- use a temporary memory manager
    GC_Map gcmap(mem_manager, this, exprs.reg_map.curr_tmp_reg_id());

    //
    // emit the blocks in linear order; the ordering itself is not
    // recomputed here
    //
    //linearize();
    Cfg_Node_List *const ordering_head = &linear_node_ordering;
    bool should_start_new_bb = true;
    bool is_prolog = true;      // only the first node emitted is the prolog
    Cfg_Node_List *entry = ordering_head->next();
    while (entry != ordering_head)
    {
        emit_block(this, entry->node(), traversal_number, mem_manager, xe, xp, frame,
                   is_prolog, &gcmap, entry->next()->node(), should_start_new_bb);
        is_prolog = false;
        entry = entry->next();
    }
    // the last block emitted must have closed its GC-map basic block
    assert(should_start_new_bb);

    //
    // allocate the method's code block and copy the emitted bytes into it
    //
    unsigned code_size = xe.get_size();
    assert(code_size > 0);
    code_block = (char*)method_allocate_code_block(m_handle(), O3_Jit_Handle, code_size);
    xe.copy(code_block);
#if 0
    if ((unsigned)code_block % 16 != 0)
        cout << "Code_block alignment: " << (unsigned)code_block % 16 << " in "
        << class_get_name(c_handle()) << "." << method_get_name(m_handle()) << endl;
#endif // 0
#ifdef PLDI_OVERRIDDEN
    //
    // create overridden rec
    //
    overridden_rec = create_overridden_rec(xe.code_patch, code_block, m_handle(), cmpl_handle());
#endif

    //
    // serialize the GC map (emit_all runs twice, with the final flag set to
    // true and then false) and store it in the method's info block, rounding
    // the bit count up to whole bytes
    //
    BitStream bits;
    unsigned mi_offset_bits = 32;
    gcmap.emit_all(this, frame, m_handle(), exprs, bits, mi_offset_bits, code_size, true);
    gcmap.emit_all(this, frame, m_handle(), exprs, bits, mi_offset_bits, code_size, false);
    unsigned mi_size = (bits.get_offset() + 7) / 8;
    Byte *mi = method_allocate_info_block(m_handle(), O3_Jit_Handle, mi_size);
    bits.copy_into(mi);

#if 0
    static unsigned total_code_size = 0;
    static unsigned total_info_size = 0;
    static unsigned method_count = 0;
    total_code_size += code_size;
    total_info_size += mi_size;
    method_count ++;
    cout << class_get_name(c_handle()) << "." << method_get_name(m_handle())
         << " (method " << method_count << "): " << endl;
    cout << "\tcode_size " << code_size << endl;
    cout << "\tinfo_size " << mi_size << " (" << bits.get_offset() << " bits)" << endl;
    cout << "\tavg       " << ((double) mi_size) / code_size
         << " info bytes per code byte" << endl;
    cout << "\tCumulative code_size " << total_code_size << endl;
    cout << "\tCumulative info_size " << total_info_size << endl;
    cout << "\tCumulative avg:      " << ((double) total_info_size) / total_code_size << endl;
#endif
#ifdef PLDI
    pldi_num_methods ++;
    pldi_code_size += code_size;
    pldi_info_size += mi_size;
#endif // PLDI
}

void Flow_Graph::register_exceptions()
{
    // First, determine the number of EH entries.  Each Eh_Node contributes
    // (in_edge_size*out_edge_size) EH entries.
    unsigned num_eh_entries = 0;
    Eh_Node *eh;
    Eh_Node *last_eh = &_handlers;
    for (eh=_handlers.next(); eh!=last_eh; eh=eh->next())
        num_eh_entries += (eh->in_edge_size() * eh->out_edge_size());
    method_set_num_target_handlers(m_handle(), O3_Jit_Handle, num_eh_entries);
    // Next, repeat the loop and report the actual entries to the ORP.
    unsigned cur_eh_entry = 0;
    for (eh=_handlers.next(); eh!=last_eh; eh=eh->next())
    {
        Cfg_Int i;
        for (i=0; i<eh->in_edge_size(); i++)
        {
            Cfg_Node *n = eh->in_edges(i);
            char *start_ip = code_block + n->code_offset();
            char *end_ip = code_block + n->code_offset() + n->code_length();
            Cfg_Int j;
            for (j=0; j<eh->out_edge_size(); j++)
            {
                Cfg_Node *handler = eh->out_edges(j)->handler;
                char *handler_ip = code_block + handler->code_offset();
                Class_Handle catch_ch = eh->out_edges(j)->class_handle;
                method_set_target_handler_info(m_handle(), O3_Jit_Handle, cur_eh_entry,
                    start_ip, end_ip, handler_ip, catch_ch,
                    !handler->live->is_live_var(eax_reg) // whether eax (exc obj) is dead
                    );
                cur_eh_entry ++;
            }
        }
    }
    assert(cur_eh_entry == num_eh_entries);
}

// NOTE(review): not referenced anywhere in the visible part of this file;
// presumably the number of registers available for argument passing --
// confirm against its users before relying on it.
#define MAX_ARG_REGS 2

//
// Drive code emission for one method: build the frame, insert prolog and
// epilog, emit every block, register the exception handlers, then apply the
// branch patches and fill in the switch tables.
//   cmpl_handle, c_handle -- c_handle is only forwarded to the prolog/epilog
//                            builders; cmpl_handle is not used here
//   bytecode_size         -- sizes the emitter and its memory manager
//   args_on_stack         -- number of stack arguments, given to the frame
//   do_block_padding      -- (CAFFEINE_MARK only) copied into block_padding
//
void code_emission(Compile_Handle cmpl_handle,
                   Class_Handle   c_handle,
                   Method_Handle  m_handle, 
                   Expressions&   exprs, 
                   Flow_Graph     *fg, 
                   unsigned       bytecode_size,
                   unsigned       args_on_stack
#ifdef CAFFEINE_MARK
                   , bool do_block_padding
#endif
                   ) {
    //
    // count the callee-saved registers the method actually uses
    //
    unsigned callee_saved_regs = fg->callee_saved_registers_used();
    unsigned n_callee = 0;
    int reg;
    for (reg = 0; reg < n_reg; reg++)
    {
        if (callee_saved_regs & callee_saved_mask & (1<<reg))
            n_callee++;
    }

    //
    // esp-based frame: no vars, no extra words, one spill word per home location
    //
    unsigned n_spill_words = fg->num_home_locations();
    ESP_Frame frame(args_on_stack,0,0,n_spill_words,n_callee);

    //
    // prolog and epilog; remember their first push/pop for the GC map
    //
    fg->first_push_inst =
        insert_prolog(c_handle,m_handle,exprs,frame,fg->prolog(),callee_saved_regs);
    fg->first_pop_inst =
        insert_epilog(c_handle,m_handle,exprs,frame,fg->epilog(),callee_saved_regs);

    //
    // data emitter over a freshly allocated data block
    //
    unsigned data_words = exprs.data_space();
    char *data_block = (char *)method_allocate_data_block(m_handle, O3_Jit_Handle, data_words*sizeof(int));
    O3_Data_Emitter data_emitter(data_block,data_words*sizeof(int));

    //
    // code emitter with its own memory manager
    //
    Mem_Manager mem(bytecode_size*sizeof(int));
    O3_Emitter emitter(data_emitter,mem,bytecode_size);

    //
    // operand pool backed by three operand objects local to this call
    //
    M_Opnd       m_opnd(0);
    M_Base_Opnd  m_base(eax_reg,0);
    M_Index_Opnd m_indx(eax_reg,eax_reg,0,0);
    X86_Opnd_Pool x86_opnds(&m_opnd,&m_base,&m_indx);

    //
    // Identify cycle-inducing edges in the flow graph, exception handlers
    //
    //fg->partition_loop_edges();

#ifdef CAFFEINE_MARK
    block_padding = do_block_padding;
#endif
    //
    // emit all blocks, then register the exception table
    //
    fg->emit_code(emitter,x86_opnds,frame, exprs);
    fg->register_exceptions();

    //
    // apply the code patches recorded during emission
    //
    Patch *patch = emitter.code_patch;
    while (patch != NULL)
    {
        patch->apply(fg->code_block);
        patch = patch->next();
    }

    //
    // fill in the switch tables
    //
    Table_Switch_Patch *sw = emitter.table_switch;
    while (sw != NULL)
    {
        sw->apply(fg->code_block);
        sw = sw->next;
    }

#if defined(DUMP_JIT) && defined(PRINTABLE_O3) && defined(ORP_NT) 
    O3_dump_jit(fg);
	O3_dump_jit_for_profile(fg);
#endif // TRACE_O3
}

void Branch_Patch::apply(char *code_buffer) {
	char *inst = code_buffer + offset;
    unsigned target_offset = target->code_offset();
    assert(target_offset != -1);
	Imm_Opnd(target_offset - offset - 4).emit32(inst);
}

#ifdef PLDI_OVERRIDDEN
//
// Overwrite the `length` bytes at code_buffer + offset with a jump to
// `target`, filling the remainder with nops.
//
// A 2-byte short jump is used when the forward distance fits in 7 bits.
// Anything else (including backward targets, whose unsigned distance wraps)
// takes the 5-byte near-jump path; that path is flagged as an error because
// it cannot be written atomically, so assert-enabled builds stop there.
// The displacement of either form is relative to the end of the jump
// instruction itself: offset + 2 for the short form, offset + 5 for the
// near form.
//
void Overridden_Patch::apply(char *code_buffer) {
    char *inst = code_buffer + offset;
    unsigned target_offset = target->code_offset();
    assert(length > 2 && target_offset != -1);
    if (target_offset - offset - 2 < (1u<<7))
    {
        // Named operand: taking the address of a temporary is not valid C++.
        Imm_Opnd rel8(target_offset - offset - 2);
        jump8(inst,&rel8);
        unsigned i;
        for (i = 2; i < length; i++)
            nop(inst+i);
    }
    else
    {
        assert(0) ; //ERROR! for atomic violation
        // Only reached when asserts are compiled out.  The near jump is five
        // bytes long (the nop fill below starts at byte 5), so the
        // displacement is measured from offset + 5, not offset + 2.
        // NOTE(review): this path also assumes length >= 5.
        Imm_Opnd rel32(target_offset - offset - 5);
        jump32(inst,&rel32);
        unsigned i;
        for (i = 5; i < length; i++)
            nop(inst+i);
    }
}
#endif

void Call_Patch::apply(char *code_buffer) {
	char *inst = code_buffer + offset;
	Imm_Opnd(target - inst - 4).emit32(inst);
}

void NextPC_Patch::apply(char *code_buffer) {
    char *inst = code_buffer + offset;
    unsigned target_offset = (unsigned)(code_buffer + succ->code_offset());
    Imm_Opnd(target_offset).emit32(inst);
}

//
// fill in switch table entry
//
void Table_Switch_Patch::apply(char *code_buffer) {
    // the target block must already have a code offset
    assert(target->code_offset() != -1);

    // the table entry holds the absolute address of the target's code
    int *table_entry = (int*)data_label;
    *table_entry = (int)code_buffer + target->code_offset();
}
