// Copyright (C)  2000 Intel Corporation.  All rights reserved.
//
// $Header: /usr/development/orp/orp/arch/ia32/base/jit_lock_rt_support_ia32.cpp,v 1.21 2001/12/29 09:29:22 xli18 Exp $
//

#include "platform.h"
#include <iostream.h>
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>

#include "orp_synch.h"
#include "../x86/x86.h"
#include "orp_stats.h"
#include "internal_jit_intf.h"
#include "nogc.h"

#ifndef OBJECT_LOCK_V2 
#include "mon_enter_exit.h"
#else
#include "mon_enter_exit_olv2.h"
#endif

#ifdef ORP_VTUNE_SUPPORT
//M:
#include "orp_vtune.h"
#endif


#ifndef OBJECT_LOCK_V2 

#ifdef ORP_STATS

// Forward declarations of the ORP_STATS-only wrappers defined just below.
// Each one bumps a counter in orp_stats_total and then forwards its argument
// to the matching orp_monitor_* routine.
// NOTE(review): stdcall__ is presumably a platform macro that supplies the
// calling-convention attribute for compilers without __stdcall -- confirm in
// platform.h.
static void __stdcall monitor_enter_instrumented(Java_java_lang_Object *) stdcall__;
static void __stdcall monitor_exit_instrumented(Java_java_lang_Object *) stdcall__;


// Statistics wrapper: records one more monitor-enter in orp_stats_total and
// then passes `object` on to orp_monitor_enter().
static void __stdcall
monitor_enter_instrumented(Java_java_lang_Object *object)
{
    ++orp_stats_total.num_monitor_enter;
    orp_monitor_enter(object);
} // monitor_enter_instrumented



// Statistics wrapper: records one more monitor-exit in orp_stats_total and
// then passes `object` on to orp_monitor_exit().
static void __stdcall
monitor_exit_instrumented(Java_java_lang_Object *object)
{
    ++orp_stats_total.num_monitor_exit;
    orp_monitor_exit(object);
} // monitor_exit_instrumented

#endif // ORP_STATS

#ifdef MONITOR_STO
// Returns the entry point of the MONITOR_STO ("lazy lock") monitor-enter
// stub.  The stub is generated on the first call; every later call returns
// the cached address.
//
// Instruction sequence requested from the emitters, in order:
//     inc  [&orp_stats_total.num_monitor_enter]   (ORP_STATS builds only)
//     mov  ecx, [esp + 4]
//     mov  eax, [&num_lazylock]
//     mov  [eax], ecx
//     call lazy_monitor_enter_check               (_DEBUG builds only)
//     add  [&num_lazylock], 4
//     ret  4
void * getaddress__orp_monitor_enter_naked(){
    static void *cached_entry = 0;
    if (cached_entry != 0) {
        return cached_entry;
    }

    const int capacity = 72;
    char *const code = (char *)gc_malloc_fixed_code_for_class_loading(capacity);
#ifdef _DEBUG
    // Pre-fill the whole buffer with int 3 opcodes.
    memset(code, 0xcc /*int 3*/, capacity);
#endif
    char *cur = code;   // next byte to be emitted

#ifdef ORP_STATS
    cur = inc(cur, &M_Opnd((unsigned)&(orp_stats_total.num_monitor_enter)));
#endif

    // ecx <- the stack slot at [esp + 4]
    cur = mov(cur, &ecx_opnd, &M_Base_Opnd(esp_reg, 4));

    // Store ecx through the pointer currently held in num_lazylock.
    cur = mov(cur, &eax_opnd, &M_Opnd((unsigned)&num_lazylock));
    cur = mov(cur, &M_Base_Opnd(eax_reg, 0), &ecx_opnd);

#ifdef _DEBUG
    cur = call(cur, (char *)lazy_monitor_enter_check);
#endif

    // Advance num_lazylock by 4 and return, popping 4 bytes of arguments.
    cur = alu(cur, add_opc, &M_Opnd((unsigned)&num_lazylock),&Imm_Opnd(4));
    cur = ret(cur, &Imm_Opnd(4));

    cached_entry = code;
    // Checked only after emission: the generated code must fit the buffer.
    assert((cur - code) < capacity);

#ifdef ORP_VTUNE_SUPPORT
    //M:
    vtune_notify_stub_load_finished("getaddress__orp_monitor_enter_naked",(Byte*) code,cur-code);
#endif
    return cached_entry;
} //getaddress__orp_monitor_enter_naked

// Returns the entry point of the MONITOR_STO ("lazy lock") monitor-exit
// stub, generating it on the first call and caching the address.
//
// Instruction sequence requested from the emitters, in order:
//     inc  [&orp_stats_total.num_monitor_exit]    (ORP_STATS builds only)
//     call lazy_monitor_exit_check                (_DEBUG builds only)
//     sub  [&num_lazylock], 4
//     mov  eax, [&num_lazylock]
//     mov  ecx, [eax]
//     cmp  [esp + 4], ecx
//     jne  slow
//     mov  [eax], 0                               (_DEBUG builds only)
//     ret  4
//   slow:
//     add  [&num_lazylock], 4                     (undo the sub above)
//     <gen_setup_j2n_frame>
//     push [esp + sizeof(J2N_Saved_State)]
//     call orp_monitor_exit
//     <gen_pop_j2n_frame>
//     ret  4
void * getaddress__orp_monitor_exit_naked()
{
    static void *cached_entry = 0;
    if (cached_entry != 0) {
        return cached_entry;
    }

    const int capacity = 126;
    char *const code = (char *)gc_malloc_fixed_code_for_class_loading(capacity);
    // NOTE(review): this fill is guarded by _DEBUG_EVENT, whereas the
    // monitor-enter generators in this file use _DEBUG -- confirm that the
    // difference is intentional.
#ifdef _DEBUG_EVENT
    memset(code, 0xcc /*int 3*/, capacity);
#endif
    char *cur = code;   // next byte to be emitted

#ifdef ORP_STATS
    cur = inc(cur, &M_Opnd((unsigned)&(orp_stats_total.num_monitor_exit)));
#endif

#ifdef _DEBUG
    cur = call(cur, (char *)lazy_monitor_exit_check);
#endif

    // Step num_lazylock back by 4 and compare the word it now points at
    // with the stack slot at [esp + 4].
    cur = alu(cur, sub_opc, &M_Opnd((unsigned)&num_lazylock),&Imm_Opnd(4));
    cur = mov(cur, &eax_opnd, &M_Opnd((unsigned)&num_lazylock));
    cur = mov(cur, &ecx_opnd, &M_Base_Opnd(eax_reg, 0));
    cur = alu(cur, cmp_opc, &M_Base_Opnd(esp_reg, 4), &ecx_opnd);

    // Conditional branch whose 8-bit displacement (the last byte emitted)
    // is filled in once the slow path's address is known.
    cur = branch8(cur, cc_ne, &Imm_Opnd(0), 0);
    char *slow_path_disp = cur - 1;
#ifdef _DEBUG
    //zero the recorded lock to make GC safe
    cur = mov(cur, &M_Base_Opnd(eax_reg, 0), &Imm_Opnd(0));
#endif
    cur = ret(cur, &Imm_Opnd(4));

    // The displacement is relative to the byte following the displacement.
    *slow_path_disp = (char)(cur - slow_path_disp - 1);

    cur = alu(cur, add_opc, &M_Opnd((unsigned)&num_lazylock),&Imm_Opnd(4));

    cur = gen_setup_j2n_frame(cur);
    cur = push(cur, &M_Base_Opnd(esp_reg, sizeof(J2N_Saved_State)));
    cur = call(cur, (char *)orp_monitor_exit);
    cur = gen_pop_j2n_frame(cur);
    cur = ret(cur, &Imm_Opnd(4));

    cached_entry = code;
    // Checked only after emission: the generated code must fit the buffer.
    assert((cur - code) < capacity);
#ifdef ORP_VTUNE_SUPPORT
    //M:
    vtune_notify_stub_load_finished("getaddress__orp_monitor_exit_naked",(Byte*) code,cur-code);
#endif

    return cached_entry;
} //getaddress__orp_monitor_exit_naked


// Emits the compare-and-exchange based monitor-enter stub at `ss`, writing
// over whatever bytes are already there, and returns the address just past
// the last emitted byte.
//
// ss: start of a writable code buffer.  The assert below expects the
//     generated code to stay under 72 bytes.
//
// Instruction sequence requested from the emitters, in order:
//     inc  [&orp_stats_total.num_monitor_enter]   (ORP_STATS builds only)
//     push [esp + 4]
//     call orp_monitor_cmp_value
//     add  esp, 4
//     mov  ecx, [esp + 4]
//     add  ecx, 4            (sub ecx, 4 when OLD_OBJ_LAYOUT is defined)
//     mov  edx, eax
//     mov  eax, UNCONTESTED_HEADER_VALUE
//     lock cmpxchg [ecx], edx
//     sub  eax, UNCONTESTED_HEADER_VALUE
//     jne  slow
//     ret  4
//   slow:
//     <gen_setup_j2n_frame>
//     push [esp + sizeof(J2N_Saved_State)]
//     call orp_monitor_enter
//     <gen_pop_j2n_frame>
//     ret  4
char * restore__orp_monitor_enter_naked(char * ss)
{
    const int stub_size = 72;
    const char *stub = ss;

#ifdef ORP_STATS
    ss = inc(ss, &M_Opnd((unsigned)&(orp_stats_total.num_monitor_enter)));
#endif

    // Pass the stack slot at [esp + 4] to orp_monitor_cmp_value and drop
    // the pushed copy again after the call.
    ss = push(ss, &M_Base_Opnd(esp_reg, 4));
    ss = call(ss, (char *)orp_monitor_cmp_value);
    ss = alu(ss, add_opc, &esp_opnd, &Imm_Opnd(4));

    // ecx <- [esp + 4], adjusted by 4 in the direction chosen by the layout.
    ss = mov(ss, &ecx_opnd, &M_Base_Opnd(esp_reg, 4));
#ifndef OLD_OBJ_LAYOUT
    ss = alu(ss, add_opc, &ecx_opnd, &Imm_Opnd(4));
#else
    ss = alu(ss, sub_opc, &ecx_opnd, &Imm_Opnd(4));
#endif // OLD_OBJ_LAYOUT

    // cmpxchg compares eax with [ecx]; eax is loaded with
    // UNCONTESTED_HEADER_VALUE and edx with the value returned by
    // orp_monitor_cmp_value.
    ss = mov(ss, &edx_opnd, &eax_opnd);
    ss = mov(ss, &eax_opnd, &Imm_Opnd(UNCONTESTED_HEADER_VALUE));

    ss = prefix(ss, lock_prefix);
    ss = cmpxchg(ss, &M_Base_Opnd(ecx_reg, 0), &edx_opnd);
    ss = alu(ss, sub_opc, &eax_opnd, &Imm_Opnd(UNCONTESTED_HEADER_VALUE));

    // Conditional branch whose 8-bit displacement (the last byte emitted)
    // is filled in once the slow path's address is known.
    ss = branch8(ss, cc_ne, &Imm_Opnd(0), 0);
    char *backpatch_address__fast_monitor_failed = ((char *)ss) - 1;
    ss = ret(ss, &Imm_Opnd(4));

    // The displacement is relative to the byte following the displacement.
    signed offset = (signed)ss - (signed)backpatch_address__fast_monitor_failed - 1;
    *backpatch_address__fast_monitor_failed = offset;

    // Slow path: call orp_monitor_enter inside a java-to-native frame.
    ss = gen_setup_j2n_frame(ss);

    ss = push(ss, &M_Base_Opnd(esp_reg, sizeof(J2N_Saved_State)));

    ss = call(ss, (char *)orp_monitor_enter);

    ss = gen_pop_j2n_frame(ss);

    ss = ret(ss, &Imm_Opnd(4));

    //cout << "MT monitor enter stubsize :" << (ss - stub) << endl;
    assert((ss - stub) < stub_size);

#ifdef ORP_VTUNE_SUPPORT
    //M:
    // Report the stub under this function's exact name (two underscores),
    // as restore__orp_monitor_exit_naked does.
    vtune_notify_stub_load_finished("restore__orp_monitor_enter_naked",(Byte*) stub,ss-stub);
#endif
    return ss;
} //restore__orp_monitor_enter_naked

// Emits the compare-and-exchange based monitor-exit stub at `ss`, writing
// over whatever bytes are already there, and returns the address just past
// the last emitted byte.
//
// ss: start of a writable code buffer.
// NOTE(review): the bound asserted below (236) is larger than the 126 bytes
// that the MONITOR_STO getaddress__orp_monitor_exit_naked allocates; if this
// routine is used to regenerate that stub in place, the assert does not
// protect the real buffer -- confirm against the caller.
//
// Instruction sequence requested from the emitters, in order:
//     inc  [&orp_stats_total.num_monitor_exit]    (ORP_STATS builds only)
//     push [esp + 4]
//     call orp_monitor_cmp_value
//     mov  [esp], eax
//     mov  ecx, [esp + 8]
//     add  ecx, 4            (sub ecx, 4 when OLD_OBJ_LAYOUT is defined)
//     mov  edx, UNCONTESTED_HEADER_VALUE
//     lock cmpxchg [ecx], edx
//     pop  ecx
//     sub  eax, ecx
//     jne  slow
//     ret  4
//   slow:
//     <gen_setup_j2n_frame>
//     push [esp + sizeof(J2N_Saved_State)]
//     call orp_monitor_exit
//     <gen_pop_j2n_frame>
//     ret  4
char * restore__orp_monitor_exit_naked(char * ss)
{

    const char *stub = ss;
    const int stub_size = 236;

#ifdef ORP_STATS
    ss = inc(ss, &M_Opnd((unsigned)&(orp_stats_total.num_monitor_exit)));
#endif
    ss = push(ss, &M_Base_Opnd(esp_reg, 4));

    ss = call(ss, (char *)orp_monitor_cmp_value);
    // We adjust the esp to pop the argument later.
    // Now, we temporarily re-use that space on the stack
    // to save the result of orp_monitor_cmp_value
    ss = mov(ss, &M_Base_Opnd(esp_reg, 0), &eax_opnd);

    // The pushed argument is still on the stack, so the slot that was at
    // [esp + 4] on entry is now at [esp + 8].  (An earlier revision also
    // emitted a load of [esp + 12] into ecx right before this one; its
    // result was overwritten immediately, so it has been dropped.)
    ss = mov(ss, &ecx_opnd, &M_Base_Opnd(esp_reg, 8));
#ifndef OLD_OBJ_LAYOUT
    ss = alu(ss, add_opc, &ecx_opnd, &Imm_Opnd(4));
#else
    ss = alu(ss, sub_opc, &ecx_opnd, &Imm_Opnd(4));
#endif // OLD_OBJ_LAYOUT
    ss = mov(ss, &edx_opnd, &Imm_Opnd(UNCONTESTED_HEADER_VALUE));

    // cmpxchg compares eax (still the orp_monitor_cmp_value result) with
    // [ecx] and stores edx there on a match.
    ss = prefix(ss, lock_prefix);
    ss = cmpxchg(ss, &M_Base_Opnd(ecx_reg, 0), &edx_opnd);

    // Now pop the argument to orp_monitor_cmp_value!
    // (the slot holds the saved orp_monitor_cmp_value result by now)
    ss = pop(ss, &ecx_opnd);

    ss = alu(ss, sub_opc, &eax_opnd, &ecx_opnd);

    // Conditional branch whose 8-bit displacement (the last byte emitted)
    // is filled in once the slow path's address is known.
    ss = branch8(ss, cc_ne, &Imm_Opnd(0), 0);
    char *backpatch_address__fast_monitor_failed = ((char *)ss) - 1;
    ss = ret(ss, &Imm_Opnd(4));

    // The displacement is relative to the byte following the displacement.
    signed offset = (signed)ss - (signed)backpatch_address__fast_monitor_failed - 1;
    *backpatch_address__fast_monitor_failed = offset;

    // Slow path: call orp_monitor_exit inside a java-to-native frame.
    ss = gen_setup_j2n_frame(ss);

    ss = push(ss, &M_Base_Opnd(esp_reg, sizeof(J2N_Saved_State)));

    ss = call(ss, (char *)orp_monitor_exit);

    ss = gen_pop_j2n_frame(ss);

    ss = ret(ss, &Imm_Opnd(4));

    assert((ss - stub) < stub_size);
#ifdef ORP_VTUNE_SUPPORT
    //M:
    vtune_notify_stub_load_finished("restore__orp_monitor_exit_naked",(Byte*) stub,ss-stub);
#endif

    return ss;
} //restore__orp_monitor_exit_naked

#else //#ifdef MONITOR_STO

// Returns the entry point of the compare-and-exchange based monitor-enter
// stub (build without MONITOR_STO).  The stub is generated on the first
// call; later calls return the cached address.
//
// Instruction sequence requested from the emitters, in order:
//     inc  [&orp_stats_total.num_monitor_enter]   (ORP_STATS builds only)
//     push [esp + 4]
//     call orp_monitor_cmp_value
//     add  esp, 4
//     mov  ecx, [esp + 4]
//     add  ecx, 4            (sub ecx, 4 when OLD_OBJ_LAYOUT is defined)
//     mov  edx, eax
//     mov  eax, UNCONTESTED_HEADER_VALUE
//     lock cmpxchg [ecx], edx
//     sub  eax, UNCONTESTED_HEADER_VALUE
//     jne  slow
//     ret  4
//   slow:
//     <gen_setup_j2n_frame>
//     push [esp + sizeof(J2N_Saved_State)]
//     call orp_monitor_enter
//     <gen_pop_j2n_frame>
//     ret  4
void * getaddress__orp_monitor_enter_naked()
{
    static void *cached_entry = 0;
    if (cached_entry != 0) {
        return cached_entry;
    }

    const int capacity = 86;
    char *const code = (char *)gc_malloc_fixed_code_for_class_loading(capacity);
#ifdef _DEBUG
    // Pre-fill the whole buffer with int 3 opcodes.
    memset(code, 0xcc /*int 3*/, capacity);
#endif
    char *cur = code;   // next byte to be emitted

#ifdef ORP_STATS
    cur = inc(cur, &M_Opnd((unsigned)&(orp_stats_total.num_monitor_enter)));
#endif

    // Call orp_monitor_cmp_value on the stack slot at [esp + 4], then drop
    // the pushed copy.
    cur = push(cur, &M_Base_Opnd(esp_reg, 4));
    cur = call(cur, (char *)orp_monitor_cmp_value);
    cur = alu(cur, add_opc, &esp_opnd, &Imm_Opnd(4));

    // ecx <- [esp + 4], adjusted by 4 in the direction chosen by the layout.
    cur = mov(cur, &ecx_opnd, &M_Base_Opnd(esp_reg, 4));
#ifdef OLD_OBJ_LAYOUT
    cur = alu(cur, sub_opc, &ecx_opnd, &Imm_Opnd(4));
#else
    cur = alu(cur, add_opc, &ecx_opnd, &Imm_Opnd(4));
#endif // OLD_OBJ_LAYOUT

    // edx carries the orp_monitor_cmp_value result; eax is the value that
    // cmpxchg compares with [ecx].
    cur = mov(cur, &edx_opnd, &eax_opnd);
    cur = mov(cur, &eax_opnd, &Imm_Opnd(UNCONTESTED_HEADER_VALUE));

    cur = prefix(cur, lock_prefix);
    cur = cmpxchg(cur, &M_Base_Opnd(ecx_reg, 0), &edx_opnd);
    cur = alu(cur, sub_opc, &eax_opnd, &Imm_Opnd(UNCONTESTED_HEADER_VALUE));

    // Conditional branch whose 8-bit displacement (the last byte emitted)
    // is filled in once the slow path's address is known.
    cur = branch8(cur, cc_ne, &Imm_Opnd(0), 0);
    char *slow_path_disp = cur - 1;

    cur = ret(cur, &Imm_Opnd(4));

    // The displacement is relative to the byte following the displacement.
    *slow_path_disp = (char)(cur - slow_path_disp - 1);

    // Slow path: call orp_monitor_enter inside a java-to-native frame.
    cur = gen_setup_j2n_frame(cur);
    cur = push(cur, &M_Base_Opnd(esp_reg, sizeof(J2N_Saved_State)));
    cur = call(cur, (char *)orp_monitor_enter);
    cur = gen_pop_j2n_frame(cur);
    cur = ret(cur, &Imm_Opnd(4));

    cached_entry = code;
    // Checked only after emission: the generated code must fit the buffer.
    assert((cur - code) < capacity);
#ifdef ORP_VTUNE_SUPPORT
    //M:
    vtune_notify_stub_load_finished("getaddress__orp_monitor_enter_naked",(Byte*) code,cur-code);
#endif
    return cached_entry;
} //getaddress__orp_monitor_enter_naked


// Returns the entry point of the compare-and-exchange based monitor-exit
// stub (build without MONITOR_STO).  The stub is generated on the first
// call; later calls return the cached address.
//
// Instruction sequence requested from the emitters, in order:
//     inc  [&orp_stats_total.num_monitor_exit]    (ORP_STATS builds only)
//     push [esp + 4]
//     call orp_monitor_cmp_value
//     mov  [esp], eax
//     mov  ecx, [esp + 8]
//     add  ecx, 4            (sub ecx, 4 when OLD_OBJ_LAYOUT is defined)
//     mov  edx, UNCONTESTED_HEADER_VALUE
//     lock cmpxchg [ecx], edx
//     pop  ecx
//     sub  eax, ecx
//     jne  slow
//     ret  4
//   slow:
//     <gen_setup_j2n_frame>
//     push [esp + sizeof(J2N_Saved_State)]
//     call orp_monitor_exit
//     <gen_pop_j2n_frame>
//     ret  4
void * getaddress__orp_monitor_exit_naked()
{
    static void *cached_entry = 0;
    if (cached_entry != 0) {
        return cached_entry;
    }

    const int capacity = 236;
    char *const code = (char *)gc_malloc_fixed_code_for_class_loading(capacity);
#ifdef _DEBUG
    // Pre-fill the whole buffer with int 3 opcodes.
    memset(code, 0xcc /*int 3*/, capacity);
#endif
    char *cur = code;   // next byte to be emitted

#ifdef ORP_STATS
    cur = inc(cur, &M_Opnd((unsigned)&(orp_stats_total.num_monitor_exit)));
#endif

    cur = push(cur, &M_Base_Opnd(esp_reg, 4));
    cur = call(cur, (char *)orp_monitor_cmp_value);
    // The pushed argument is not popped yet: its stack slot is reused to
    // keep the value returned by orp_monitor_cmp_value.
    cur = mov(cur, &M_Base_Opnd(esp_reg, 0), &eax_opnd);

    // With the extra word still pushed, the slot that was at [esp + 4] on
    // entry is now at [esp + 8].
    cur = mov(cur, &ecx_opnd, &M_Base_Opnd(esp_reg, 8));
#ifdef OLD_OBJ_LAYOUT
    cur = alu(cur, sub_opc, &ecx_opnd, &Imm_Opnd(4));
#else
    cur = alu(cur, add_opc, &ecx_opnd, &Imm_Opnd(4));
#endif // OLD_OBJ_LAYOUT

    cur = mov(cur, &edx_opnd, &Imm_Opnd(UNCONTESTED_HEADER_VALUE));

    // cmpxchg compares eax (still the orp_monitor_cmp_value result) with
    // [ecx] and stores edx there on a match.
    cur = prefix(cur, lock_prefix);
    cur = cmpxchg(cur, &M_Base_Opnd(ecx_reg, 0), &edx_opnd);

    // Pop the reused slot (the saved result) and compare it with eax.
    cur = pop(cur, &ecx_opnd);
    cur = alu(cur, sub_opc, &eax_opnd, &ecx_opnd);

    // Conditional branch whose 8-bit displacement (the last byte emitted)
    // is filled in once the slow path's address is known.
    cur = branch8(cur, cc_ne, &Imm_Opnd(0), 0);
    char *slow_path_disp = cur - 1;
    cur = ret(cur, &Imm_Opnd(4));

    // The displacement is relative to the byte following the displacement.
    *slow_path_disp = (char)(cur - slow_path_disp - 1);

    // Slow path: call orp_monitor_exit inside a java-to-native frame.
    cur = gen_setup_j2n_frame(cur);
    cur = push(cur, &M_Base_Opnd(esp_reg, sizeof(J2N_Saved_State)));
    cur = call(cur, (char *)orp_monitor_exit);
    cur = gen_pop_j2n_frame(cur);
    cur = ret(cur, &Imm_Opnd(4));

    cached_entry = code;
    // Checked only after emission: the generated code must fit the buffer.
    assert((cur - code) < capacity);
#ifdef ORP_VTUNE_SUPPORT
    //M:
    vtune_notify_stub_load_finished("getaddress__orp_monitor_exit_naked",(Byte*) code,cur-code);
#endif
    return cached_entry;
} //getaddress__orp_monitor_exit_naked

#endif //ifdef MONITOR_STO else

#else  //#ifndef OBJECT_LOCK_V2
// Class name handed to throw_java_exception_wrapper by the monitor-exit stubs
// below: the generated code pushes the pointer stored in this variable
// (the stubs embed the variable's address) right before the call.
const char * string_of_IllegalMonitorStateException = "java/lang/IllegalMonitorStateException";
#ifdef LAZY_LOCK
// OBJECT_LOCK_V2 + LAZY_LOCK: returns the entry point of the lazy
// monitor-enter stub, generating it on the first call and caching the
// address for later calls.
//
// Instruction sequence requested from the emitters, in order:
//     inc  [&orp_stats_total.num_monitor_enter]   (ORP_STATS builds only)
//     mov  ecx, [esp + 4]
//     mov  eax, [&num_lazylock]
//     mov  [eax], ecx
//     call lazy_monitor_enter_check               (ORP_STATS builds only)
//     add  [&num_lazylock], 4
//     ret  4
void * getaddress__orp_monitor_enter_naked(){
    static void *cached_entry = 0;
    if (cached_entry != 0) {
        return cached_entry;
    }

    const int capacity = 72;
    char *const code = (char *)gc_malloc_fixed_code_for_class_loading(capacity);
#ifdef _DEBUG
    // Pre-fill the whole buffer with int 3 opcodes.
    memset(code, 0xcc /*int 3*/, capacity);
#endif
    char *cur = code;   // next byte to be emitted

#ifdef ORP_STATS
    cur = inc(cur, &M_Opnd((unsigned)&(orp_stats_total.num_monitor_enter)));
#endif

    // ecx <- the stack slot at [esp + 4]
    cur = mov(cur, &ecx_opnd, &M_Base_Opnd(esp_reg, 4));

    // Store ecx through the pointer currently held in num_lazylock.
    cur = mov(cur, &eax_opnd, &M_Opnd((unsigned)&num_lazylock));
    cur = mov(cur, &M_Base_Opnd(eax_reg, 0), &ecx_opnd);

    // NOTE(review): the non-V2 variant of this stub guards this call with
    // _DEBUG rather than ORP_STATS -- confirm which guard is intended.
#ifdef ORP_STATS
    cur = call(cur, (char *)lazy_monitor_enter_check);
#endif

    // Advance num_lazylock by 4 and return, popping 4 bytes of arguments.
    cur = alu(cur, add_opc, &M_Opnd((unsigned)&num_lazylock),&Imm_Opnd(4));
    cur = ret(cur, &Imm_Opnd(4));

    cached_entry = code;
    // Checked only after emission: the generated code must fit the buffer.
    assert((cur - code) < capacity);

#ifdef ORP_VTUNE_SUPPORT
    //M:
    vtune_notify_stub_load_finished("getaddress__orp_monitor_enter_naked",(Byte*) code,cur-code);
#endif
    return cached_entry;
} //getaddress__orp_monitor_enter_naked

// OBJECT_LOCK_V2 + LAZY_LOCK: returns the entry point of the lazy
// monitor-exit stub, generating it on the first call and caching the
// address for later calls.
//
// Instruction sequence requested from the emitters, in order:
//     inc  [&orp_stats_total.num_monitor_exit]    (ORP_STATS builds only)
//     call lazy_monitor_exit_check                (ORP_STATS builds only)
//     sub  [&num_lazylock], 4
//     mov  eax, [&num_lazylock]
//     mov  ecx, [eax]
//     cmp  [esp + 4], ecx
//     jne  mismatch
//     mov  [eax], 0                               (_DEBUG builds only)
//     ret  4
//   mismatch:
//     push [&string_of_IllegalMonitorStateException]
//     call throw_java_exception_wrapper
//     ret  4
void * getaddress__orp_monitor_exit_naked()
{
    static void *cached_entry = 0;
    if (cached_entry != 0) {
        return cached_entry;
    }

    const int capacity = 126;
    char *const code = (char *)gc_malloc_fixed_code_for_class_loading(capacity);
    // NOTE(review): this fill is guarded by _DEBUG_EVENT, whereas the
    // matching monitor-enter generator uses _DEBUG -- confirm that the
    // difference is intentional.
#ifdef _DEBUG_EVENT
    memset(code, 0xcc /*int 3*/, capacity);
#endif
    char *cur = code;   // next byte to be emitted

#ifdef ORP_STATS
    cur = inc(cur, &M_Opnd((unsigned)&(orp_stats_total.num_monitor_exit)));
    cur = call(cur, (char *)lazy_monitor_exit_check);
#endif

    // Step num_lazylock back by 4 and compare the word it now points at
    // with the stack slot at [esp + 4].
    cur = alu(cur, sub_opc, &M_Opnd((unsigned)&num_lazylock),&Imm_Opnd(4));
    cur = mov(cur, &eax_opnd, &M_Opnd((unsigned)&num_lazylock));
    cur = mov(cur, &ecx_opnd, &M_Base_Opnd(eax_reg, 0));
    cur = alu(cur, cmp_opc, &M_Base_Opnd(esp_reg, 4), &ecx_opnd);

    // Conditional branch whose 8-bit displacement (the last byte emitted)
    // is filled in once the mismatch path's address is known.
    cur = branch8(cur, cc_ne, &Imm_Opnd(0), 0);
    char *mismatch_disp = cur - 1;
#ifdef _DEBUG
    //zero the recorded lock to make GC safe
    cur = mov(cur, &M_Base_Opnd(eax_reg, 0), &Imm_Opnd(0));
#endif
    cur = ret(cur, &Imm_Opnd(4));

    // The displacement is relative to the byte following the displacement.
    *mismatch_disp = (char)(cur - mismatch_disp - 1);

    // Mismatch path: push the exception class name pointer and call
    // throw_java_exception_wrapper.
    cur = push(cur,  &M_Opnd((unsigned)&(string_of_IllegalMonitorStateException) ) );
    cur = call(cur, (char *)throw_java_exception_wrapper);
    cur = ret(cur, &Imm_Opnd(4));

    cached_entry = code;
    // Checked only after emission: the generated code must fit the buffer.
    assert((cur - code) < capacity);
#ifdef ORP_VTUNE_SUPPORT
    //M:
    vtune_notify_stub_load_finished("getaddress__orp_monitor_exit_naked",(Byte*) code,cur-code);
#endif

    return cached_entry;
} //getaddress__orp_monitor_exit_naked

// OBJECT_LOCK_V2 + LAZY_LOCK: emits the stack-key based monitor-enter stub
// at code_addr, writing over whatever bytes are already there, and returns
// the address just past the last emitted byte.
//
// code_addr: start of a writable code buffer.  In _DEBUG builds the first
//            86 bytes are filled with 0xcc before emission.
// NOTE(review): the lazy monitor-enter stub in this file is allocated with
// 72 bytes; if callers pass that buffer here, the 86-byte debug fill runs
// past its end -- confirm against the caller.
//
// Instruction sequence requested from the emitters, in order:
//     inc  [&orp_stats_total.num_monitor_enter]   (ORP_STATS builds only)
//     mov  edx, esp
//     xor  eax, eax
//     mov  ecx, [esp + INPUT_ARG_OFFSET]
//     shr  edx, STACK_KEY_SHIFT
//     lock cmpxchg [ecx + HEADER_OFFSET + STACK_KEY_OFFSET], dx   (16 bit)
//     jne  slow
//     ret  4
//   slow:
//     <gen_setup_j2n_frame>
//     push [esp + sizeof(J2N_Saved_State)]
//     call orp_monitor_enter_slow
//     <gen_pop_j2n_frame>
//     ret  4
void * restore__orp_monitor_enter_naked(void * code_addr)
{
    const int capacity = 86;
    char *const code = (char *)code_addr;

#ifdef _DEBUG
    memset(code, 0xcc, capacity);
#endif

    char *cur = code;   // next byte to be emitted
#ifdef ORP_STATS
    cur = inc(cur, &M_Opnd((unsigned)&(orp_stats_total.num_monitor_enter)));
#endif

    // edx becomes esp >> STACK_KEY_SHIFT, eax is zeroed and ecx takes the
    // stack slot at [esp + INPUT_ARG_OFFSET].
    cur = mov(cur, &edx_opnd, &esp_opnd);
    cur = alu(cur, xor_opc, &eax_opnd, &eax_opnd);
    cur = mov(cur, &ecx_opnd, &M_Base_Opnd(esp_reg, INPUT_ARG_OFFSET));
    cur = shift(cur, shr_opc, &edx_opnd, &Imm_Opnd(STACK_KEY_SHIFT));

    // 16-bit locked compare-and-exchange on the stack-key field.
    cur = prefix(cur, lock_prefix);
    cur = cmpxchg(cur, &M_Base_Opnd(ecx_reg, HEADER_OFFSET + STACK_KEY_OFFSET), &edx_opnd, opnd_16);

    // Conditional branch whose 8-bit displacement (the last byte emitted)
    // is filled in once the slow path's address is known.
    cur = branch8(cur, cc_ne, &Imm_Opnd(0), 0);
    char *slow_path_disp = cur - 1;

    cur = ret(cur, &Imm_Opnd(4));

    // The displacement is relative to the byte following the displacement.
    *slow_path_disp = (char)(cur - slow_path_disp - 1);

    // Slow path: call orp_monitor_enter_slow inside a java-to-native frame.
    cur = gen_setup_j2n_frame(cur);
    cur = push(cur, &M_Base_Opnd(esp_reg, sizeof(J2N_Saved_State)));
    cur = call(cur, (char *)orp_monitor_enter_slow);
    cur = gen_pop_j2n_frame(cur);
    cur = ret(cur, &Imm_Opnd(4));

    // Checked only after emission.
    assert((cur - code) < capacity);
    return cur;
} //restore__orp_monitor_enter_naked

// OBJECT_LOCK_V2 + LAZY_LOCK: emits the stack-key based monitor-exit stub at
// code_addr, writing over whatever bytes are already there, and returns the
// address just past the last emitted byte.
//
// code_addr: start of a writable code buffer.  In _DEBUG builds the first
//            106 bytes are filled with 0xcc before emission.
//
// Layout of the generated code (three forward branches, each patched once
// its target has been emitted):
//     load the 16-bit stack key from the header and compare it with
//     (esp >> STACK_KEY_SHIFT)                         -> jne illegal
//     load the 16-bit hash/contention/recursion word   -> jne recursed
//     store 0 into the 16-bit stack-key field
//     test bit 0x80 of the loaded word                 -> jne contended
//     ret 4
//   contended: push [esp + INPUT_ARG_OFFSET]; call find_an_interested_thread;
//              add esp, 4; ret 4
//   recursed:  subtract 1 and store a byte at HEADER_OFFSET + RECURSION_OFFSET;
//              ret 4
//   illegal:   push [&string_of_IllegalMonitorStateException];
//              call throw_java_exception_wrapper; ret 4
// NOTE(review): the trailing `false` passed to alu()/mov() presumably selects
// the high byte register (ah), as in the reference assembly kept in the
// comment block at the end of this file -- confirm against the emitter.
void * restore__orp_monitor_exit_naked(void * code_addr)
{

    char *const code = (char *)code_addr;
    const int capacity = 106;

#ifdef _DEBUG
    memset(code, 0xcc, capacity);
#endif

    char *cur = code;   // next byte to be emitted
#ifdef ORP_STATS
    cur = inc(cur, &M_Opnd((unsigned)&(orp_stats_total.num_monitor_exit)));
#endif
    // Compare the stack key stored in the header with this stack's key.
    cur = mov(cur, &edx_opnd, &esp_opnd);
    cur = mov(cur, &ecx_opnd, &M_Base_Opnd(esp_reg, INPUT_ARG_OFFSET));
    cur = shift(cur, shr_opc, &edx_opnd, &Imm_Opnd(STACK_KEY_SHIFT));
    cur = mov(cur, &eax_opnd, &M_Base_Opnd(ecx_reg, HEADER_OFFSET + STACK_KEY_OFFSET ), opnd_16);
    cur = alu(cur, cmp_opc, &eax_opnd, &edx_opnd, opnd_16);
    cur = branch8(cur, cc_ne, &Imm_Opnd(0), 0);
    char *illegal_state_disp = cur - 1;

    // Load the hash/contention/recursion word and branch when the compare
    // against 0 reports a difference.
    cur = mov(cur, &eax_opnd, &M_Base_Opnd(ecx_reg,HEADER_OFFSET + HASH_CONTENTION_AND_RECURSION_OFFSET), opnd_16);
    cur = alu(cur, cmp_opc, &eax_opnd, &Imm_Opnd(0),false);
    cur = branch8(cur, cc_ne, &Imm_Opnd(0), 0);
    char *recursed_disp = cur - 1;

    //release the lock
    cur = mov(cur, &edx_opnd, &Imm_Opnd(0));
    cur = mov(cur, &M_Base_Opnd(ecx_reg, HEADER_OFFSET + STACK_KEY_OFFSET), &edx_opnd, opnd_16);

    // Test bit 0x80 of the word loaded above; the and leaves the flags for
    // the branch, so no separate compare is emitted.
    cur = mov(cur, &edx_opnd, &eax_opnd);
    cur = alu(cur, and_opc, &edx_opnd, &Imm_Opnd(0x80));

    cur = branch8(cur, cc_ne, &Imm_Opnd(0), 0);
    char *contended_disp = cur - 1;
    cur = ret(cur, &Imm_Opnd(4));

    // contended: each displacement is relative to the byte following it.
    *contended_disp = (char)(cur - contended_disp - 1);
    cur = push(cur, &M_Base_Opnd(esp_reg, INPUT_ARG_OFFSET));
    cur = call(cur, (char *)find_an_interested_thread);
    cur = alu(cur, add_opc, &esp_opnd, &Imm_Opnd(4));
    cur = ret(cur, &Imm_Opnd(4));

    // recursed:
    *recursed_disp = (char)(cur - recursed_disp - 1);
    cur = alu(cur, sub_opc, &eax_opnd, &Imm_Opnd(1), false);
    cur = mov(cur,  &M_Base_Opnd(ecx_reg, HEADER_OFFSET + RECURSION_OFFSET), &eax_opnd, opnd_8, false);
    cur = ret(cur, &Imm_Opnd(4));

    // illegal:
    *illegal_state_disp = (char)(cur - illegal_state_disp - 1);
    cur = push(cur,  &M_Opnd((unsigned)&(string_of_IllegalMonitorStateException) ) );
    cur = call(cur, (char *)throw_java_exception_wrapper);
    cur = ret(cur, &Imm_Opnd(4));

    // Checked only after emission.
    assert((cur - code) < capacity);
    return cur;

} //restore__orp_monitor_exit_naked
#else //ifdef LAZY_LOCK
// OBJECT_LOCK_V2 without LAZY_LOCK: returns the entry point of the stack-key
// based monitor-enter stub, generating it on the first call and caching the
// address for later calls.
//
// Instruction sequence requested from the emitters, in order:
//     inc  [&orp_stats_total.num_monitor_enter]   (ORP_STATS builds only)
//     mov  edx, esp
//     xor  eax, eax
//     mov  ecx, [esp + INPUT_ARG_OFFSET]
//     shr  edx, STACK_KEY_SHIFT
//     lock cmpxchg [ecx + HEADER_OFFSET + STACK_KEY_OFFSET], dx   (16 bit)
//     jne  slow
//     ret  4
//   slow:
//     <gen_setup_j2n_frame>
//     push [esp + sizeof(J2N_Saved_State)]
//     call orp_monitor_enter_slow
//     <gen_pop_j2n_frame>
//     ret  4
void * getaddress__orp_monitor_enter_naked()
{
    static void *cached_entry = 0;
    if (cached_entry != 0) {
        return cached_entry;
    }

    const int capacity = 86;
    char *const code = (char *)gc_malloc_fixed_code_for_class_loading(capacity);
#ifdef _DEBUG
    // Pre-fill the whole buffer with 0xcc bytes.
    memset(code, 0xcc, capacity);
#endif
    char *cur = code;   // next byte to be emitted

#ifdef ORP_STATS
    cur = inc(cur, &M_Opnd((unsigned)&(orp_stats_total.num_monitor_enter)));
#endif

    // edx becomes esp >> STACK_KEY_SHIFT, eax is zeroed and ecx takes the
    // stack slot at [esp + INPUT_ARG_OFFSET].
    cur = mov(cur, &edx_opnd, &esp_opnd);
    cur = alu(cur, xor_opc, &eax_opnd, &eax_opnd);
    cur = mov(cur, &ecx_opnd, &M_Base_Opnd(esp_reg, INPUT_ARG_OFFSET));
    cur = shift(cur, shr_opc, &edx_opnd, &Imm_Opnd(STACK_KEY_SHIFT));

    // 16-bit locked compare-and-exchange on the stack-key field.
    cur = prefix(cur, lock_prefix);
    cur = cmpxchg(cur, &M_Base_Opnd(ecx_reg, HEADER_OFFSET + STACK_KEY_OFFSET), &edx_opnd, opnd_16);

    // Conditional branch whose 8-bit displacement (the last byte emitted)
    // is filled in once the slow path's address is known.
    cur = branch8(cur, cc_ne, &Imm_Opnd(0), 0);
    char *slow_path_disp = cur - 1;

    cur = ret(cur, &Imm_Opnd(4));

    // The displacement is relative to the byte following the displacement.
    *slow_path_disp = (char)(cur - slow_path_disp - 1);

    // Slow path: call orp_monitor_enter_slow inside a java-to-native frame.
    cur = gen_setup_j2n_frame(cur);
    cur = push(cur, &M_Base_Opnd(esp_reg, sizeof(J2N_Saved_State)));
    cur = call(cur, (char *)orp_monitor_enter_slow);
    cur = gen_pop_j2n_frame(cur);
    cur = ret(cur, &Imm_Opnd(4));

    cached_entry = code;
    // Checked only after emission: the generated code must fit the buffer.
    assert((cur - code) < capacity);
    return cached_entry;
} //getaddress__orp_monitor_enter_naked

void * getaddress__orp_monitor_exit_naked(){
    //return (void *)orp_monitor_exit;

    static void *addr = 0;
    if (addr) {
        return addr;
    }

    const int stub_size = 106;
    char *stub = (char *)gc_malloc_fixed_code_for_class_loading(stub_size);
#ifdef _DEBUG
    memset(stub, 0xcc, stub_size);
#endif
    char *ss = stub;

#ifdef ORP_STATS
    ss = inc(ss, &M_Opnd((unsigned)&(orp_stats_total.num_monitor_exit)));
#endif
    ss = mov(ss, &edx_opnd, &esp_opnd);
    ss = mov(ss, &ecx_opnd, &M_Base_Opnd(esp_reg, INPUT_ARG_OFFSET));
	ss = shift(ss, shr_opc, &edx_opnd, &Imm_Opnd(STACK_KEY_SHIFT));
	ss = mov(ss, &eax_opnd, &M_Base_Opnd(ecx_reg, HEADER_OFFSET + STACK_KEY_OFFSET ), opnd_16);
	ss = alu(ss, cmp_opc, &eax_opnd, &edx_opnd, opnd_16);
    ss = branch8(ss, cc_ne, &Imm_Opnd(0), 0);
    char *backpatch_address__illegal_monitor_failed = ((char *)ss) - 1;

	ss = mov(ss, &eax_opnd, &M_Base_Opnd(ecx_reg,HEADER_OFFSET + HASH_CONTENTION_AND_RECURSION_OFFSET), opnd_16);
	ss = alu(ss, cmp_opc, &eax_opnd, &Imm_Opnd(0),false);
    ss = branch8(ss, cc_ne, &Imm_Opnd(0), 0);
    char *backpatch_address__recursed_monitor_failed = ((char *)ss) - 1;

	//release the lock
    ss = mov(ss, &edx_opnd, &Imm_Opnd(0));
    ss = mov(ss, &M_Base_Opnd(ecx_reg, HEADER_OFFSET + STACK_KEY_OFFSET), &edx_opnd, opnd_16);

    //ss = alu(ss, and_opc, &al_opnd, &Imm_Opnd(CONTENTION_MASK));
	ss = mov(ss, &edx_opnd, &eax_opnd);
	ss = alu(ss, and_opc, &edx_opnd, &Imm_Opnd(0x80));
	//ss = alu(ss, cmp_opc, &dl_opnd,  &Imm_Opnd(0));

    ss = branch8(ss, cc_ne, &Imm_Opnd(0), 0);
    char *backpatch_address__contended_monitor_failed = ((char *)ss) - 1;
    ss = ret(ss, &Imm_Opnd(4));

    signed 
    offset = (signed)ss-(signed)backpatch_address__contended_monitor_failed - 1;
    *backpatch_address__contended_monitor_failed = offset;
	ss = push(ss, &M_Base_Opnd(esp_reg, INPUT_ARG_OFFSET));
	ss = call(ss, (char *)find_an_interested_thread);
	ss = alu(ss, add_opc, &esp_opnd, &Imm_Opnd(4));
    ss = ret(ss, &Imm_Opnd(4));

    offset = (signed)ss-(signed)backpatch_address__recursed_monitor_failed - 1;
    *backpatch_address__recursed_monitor_failed = offset;
    ss = alu(ss, sub_opc, &eax_opnd, &Imm_Opnd(1), false);
	ss = mov(ss,  &M_Base_Opnd(ecx_reg, HEADER_OFFSET + RECURSION_OFFSET), &eax_opnd, opnd_8, false);
    ss = ret(ss, &Imm_Opnd(4));

    offset = (signed)ss-(signed)backpatch_address__illegal_monitor_failed - 1;
    *backpatch_address__illegal_monitor_failed = offset;
    ss = push(ss,  &M_Opnd((unsigned)&(string_of_IllegalMonitorStateException) ) );
	ss = call(ss, (char *)throw_java_exception_wrapper);
    ss = ret(ss, &Imm_Opnd(4));

    addr = stub;
    assert((ss - stub) < stub_size);
    return addr; 

}
#endif //LAZY_LOCK
/*
#define LOCK_PREFIX lock
__declspec(naked) __stdcall orp_try_monitor_enter(Java_java_lang_Object *p_obj)
{
    // todo --> run tests on UP machine to find out if we need a "no lock" cmpxchg version
   __asm {
            xor eax, eax
            mov ecx, esp
            mov ecx, [ecx + INPUT_ARG_OFFSET]
            mov edx, esp
            shr edx, STACK_KEY_SHIFT             // dx now holds stack_key
            LOCK_PREFIX cmpxchg [ecx + HEADER_OFFSET + STACK_KEY_OFFSET], dx
            jnz recursion_test
            ret 4  // returns eax = 0 because we just got the lock

recursion_test:
            cmp ax, dx
            jne another_thread_has_the_lock
            // bugbug somehow verify that inc[...] ALWAYS increments a byte, NOT an integer
            inc [ecx + HEADER_OFFSET + RECURSION_OFFSET]  // increment the recursion_count
            cmp [ecx + HEADER_OFFSET + RECURSION_OFFSET], 0
            je recursion_overflow
            ret 4   // returns eax = 0 if we already owned the lock

recursion_overflow:
            push [ecx + INPUT_ARG_OFFSET]
            call mon_enter_recursion_overflowed
            pop eax  //throw away the input arg 
            mov eax, 0 //got the lock, return eax = 0
            ret 4

another_thread_has_the_lock:
            ret 4  // return eax != 0 which means we did not get the lock
    }
}   

void * getaddress__orp_monitor_enter_naked(){

    return SMP_orp_monitor_enter;
}

__declspec(naked) __stdcall uint16 SMP_orp_monitor_enter(Java_java_lang_Object *p_obj)
{
   __asm {
            xor eax, eax
            mov ecx, esp
            mov ecx, [ecx + INPUT_ARG_OFFSET]
            mov edx, esp
            shr edx, STACK_KEY_SHIFT             // dx now holds stack_key
            lock cmpxchg [ecx + HEADER_OFFSET + STACK_KEY_OFFSET], dx
            jnz recursion_test
            ret 4  

recursion_test:
            cmp ax, dx
            jne another_thread_has_the_lock
            // bugbug somehow verify that inc[...] ALWAYS increments a byte
            inc [ecx + HEADER_OFFSET + RECURSION_OFFSET]  // increment the recursion_count
            cmp [ecx + HEADER_OFFSET + RECURSION_OFFSET], 0
            je recursion_overflow
            ret 4  

recursion_overflow:
            push [ecx + INPUT_ARG_OFFSET]
            call mon_enter_recursion_overflowed
            pop eax  //throw away the input arg 
            ret 4

another_thread_has_the_lock:
            
            push SPIN_LOOP_COUNT //todo --> get cpuid, use it to calculate number of spins

top_of_spin_loop:
            dec [esp]       // decrement the spin count
            cmp [esp], 0
            je spin_did_not_work

            nop
            nop     // pause to let other threads access the bus
            cmp [ecx + HEADER_OFFSET + STACK_KEY_OFFSET], 0 // just a read, no bus lock
            jne top_of_spin_loop

            // the header is zero, try to grab it
            xor eax, eax
            lock cmpxchg [ecx + HEADER_OFFSET + STACK_KEY_OFFSET], dx
            jnz top_of_spin_loop

            pop eax // throw away the spin count
            ret 4
            
spin_did_not_work:  
            pop eax // throw away the spin count

            // thus do a "setup_java_to_native_frame", then call into C code that may block

            call get_addr_of_orp_last_java_frame // addr of ljf now in eax
            push ebp 
            push ebx
            push esi
            push edi
            push 0      // used for JNI, see "struct J2N_Saved_State"
            push eax    // ljf from get_address_of_orp_last_java_frame above
            push [eax]
            mov [eax], esp

            push [esp + INPUT_ARG_OFFSET + SIZEOF_J2N_Saved_State]  // repush the object ref to be locked
            call block_on_mon_enter
            add esp, 4 // throw away the input arg

            // now do a "pop_java_to_native_frame"

            pop esi
            pop ebx
            mov [ebx], esi
            add esp, 4
            pop edi
            pop esi
            pop ebx
            pop ebp     // done with popping the java to native frame
            
            ret 4   // throw away the input arg
    }
}   

__declspec(naked) __stdcall STO_and_UP_orp_monitor_enter(Java_java_lang_Object *p_obj)
{
    assert(0);  // remove when hit, not debugged yet
   __asm {
            // same as SMP_orp_monitor_enter except no "lock" prefix, no spin
            // but we still need to handle contention -- this thread may block waiting for
            // a sleeping thread to wake up and release a desired lock
            // use this for both SMP STO and also UP STO cases

            // this works for SMP STO case even if the single thread migrates to different
            // CPUs -- a (no lock) cmpxchg is still atomic even if the thread migrates

            xor eax, eax
            mov ecx, esp
            mov ecx, [ecx + INPUT_ARG_OFFSET]
            mov edx, esp
            shr edx, STACK_KEY_SHIFT             // dx now holds stack_key
            cmpxchg [ecx + HEADER_OFFSET + STACK_KEY_OFFSET], dx  // NOTE: no "lock cmpxchg"
            jnz recursion_test
            ret 4  

recursion_test:
            cmp ax, dx
            jne another_thread_has_the_lock
            inc [ecx + HEADER_OFFSET + RECURSION_OFFSET]  // increment the recursion_count
            cmp [ecx + HEADER_OFFSET + RECURSION_OFFSET], 0
            je recursion_overflow
            ret 4  

recursion_overflow:
            push [ecx + INPUT_ARG_OFFSET]
            call mon_enter_recursion_overflowed
            pop eax  //throw away the input arg 
            ret 4

another_thread_has_the_lock:
            
            // don't bother to spin, this only works on SMP when multiple threads are 
            // contending for the same object lock.  Also, spinning is a waste on UP
            // under all circumstances

            // thus do a "setup_java_to_native_frame", then call into C code that may block

            call get_addr_of_orp_last_java_frame // addr of ljf now in eax
            push ebp 
            push ebx
            push esi
            push edi
            push 0      // used for JNI, see "struct J2N_Saved_State"
            push eax    // ljf from get_address_of_orp_last_java_frame above
            push [eax]
            mov [eax], esp

            push [esp + INPUT_ARG_OFFSET + SIZEOF_J2N_Saved_State]  // repush the object ref to be locked
            call block_on_mon_enter
            add esp, 4 // throw away the input arg

            // now do a "pop_java_to_native_frame"

            pop esi
            pop ebx
            mov [ebx], esi
            add esp, 4
            pop edi
            pop esi
            pop ebx
            pop ebp     // done with popping the java to native frame
            
            ret 4   // throw away the input arg
    }
}

__declspec(naked) uint16 __stdcall orp_monitor_exit(Java_java_lang_Object *p_obj)
{
   __asm {
            mov ecx, [esp + INPUT_ARG_OFFSET]
            mov edx, esp
            shr edx, STACK_KEY_SHIFT
            mov ax, [ecx + HEADER_OFFSET + STACK_KEY_OFFSET]
            cmp ax, dx
            jne IllegalMonitorStateException
            mov ax, [ecx + HEADER_OFFSET + HASH_CONTENTION_AND_RECURSION_OFFSET]  // get recursion and contention bits
            cmp ah, 0
            jne decrement_recursion

            mov edx, 0
            mov [ecx + HEADER_OFFSET + STACK_KEY_OFFSET], dx  // release the lock, NOTE: no "lock cmpxchg" is needed here
            
            // al already loaded in mov ax,.... above
            and al, CONTENTION_MASK
            cmp al, 0
            jne contended_lock

            ret 4

contended_lock:
            push [esp + INPUT_ARG_OFFSET] // repush the object ref to be unlocked
            call find_an_interested_thread
            add esp, 4

            ret 4
            
decrement_recursion:
            sub ah, 1
            mov [ecx + HEADER_OFFSET + RECURSION_OFFSET], ah
            ret 4   // still holding the lock

IllegalMonitorStateException:
            push string_of_IllegalMonitorStateException
            call throw_java_exception_wrapper

            ret 4
   }
}


void set_hash_bits(Java_java_lang_Object *p_obj)
{
    unsigned int hb = (unsigned int)p_obj;

    // lowest 3 bits are not random enough so get rid of them
    hb = hb >> 3;
    hb &= HASH_MASK;
    if (hb == 0)
        hb = 23;  // NO hash = zero allowed, thus hard map hb = 0 to a fixed prime number

    __asm {
        mov eax, 0
        mov ecx, p_obj
        mov edx, hb

        // don't care if the cmpxchg fails -- just means someone else already set the hash
        lock cmpxchg [ecx + HEADER_OFFSET + HASH_CONTENTION_OFFSET], dl
    }
}

*/
#endif //#ifndef OBJECT_LOCK_V2 else
