Commit 3a183e33 authored by Peter Maydell

Merge remote-tracking branch 'remotes/rth/tags/pull-tcg-20190128' into staging



Backend vector enhancements
Dynamic TLB sizing

# gpg: Signature made Mon 28 Jan 2019 15:57:19 GMT
# gpg:                using RSA key 64DF38E8AF7E215F
# gpg: Good signature from "Richard Henderson <richard.henderson@linaro.org>" [full]
# Primary key fingerprint: 7A48 1E78 868B 4DB6 A85A  05C0 64DF 38E8 AF7E 215F

* remotes/rth/tags/pull-tcg-20190128: (23 commits)
  cputlb: Remove static tlb sizing
  tcg/tci: enable dynamic TLB sizing
  tcg/mips: enable dynamic TLB sizing
  tcg/mips: Fix tcg_out_qemu_ld_slow_path
  tcg/arm: enable dynamic TLB sizing
  tcg/riscv: enable dynamic TLB sizing
  tcg/s390: enable dynamic TLB sizing
  tcg/sparc: enable dynamic TLB sizing
  tcg/ppc: enable dynamic TLB sizing
  tcg/aarch64: enable dynamic TLB sizing
  tcg/i386: enable dynamic TLB sizing
  tcg: introduce dynamic TLB sizing
  cputlb: do not evict empty entries to the vtlb
  tcg/aarch64: Implement vector minmax arithmetic
  tcg/aarch64: Implement vector saturating arithmetic
  tcg/i386: Implement vector minmax arithmetic
  tcg/i386: Implement vector saturating arithmetic
  tcg/i386: Split subroutines out of tcg_expand_vec_op
  tcg: Add opcodes for vector minmax arithmetic
  tcg: Add opcodes for vector saturated arithmetic
  ...

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
parents 5f39a91d e77c89fb
accel/tcg/cputlb.c  +186 −6
@@ -74,6 +74,166 @@ QEMU_BUILD_BUG_ON(sizeof(target_ulong) > sizeof(run_on_cpu_data));
QEMU_BUILD_BUG_ON(NB_MMU_MODES > 16);
#define ALL_MMUIDX_BITS ((1 << NB_MMU_MODES) - 1)

static inline size_t sizeof_tlb(CPUArchState *env, uintptr_t mmu_idx)
{
    return env->tlb_mask[mmu_idx] + (1 << CPU_TLB_ENTRY_BITS);
}
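
The one-liner works because tlb_mask[i] stores (n_entries - 1) << CPU_TLB_ENTRY_BITS, so adding back the size of a single entry recovers the table size in bytes. A quick numeric check (editor's sketch; the concrete values are illustrative only):

/*
 * Illustration: 256 entries, CPU_TLB_ENTRY_BITS == 5 (32-byte entries):
 *   tlb_mask   = (256 - 1) << 5    = 0x1fe0
 *   sizeof_tlb = 0x1fe0 + (1 << 5) = 0x2000 = 256 * 32 bytes
 */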

static void tlb_window_reset(CPUTLBWindow *window, int64_t ns,
                             size_t max_entries)
{
    window->begin_ns = ns;
    window->max_entries = max_entries;
}

static void tlb_dyn_init(CPUArchState *env)
{
    int i;

    for (i = 0; i < NB_MMU_MODES; i++) {
        CPUTLBDesc *desc = &env->tlb_d[i];
        size_t n_entries = 1 << CPU_TLB_DYN_DEFAULT_BITS;

        tlb_window_reset(&desc->window, get_clock_realtime(), 0);
        desc->n_used_entries = 0;
        env->tlb_mask[i] = (n_entries - 1) << CPU_TLB_ENTRY_BITS;
        env->tlb_table[i] = g_new(CPUTLBEntry, n_entries);
        env->iotlb[i] = g_new(CPUIOTLBEntry, n_entries);
    }
}

/**
 * tlb_mmu_resize_locked() - perform TLB resize bookkeeping; resize if necessary
 * @env: CPU that owns the TLB
 * @mmu_idx: MMU index of the TLB
 *
 * Called with tlb_lock held.
 *
 * We have two main constraints when resizing a TLB: (1) we only resize it
 * on a TLB flush (otherwise we'd have to take a perf hit by either rehashing
 * the array or unnecessarily flushing it), which means we do not control how
 * frequently the resizing can occur; (2) we don't have access to the guest's
 * future scheduling decisions, and therefore have to decide the magnitude of
 * the resize based on past observations.
 *
 * In general, a memory-hungry process can benefit greatly from an appropriately
 * sized TLB, since a guest TLB miss is very expensive. This doesn't mean that
 * we just have to make the TLB as large as possible; while an oversized TLB
 * results in minimal TLB miss rates, it also takes longer to be flushed
 * (flushes can be _very_ frequent), and the reduced locality can also hurt
 * performance.
 *
 * To achieve near-optimal performance for all kinds of workloads, we:
 *
 * 1. Aggressively increase the size of the TLB when the use rate of the
 * TLB being flushed is high, since it is likely that in the near future this
 * memory-hungry process will execute again, and its memory hungriness will
 * probably be similar.
 *
 * 2. Slowly reduce the size of the TLB as the use rate declines over a
 * reasonably large time window. The rationale is that if in such a time window
 * we have not observed a high TLB use rate, it is likely that we won't observe
 * it in the near future. In that case, once a time window expires we downsize
 * the TLB to match the maximum use rate observed in the window.
 *
 * 3. Try to keep the maximum use rate in a time window in the 30-70% range,
 * since in that range performance is likely near-optimal. Recall that the TLB
 * is direct mapped, so we want the use rate to be low (or at least not too
 * high), since otherwise we are likely to have a significant amount of
 * conflict misses.
 */
static void tlb_mmu_resize_locked(CPUArchState *env, int mmu_idx)
{
    CPUTLBDesc *desc = &env->tlb_d[mmu_idx];
    size_t old_size = tlb_n_entries(env, mmu_idx);
    size_t rate;
    size_t new_size = old_size;
    int64_t now = get_clock_realtime();
    int64_t window_len_ms = 100;
    int64_t window_len_ns = window_len_ms * 1000 * 1000;
    bool window_expired = now > desc->window.begin_ns + window_len_ns;

    if (desc->n_used_entries > desc->window.max_entries) {
        desc->window.max_entries = desc->n_used_entries;
    }
    rate = desc->window.max_entries * 100 / old_size;

    if (rate > 70) {
        new_size = MIN(old_size << 1, 1 << CPU_TLB_DYN_MAX_BITS);
    } else if (rate < 30 && window_expired) {
        size_t ceil = pow2ceil(desc->window.max_entries);
        size_t expected_rate = desc->window.max_entries * 100 / ceil;

        /*
         * Avoid undersizing when the max number of entries seen is just below
         * a pow2. For instance, if max_entries == 1025, the expected use rate
         * would be 1025/2048==50%. However, if max_entries == 1023, we'd get
         * 1023/1024==99.9% use rate, so we'd likely end up doubling the size
         * later. Thus, make sure that the expected use rate remains below 70%
         * (and since we double the size, the lowest rate we would then expect
         * is 35%, which is still within the 30-70% range where we consider
         * the size appropriate).
         */
        if (expected_rate > 70) {
            ceil *= 2;
        }
        new_size = MAX(ceil, 1 << CPU_TLB_DYN_MIN_BITS);
    }

    if (new_size == old_size) {
        if (window_expired) {
            tlb_window_reset(&desc->window, now, desc->n_used_entries);
        }
        return;
    }

    g_free(env->tlb_table[mmu_idx]);
    g_free(env->iotlb[mmu_idx]);

    tlb_window_reset(&desc->window, now, 0);
    /* desc->n_used_entries is cleared by the caller */
    env->tlb_mask[mmu_idx] = (new_size - 1) << CPU_TLB_ENTRY_BITS;
    env->tlb_table[mmu_idx] = g_try_new(CPUTLBEntry, new_size);
    env->iotlb[mmu_idx] = g_try_new(CPUIOTLBEntry, new_size);
    /*
     * If the allocations fail, try smaller sizes. We just freed some
     * memory, so going back to half of new_size has a good chance of working.
     * Increased memory pressure elsewhere in the system might cause the
     * allocations to fail though, so we progressively reduce the allocation
     * size, aborting if we cannot even allocate the smallest TLB we support.
     */
    while (env->tlb_table[mmu_idx] == NULL || env->iotlb[mmu_idx] == NULL) {
        if (new_size == (1 << CPU_TLB_DYN_MIN_BITS)) {
            error_report("%s: %s", __func__, strerror(errno));
            abort();
        }
        new_size = MAX(new_size >> 1, 1 << CPU_TLB_DYN_MIN_BITS);
        env->tlb_mask[mmu_idx] = (new_size - 1) << CPU_TLB_ENTRY_BITS;

        g_free(env->tlb_table[mmu_idx]);
        g_free(env->iotlb[mmu_idx]);
        env->tlb_table[mmu_idx] = g_try_new(CPUTLBEntry, new_size);
        env->iotlb[mmu_idx] = g_try_new(CPUIOTLBEntry, new_size);
    }
}
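
To make the sizing policy above concrete, here is a minimal stand-alone model of the decision rule (editor's illustration, not part of this patch; pick_new_size is a made-up name, and the CPU_TLB_DYN_MIN/MAX_BITS clamps and the window bookkeeping are omitted):

#include <stdio.h>

/* Toy model of tlb_mmu_resize_locked's sizing decision (sketch only). */
static size_t pick_new_size(size_t old_size, size_t max_used, int window_expired)
{
    size_t rate = max_used * 100 / old_size;

    if (rate > 70) {
        return old_size * 2;                /* grow aggressively */
    }
    if (rate < 30 && window_expired) {
        size_t ceil = 1;

        while (ceil < max_used) {           /* pow2ceil(max_used) */
            ceil *= 2;
        }
        if (max_used * 100 / ceil > 70) {   /* keep the expected rate below 70% */
            ceil *= 2;
        }
        return ceil;
    }
    return old_size;                        /* stay within the 30-70% band */
}

int main(void)
{
    printf("%zu\n", pick_new_size(1024, 800, 0)); /* 78% used -> 2048 */
    printf("%zu\n", pick_new_size(1024, 200, 1)); /* 19% used -> 512 (200/256 would be 78%) */
    return 0;
}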

static inline void tlb_table_flush_by_mmuidx(CPUArchState *env, int mmu_idx)
{
    tlb_mmu_resize_locked(env, mmu_idx);
    memset(env->tlb_table[mmu_idx], -1, sizeof_tlb(env, mmu_idx));
    env->tlb_d[mmu_idx].n_used_entries = 0;
}

static inline void tlb_n_used_entries_inc(CPUArchState *env, uintptr_t mmu_idx)
{
    env->tlb_d[mmu_idx].n_used_entries++;
}

static inline void tlb_n_used_entries_dec(CPUArchState *env, uintptr_t mmu_idx)
{
    env->tlb_d[mmu_idx].n_used_entries--;
}

void tlb_init(CPUState *cpu)
{
    CPUArchState *env = cpu->env_ptr;
@@ -82,6 +242,8 @@ void tlb_init(CPUState *cpu)

    /* Ensure that cpu_reset performs a full flush.  */
    env->tlb_c.dirty = ALL_MMUIDX_BITS;

    tlb_dyn_init(env);
}

/* flush_all_helper: run fn across all cpus
@@ -122,7 +284,7 @@ void tlb_flush_counts(size_t *pfull, size_t *ppart, size_t *pelide)

static void tlb_flush_one_mmuidx_locked(CPUArchState *env, int mmu_idx)
{
-    memset(env->tlb_table[mmu_idx], -1, sizeof(env->tlb_table[0]));
+    tlb_table_flush_by_mmuidx(env, mmu_idx);
    memset(env->tlb_v_table[mmu_idx], -1, sizeof(env->tlb_v_table[0]));
    env->tlb_d[mmu_idx].large_page_addr = -1;
    env->tlb_d[mmu_idx].large_page_mask = -1;
@@ -224,13 +386,24 @@ static inline bool tlb_hit_page_anyprot(CPUTLBEntry *tlb_entry,
           tlb_hit_page(tlb_entry->addr_code, page);
}

/**
 * tlb_entry_is_empty - return true if the entry is not in use
 * @te: pointer to CPUTLBEntry
 */
static inline bool tlb_entry_is_empty(const CPUTLBEntry *te)
{
    return te->addr_read == -1 && te->addr_write == -1 && te->addr_code == -1;
}

/* Called with tlb_c.lock held */
-static inline void tlb_flush_entry_locked(CPUTLBEntry *tlb_entry,
+static inline bool tlb_flush_entry_locked(CPUTLBEntry *tlb_entry,
                                          target_ulong page)
{
    if (tlb_hit_page_anyprot(tlb_entry, page)) {
        memset(tlb_entry, -1, sizeof(*tlb_entry));
        return true;
    }
    return false;
}

/* Called with tlb_c.lock held */
@@ -241,7 +414,9 @@ static inline void tlb_flush_vtlb_page_locked(CPUArchState *env, int mmu_idx,

    assert_cpu_is_self(ENV_GET_CPU(env));
    for (k = 0; k < CPU_VTLB_SIZE; k++) {
-        tlb_flush_entry_locked(&env->tlb_v_table[mmu_idx][k], page);
+        if (tlb_flush_entry_locked(&env->tlb_v_table[mmu_idx][k], page)) {
+            tlb_n_used_entries_dec(env, mmu_idx);
+        }
    }
}

@@ -258,7 +433,9 @@ static void tlb_flush_page_locked(CPUArchState *env, int midx,
                  midx, lp_addr, lp_mask);
        tlb_flush_one_mmuidx_locked(env, midx);
    } else {
-        tlb_flush_entry_locked(tlb_entry(env, midx, page), page);
+        if (tlb_flush_entry_locked(tlb_entry(env, midx, page), page)) {
+            tlb_n_used_entries_dec(env, midx);
+        }
        tlb_flush_vtlb_page_locked(env, midx, page);
    }
}
@@ -435,8 +612,9 @@ void tlb_reset_dirty(CPUState *cpu, ram_addr_t start1, ram_addr_t length)
    qemu_spin_lock(&env->tlb_c.lock);
    for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
        unsigned int i;
        unsigned int n = tlb_n_entries(env, mmu_idx);

-        for (i = 0; i < CPU_TLB_SIZE; i++) {
+        for (i = 0; i < n; i++) {
            tlb_reset_dirty_range_locked(&env->tlb_table[mmu_idx][i], start1,
                                         length);
        }
@@ -591,13 +769,14 @@ void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
     * Only evict the old entry to the victim tlb if it's for a
     * different page; otherwise just overwrite the stale data.
     */
-    if (!tlb_hit_page_anyprot(te, vaddr_page)) {
+    if (!tlb_hit_page_anyprot(te, vaddr_page) && !tlb_entry_is_empty(te)) {
        unsigned vidx = env->tlb_d[mmu_idx].vindex++ % CPU_VTLB_SIZE;
        CPUTLBEntry *tv = &env->tlb_v_table[mmu_idx][vidx];

        /* Evict the old entry into the victim tlb.  */
        copy_tlb_helper_locked(tv, te);
        env->iotlb_v[mmu_idx][vidx] = env->iotlb[mmu_idx][index];
        tlb_n_used_entries_dec(env, mmu_idx);
    }

    /* refill the tlb */
@@ -649,6 +828,7 @@ void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
    }

    copy_tlb_helper_locked(te, &tn);
    tlb_n_used_entries_inc(env, mmu_idx);
    qemu_spin_unlock(&env->tlb_c.lock);
}

accel/tcg/tcg-runtime-gvec.c  +257 −0
@@ -512,6 +512,39 @@ void HELPER(gvec_orc)(void *d, void *a, void *b, uint32_t desc)
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_nand)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = ~(*(vec64 *)(a + i) & *(vec64 *)(b + i));
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_nor)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = ~(*(vec64 *)(a + i) | *(vec64 *)(b + i));
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_eqv)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = ~(*(vec64 *)(a + i) ^ *(vec64 *)(b + i));
    }
    clear_high(d, oprsz, desc);
}
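
The three helpers above are just the complements of and, or and xor, applied 64 bits at a time. A stand-alone check of the eqv (xnor) identity (editor's sketch; the test patterns are made up):

#include <assert.h>
#include <stdint.h>

int main(void)
{
    uint64_t a = 0xff00ff00ff00ff00ull;
    uint64_t b = 0x0f0f0f0f0f0f0f0full;

    /* eqv is the complement of xor: ~(a ^ b) == (a & b) | (~a & ~b) */
    assert(~(a ^ b) == ((a & b) | (~a & ~b)));
    return 0;
}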

void HELPER(gvec_ands)(void *d, void *a, uint64_t b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
@@ -995,3 +1028,227 @@ void HELPER(gvec_ussub64)(void *d, void *a, void *b, uint32_t desc)
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_smin8)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(int8_t)) {
        int8_t aa = *(int8_t *)(a + i);
        int8_t bb = *(int8_t *)(b + i);
        int8_t dd = aa < bb ? aa : bb;
        *(int8_t *)(d + i) = dd;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_smin16)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(int16_t)) {
        int16_t aa = *(int16_t *)(a + i);
        int16_t bb = *(int16_t *)(b + i);
        int16_t dd = aa < bb ? aa : bb;
        *(int16_t *)(d + i) = dd;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_smin32)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(int32_t)) {
        int32_t aa = *(int32_t *)(a + i);
        int32_t bb = *(int32_t *)(b + i);
        int32_t dd = aa < bb ? aa : bb;
        *(int32_t *)(d + i) = dd;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_smin64)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(int64_t)) {
        int64_t aa = *(int64_t *)(a + i);
        int64_t bb = *(int64_t *)(b + i);
        int64_t dd = aa < bb ? aa : bb;
        *(int64_t *)(d + i) = dd;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_smax8)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(int8_t)) {
        int8_t aa = *(int8_t *)(a + i);
        int8_t bb = *(int8_t *)(b + i);
        int8_t dd = aa > bb ? aa : bb;
        *(int8_t *)(d + i) = dd;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_smax16)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(int16_t)) {
        int16_t aa = *(int16_t *)(a + i);
        int16_t bb = *(int16_t *)(b + i);
        int16_t dd = aa > bb ? aa : bb;
        *(int16_t *)(d + i) = dd;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_smax32)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(int32_t)) {
        int32_t aa = *(int32_t *)(a + i);
        int32_t bb = *(int32_t *)(b + i);
        int32_t dd = aa > bb ? aa : bb;
        *(int32_t *)(d + i) = dd;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_smax64)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(int64_t)) {
        int64_t aa = *(int64_t *)(a + i);
        int64_t bb = *(int64_t *)(b + i);
        int64_t dd = aa > bb ? aa : bb;
        *(int64_t *)(d + i) = dd;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_umin8)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
        uint8_t aa = *(uint8_t *)(a + i);
        uint8_t bb = *(uint8_t *)(b + i);
        uint8_t dd = aa < bb ? aa : bb;
        *(uint8_t *)(d + i) = dd;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_umin16)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
        uint16_t aa = *(uint16_t *)(a + i);
        uint16_t bb = *(uint16_t *)(b + i);
        uint16_t dd = aa < bb ? aa : bb;
        *(uint16_t *)(d + i) = dd;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_umin32)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
        uint32_t aa = *(uint32_t *)(a + i);
        uint32_t bb = *(uint32_t *)(b + i);
        uint32_t dd = aa < bb ? aa : bb;
        *(uint32_t *)(d + i) = dd;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_umin64)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
        uint64_t aa = *(uint64_t *)(a + i);
        uint64_t bb = *(uint64_t *)(b + i);
        uint64_t dd = aa < bb ? aa : bb;
        *(uint64_t *)(d + i) = dd;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_umax8)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
        uint8_t aa = *(uint8_t *)(a + i);
        uint8_t bb = *(uint8_t *)(b + i);
        uint8_t dd = aa > bb ? aa : bb;
        *(uint8_t *)(d + i) = dd;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_umax16)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
        uint16_t aa = *(uint16_t *)(a + i);
        uint16_t bb = *(uint16_t *)(b + i);
        uint16_t dd = aa > bb ? aa : bb;
        *(uint16_t *)(d + i) = dd;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_umax32)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
        uint32_t aa = *(uint32_t *)(a + i);
        uint32_t bb = *(uint32_t *)(b + i);
        uint32_t dd = aa > bb ? aa : bb;
        *(uint32_t *)(d + i) = dd;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_umax64)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
        uint64_t aa = *(uint64_t *)(a + i);
        uint64_t bb = *(uint64_t *)(b + i);
        uint64_t dd = aa > bb ? aa : bb;
        *(uint64_t *)(d + i) = dd;
    }
    clear_high(d, oprsz, desc);
}
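
All sixteen min/max helpers follow the same shape: walk the operation size in element-sized steps and select per element. A stand-alone sketch of the gvec_smin8 loop (editor's illustration; the 16-byte buffers and their contents are made up, and no QEMU headers are needed):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* Two 16-byte operands, standing in for one 128-bit vector. */
    int8_t a[16] = { -5, 3,  100, -128,   0,   7,  7, -1,
                      2, 2,  -20,   20,  90, -90,  1, -1 };
    int8_t b[16] = {  4, 3, -100,  127,   1,  -7,  8, -2,
                      3, 1,   20,  -20, -90,  90, -1,  1 };
    int8_t d[16];

    for (size_t i = 0; i < sizeof(d); i++) {
        d[i] = a[i] < b[i] ? a[i] : b[i];   /* signed element-wise min */
    }
    for (size_t i = 0; i < sizeof(d); i++) {
        printf("%d ", d[i]);
    }
    printf("\n");
    return 0;
}
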
accel/tcg/tcg-runtime.h  +23 −0
@@ -200,6 +200,26 @@ DEF_HELPER_FLAGS_4(gvec_ussub16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_ussub32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_ussub64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)

DEF_HELPER_FLAGS_4(gvec_smin8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_smin16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_smin32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_smin64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)

DEF_HELPER_FLAGS_4(gvec_smax8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_smax16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_smax32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_smax64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)

DEF_HELPER_FLAGS_4(gvec_umin8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_umin16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_umin32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_umin64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)

DEF_HELPER_FLAGS_4(gvec_umax8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_umax16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_umax32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_umax64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)

DEF_HELPER_FLAGS_3(gvec_neg8, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
DEF_HELPER_FLAGS_3(gvec_neg16, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
DEF_HELPER_FLAGS_3(gvec_neg32, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
@@ -211,6 +231,9 @@ DEF_HELPER_FLAGS_4(gvec_or, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_xor, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_andc, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_orc, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_nand, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_nor, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_eqv, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)

DEF_HELPER_FLAGS_4(gvec_ands, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
DEF_HELPER_FLAGS_4(gvec_xors, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
include/exec/cpu-defs.h  +39 −32
@@ -67,37 +67,23 @@ typedef uint64_t target_ulong;
#define CPU_TLB_ENTRY_BITS 5
#endif

-/* TCG_TARGET_TLB_DISPLACEMENT_BITS is used in CPU_TLB_BITS to ensure that
- * the TLB is not unnecessarily small, but still small enough for the
- * TLB lookup instruction sequence used by the TCG target.
- *
- * TCG will have to generate an operand as large as the distance between
- * env and the tlb_table[NB_MMU_MODES - 1][0].addend.  For simplicity,
- * the TCG targets just round everything up to the next power of two, and
- * count bits.  This works because: 1) the size of each TLB is a largish
- * power of two, 2) the limit of the displacement is really close to a
- * power of two, 3) the offset of tlb_table[0][0] inside env is smaller
- * than the size of a TLB.
- *
- * For example, the maximum displacement is 0xFFF0 on PPC and MIPS, but TCG
- * just says "the displacement is 16 bits".  TCG_TARGET_TLB_DISPLACEMENT_BITS
- * then ensures that tlb_table is at least 0x8000 bytes large ("not
- * unnecessarily small": 2^15).  The operand then will come up smaller than
- * 0xFFF0 without any particular care, because the TLB for a single MMU mode
- * is larger than 0x10000-0xFFF0=16 bytes.  In the end, the maximum value of
- * the operand could be something like 0xC000 (the offset of the last TLB
- * table) plus 0x18 (the offset of the addend field in each TLB entry) plus
- * the offset of tlb_table inside env (which is non-trivial but not huge).
- */
-#define CPU_TLB_BITS                                             \
-    MIN(8,                                                       \
-        TCG_TARGET_TLB_DISPLACEMENT_BITS - CPU_TLB_ENTRY_BITS -  \
-        (NB_MMU_MODES <= 1 ? 0 :                                 \
-         NB_MMU_MODES <= 2 ? 1 :                                 \
-         NB_MMU_MODES <= 4 ? 2 :                                 \
-         NB_MMU_MODES <= 8 ? 3 : 4))
+#define CPU_TLB_DYN_MIN_BITS 6
+#define CPU_TLB_DYN_DEFAULT_BITS 8

-#define CPU_TLB_SIZE (1 << CPU_TLB_BITS)
+# if HOST_LONG_BITS == 32
+/* Make sure we do not require a double-word shift for the TLB load */
+#  define CPU_TLB_DYN_MAX_BITS (32 - TARGET_PAGE_BITS)
+# else /* HOST_LONG_BITS == 64 */
+/*
+ * Assuming TARGET_PAGE_BITS==12, with 2**22 entries we can cover 2**(22+12) ==
+ * 2**34 == 16G of address space. This is roughly what one would expect a
+ * TLB to cover in a modern (as of 2018) x86_64 CPU. For instance, Intel
+ * Skylake's Level-2 STLB has 16 1G entries.
+ * Also, make sure we do not size the TLB past the guest's address space.
+ */
+#  define CPU_TLB_DYN_MAX_BITS                                  \
+    MIN(22, TARGET_VIRT_ADDR_SPACE_BITS - TARGET_PAGE_BITS)
+# endif

typedef struct CPUTLBEntry {
    /* bit TARGET_LONG_BITS to TARGET_PAGE_BITS : virtual address
@@ -141,6 +127,18 @@ typedef struct CPUIOTLBEntry {
    MemTxAttrs attrs;
} CPUIOTLBEntry;

/**
 * struct CPUTLBWindow
 * @begin_ns: host time (in ns) at the beginning of the time window
 * @max_entries: maximum number of entries observed in the window
 *
 * See also: tlb_mmu_resize_locked()
 */
typedef struct CPUTLBWindow {
    int64_t begin_ns;
    size_t max_entries;
} CPUTLBWindow;

typedef struct CPUTLBDesc {
    /*
     * Describe a region covering all of the large pages allocated
@@ -152,6 +150,8 @@ typedef struct CPUTLBDesc {
    target_ulong large_page_mask;
    /* The next index to use in the tlb victim table.  */
    size_t vindex;
    CPUTLBWindow window;
    size_t n_used_entries;
} CPUTLBDesc;

/*
@@ -176,6 +176,13 @@ typedef struct CPUTLBCommon {
    size_t elide_flush_count;
} CPUTLBCommon;

# define CPU_TLB                                                        \
    /* tlb_mask[i] contains (n_entries - 1) << CPU_TLB_ENTRY_BITS */    \
    uintptr_t tlb_mask[NB_MMU_MODES];                                   \
    CPUTLBEntry *tlb_table[NB_MMU_MODES];
# define CPU_IOTLB                              \
    CPUIOTLBEntry *iotlb[NB_MMU_MODES];

/*
 * The meaning of each of the MMU modes is defined in the target code.
 * Note that NB_MMU_MODES is not yet defined; we can only reference it
@@ -184,9 +191,9 @@ typedef struct CPUTLBCommon {
#define CPU_COMMON_TLB \
    CPUTLBCommon tlb_c;                                                 \
    CPUTLBDesc tlb_d[NB_MMU_MODES];                                     \
-    CPUTLBEntry tlb_table[NB_MMU_MODES][CPU_TLB_SIZE];                  \
+    CPU_TLB                                                             \
    CPUTLBEntry tlb_v_table[NB_MMU_MODES][CPU_VTLB_SIZE];               \
-    CPUIOTLBEntry iotlb[NB_MMU_MODES][CPU_TLB_SIZE];                    \
+    CPU_IOTLB                                                           \
    CPUIOTLBEntry iotlb_v[NB_MMU_MODES][CPU_VTLB_SIZE];

#else
include/exec/cpu_ldst.h  +8 −1
@@ -139,7 +139,14 @@ static inline target_ulong tlb_addr_write(const CPUTLBEntry *entry)
static inline uintptr_t tlb_index(CPUArchState *env, uintptr_t mmu_idx,
                                  target_ulong addr)
{
-    return (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
+    uintptr_t size_mask = env->tlb_mask[mmu_idx] >> CPU_TLB_ENTRY_BITS;
+
+    return (addr >> TARGET_PAGE_BITS) & size_mask;
}

static inline size_t tlb_n_entries(CPUArchState *env, uintptr_t mmu_idx)
{
    return (env->tlb_mask[mmu_idx] >> CPU_TLB_ENTRY_BITS) + 1;
}
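
A worked example of the mask-based lookup (editor's illustration; the values assume TARGET_PAGE_BITS == 12, CPU_TLB_ENTRY_BITS == 5 and a 256-entry TLB):

/*
 * Illustration:
 *   tlb_mask[mmu_idx] = (256 - 1) << 5 = 0x1fe0
 *   size_mask         = 0x1fe0 >> 5    = 0xff
 *   tlb_index(env, mmu_idx, 0x7f001234)
 *                     = (0x7f001234 >> 12) & 0xff = 0x7f001 & 0xff = 0x01
 *   tlb_n_entries(env, mmu_idx) = 0xff + 1 = 256
 */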

/* Find the TLB entry corresponding to the mmu_idx + address pair.  */