/**************************************************************************
* *
* Copyright (C) 1989-1996 Silicon Graphics, Inc. *
* *
* These coded instructions, statements, and computer programs contain *
* unpublished proprietary information of Silicon Graphics, Inc., and *
* are protected by Federal copyright law. They may not be disclosed *
* to third parties or copied or duplicated in any form, in whole or *
* in part, without the prior written consent of Silicon Graphics, Inc. *
* *
**************************************************************************/
#ifndef __SYS_PAGE_H__
#define __SYS_PAGE_H__
#ident "$Revision: 1.39 $"
#include "sys/sema.h"
#if SN
#include <sys/SN/arch.h>
#include <sys/SN/memsupport.h>
#if defined (SN0)
#include <sys/SN/SN0/bte.h>
#endif /* SN0 */
#endif /* SN */
#include "sys/pfdat.h"
/*
* Contains page free list and related data structure definitions.
* Should be known only within the VM subsystem.
*/
#if EVEREST || SN || IP30
/* On Everest TFP systems, no need for a stale or in-transit queue since
* the coherency protocol on the bus keeps all physical addresses up to date
* and there is no danger of VCEs occurring due to 16K page size and 16K
* d-cache & i-cache sizes.
*
* On non-TFP systems we will implement a scheme to avoid VCEs on
* the kernel stack and take other VCEs. All page lists will be treated
* as clean and the flushcaches will be avoided.
*
* In an SN0 system, pages may be poisonous after being migrated.
* A poisonous page has the poisonous bit set in all its directory entries,
* causing any access to it to generate a "poisonous bus error".
*/
#define CLEAN_ASSOC 0 /* Clean, disk association */
#define CLEAN_NOASSOC 1 /* Clean, no disk association */
#define POISONOUS 2 /* Poisonous page */
#define STALE_ASSOC CLEAN_ASSOC
#define STALE_NOASSOC CLEAN_NOASSOC
#define INTRANS_ASSOC CLEAN_ASSOC
#define INTRANS_NOASSOC CLEAN_NOASSOC
#define PH_SHORT 1
#define PH_NLISTS 3
#if SN0
#define BTE_PAGE_POISON(addr, len) bte_poison_range(addr, len)
#define POISON_STATE_CLEAR(pfn) poison_state_clear((pfn))
#else
#define BTE_PAGE_POISON(addr, len)
#define POISON_STATE_CLEAR(pfn)
#endif
#if defined(NUMA_BASE)
#define PAGE_POISON(pfd) page_poison(pfd)
#define PAGE_UNPOISON(pfd) page_unpoison((pfd))
#define PHEAD_UNPOISON(node, pheadp, pfd) phead_unpoison((node), (pheadp), (pfd))
#else
#define PAGE_POISON(pfd)
#define PAGE_UNPOISON(pfd)
#define PHEAD_UNPOISON(node, pheadp, pfd)
#endif
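/*
* A minimal sketch (hypothetical caller, not taken from the original
* source) of how the hooks above compose when a migrated page is handed
* back to the allocator on an SN0/NUMA_BASE kernel - PAGE_UNPOISON()
* presumably clears the poison bit in the page's directory entries and
* POISON_STATE_CLEAR() drops the per-pfn poison bookkeeping:
*
*	PAGE_UNPOISON(pfd);
*	POISON_STATE_CLEAR(pfn);
*
* On all other configurations both macros expand to nothing.  pfd and pfn
* here stand for the pfdat and page frame number of the page being freed,
* after which the page can go back onto a CLEAN_* free list.
*/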
#else /* !EVEREST && !SN && !IP30 */
/*
* The six free page lists define the state of the page, I-cache wise
*/
#define CLEAN_ASSOC 0 /* Clean, disk association */
#define INTRANS_ASSOC 1 /* Being cleaned, disk association */
#define STALE_ASSOC 2 /* Stale, disk association */
#define CLEAN_NOASSOC 3 /* Clean, no disk association */
#define INTRANS_NOASSOC 4 /* Being cleaned, no disk association */
#define STALE_NOASSOC 5 /* Stale, no disk association */
#define PH_LONG 1
#define PH_NLISTS 6
#define POISON_STATE_CLEAR(pfn)
#define PAGE_UNPOISON(pfd)
#define PHEAD_UNPOISON(node, pheadp, pfd)
#endif /* !EVEREST && !SN && !IP30 */
#if _PAGESZ == 4096
#define NUM_PAGE_SIZES 7 /* Number of page sizes */
#elif _PAGESZ == 16384
#define NUM_PAGE_SIZES 6 /* Number of page sizes */
#else
#error "Unknown page size"
#endif
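/*
* With a 4K base page the seven sizes are presumably 4K, 16K, 64K, 256K,
* 1M, 4M and 16M; with a 16K base page the 4K size drops out, leaving six.
* The actual size table is defined elsewhere in the VM code.
*/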
#define LPG_STAT /* Large page statistics turned on for opt. kernels */
#ifdef LPG_STAT
#define NUM_STAT_WORDS 32
#endif
#define TRUE 1
#define FALSE 0
#define ALLOC_SUCCESS TRUE
#define ALLOC_FAIL FALSE
/*
* List header type for free page table -- match link pointers in pfd_t.
*/
typedef struct plist {
        struct pfdat *pf_next;
        union {
                struct pfdat *prev;
                sm_swaphandle_t swphdl;
        } p_swpun;
} plist_t;
/*
* Free page table entry
*/
typedef struct phead {
        plist_t ph_list[PH_NLISTS]; /* list headers */
        int ph_count; /* total count on all lists */
        int ph_flushcount;
        int ph_flags;
        int ph_poison_count; /* Number of poisoned pages */
} phead_t;
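/*
* For illustration, a hypothetical helper that picks the first non-empty
* list in a phead, preferring pages with no disk association so that
* pages still caching file data are used last (one possible policy; the
* real allocator's preference lives elsewhere).  isempty() is defined
* further down in this header:
*
*	static plist_t *
*	example_pick_list(phead_t *php)
*	{
*		if (!isempty(&php->ph_list[CLEAN_NOASSOC]))
*			return &php->ph_list[CLEAN_NOASSOC];
*		if (!isempty(&php->ph_list[CLEAN_ASSOC]))
*			return &php->ph_list[CLEAN_ASSOC];
*		return NULL;
*	}
*/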
#ifdef TILES_TO_LPAGES
/*
* On IP32, we partition the MIN_PGSZ_INDEX list into
* 3 separate pools:
* - 4K pages in unfragmented tiles
* - 4K pages in fragmented tiles
* - 4K pages in low (!DMAable) memory
*/
typedef struct tphead_s {
        phead_t *phead; /* phead for this tile page pool */
        phead_t *pheadend; /* end of pheads for this pool */
        phead_t *pheadrotor; /* rotor for pool */
        int count; /* total count in all pheads */
} tphead_t;
#define NTPHEADS 3
#define TPH_UNFRAG 0
#define TPH_FRAG 1
#define TPH_LOWMEM 2
extern tphead_t tphead[];
extern void tilepages_to_frag(pfd_t *);
extern void tilepages_to_unfrag(pfd_t *);
#endif /* TILES_TO_LPAGES */
/*
* Readonly structures per node.
* This structure contains information used to allocate
* and free pages. All the fields in the structure are computed once at
* boot time, so they are replicated on every node.
*/
typedef struct pg_free {
        phead_t *phead[NUM_PAGE_SIZES]; /* Heads of free page lists. */
        int hiwat[NUM_PAGE_SIZES];
        phead_t *pheadend[NUM_PAGE_SIZES]; /* points to just past end */
        pfd_t *pfd_low, /* Page with lowest address for a node */
              *pfd_high; /* Page with highest address for a node */
} pg_free_t;
/*
* This structure encapsulates several data structures needed for page
* allocation per node. It is embedded in the nodepda.
*/
typedef struct pg_data {
        struct pg_free *pg_freelst;
        /*
         * Lock for the free list on a specific node.
         * This should NOT be part of pg_freelst (locks are not replicable)
         * Access to this lock is through the nodepda of the specific node.
         */
        lock_t pg_freelst_lock;
        /* Number of free base(NBPP) pages per node */
        int node_freemem;
        /*
         * Future number of free pages per node, according to memsched.
         * This field is used to implement a very lightweight reservation
         * scheme. The memory scheduler tries to reserve some amount of
         * free memory on the node it places an MLD on, so that even if
         * the memory hasn't actually been allocated yet, the memory scheduler
         * knows that there is some potential future pressure on this
         * particular node.
         */
        int node_future_freemem;
        /*
         * Per node free memory with NO hash association
         * Used by user level system state monitors
         */
        int node_emptymem;
        /*
         * Total number of pages present in a node after bringup.
         * Used in calculating hiwater marks for large pages.
         */
        int node_total_mem;
        /* Number of free pages of a specific page size */
        int num_free_pages[NUM_PAGE_SIZES];
        /* Free page hash mask for a specific page size */
        int pheadmask[NUM_PAGE_SIZES];
        /* shift amount for pheadmask for a specific page size */
        int pheadshift[NUM_PAGE_SIZES];
        /*
         * Free page table rotor
         * for VM_UNSEQ allocations
         */
        phead_t *pheadrotor[NUM_PAGE_SIZES];
#ifdef LPG_STAT
        /*
         * Total number of large pages of a specific page size
         * in the system.
         */
        int num_total_pages[NUM_PAGE_SIZES];
        long lpage_stats[NUM_PAGE_SIZES][NUM_STAT_WORDS];
#endif
} pg_data_t;
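/*
* A rough sketch of how the mask/shift pairs above would typically be
* used to hash a pfn into one of the per-size phead buckets (the exact
* indexing is an assumption, not taken from the allocator itself;
* NODE_PG_DATA() and GET_NODE_PFL() are defined below):
*
*	pg_data_t *pgd = &NODE_PG_DATA(cnode);
*	pg_free_t *pfl = GET_NODE_PFL(cnode);
*	phead_t   *php;
*
*	php = &pfl->phead[szindex][(pfn >> pgd->pheadshift[szindex])
*				   & pgd->pheadmask[szindex]];
*
* i.e. each page size has its own array of pheads, pheadmask/pheadshift
* select a bucket within that array, and pheadrotor[] remembers where the
* last VM_UNSEQ allocation left off.  szindex here is a hypothetical
* page-size index in the range [0, NUM_PAGE_SIZES).
*/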
#ifdef _KERNEL
/*
* Page free list operation macros
*/
#define isempty(l) ((l)->pf_next == (pfd_t *)(l))
#define makeempty(l) ((l)->pf_next = (l)->pf_prev = (pfd_t *)(l))
#define append(l, p) { (p)->pf_prev = (l)->pf_prev; \
(p)->pf_prev->pf_next = (p); \
(l)->pf_prev = (p); \
(p)->pf_next = (pfd_t *)(l); \
}
#define prefix(l, p) { (p)->pf_next = (l)->pf_next; \
(p)->pf_next->pf_prev = (p); \
(l)->pf_next = (p); \
(p)->pf_prev = (pfd_t *)(l); \
}
#define combine(lf, lb) { (lf)->pf_prev->pf_next = (lb)->pf_next; \
(lb)->pf_next->pf_prev = (lf)->pf_prev; \
(lf)->pf_prev = (lb)->pf_prev; \
(lb)->pf_prev->pf_next = (pfd_t *)(lf); \
}
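/*
* These macros implement a circular doubly linked list threaded through
* the pfdat link fields (pf_prev is presumably an alias for p_swpun.prev,
* set up in sys/pfdat.h).  A minimal, hypothetical usage sketch -
* unlinking the page at the head of one list and appending it to another,
* with the free-list lock held:
*
*	plist_t *from = &php->ph_list[CLEAN_ASSOC];
*	plist_t *to   = &php->ph_list[CLEAN_NOASSOC];
*	pfd_t	*pfd;
*
*	if (!isempty(from)) {
*		pfd = from->pf_next;
*		pfd->pf_prev->pf_next = pfd->pf_next;
*		pfd->pf_next->pf_prev = pfd->pf_prev;
*		append(to, pfd);
*	}
*/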
extern lock_t memory_lock; /* spin lock for memory operations */
#define NODE_PG_DATA(cnode) (NODEPDA(cnode)->node_pg_data)
#define mem_lock() mutex_spinlock(&memory_lock)
#define mem_unlock(T) mutex_spinunlock(&memory_lock, T)
#if 0
#define PAGE_FREELIST_LOCK(node) mem_lock()
#define PAGE_FREELIST_UNLOCK(node, T) mem_unlock(T)
#ifdef DEBUG
#define PAGE_FREELIST_ISLOCKED(node) spinlock_islocked(&memory_lock)
#endif /* DEBUG */
#endif
#define PAGE_FREELIST_LOCK(node) \
mutex_spinlock(&NODE_PG_DATA(node).pg_freelst_lock)
#define PAGE_FREELIST_UNLOCK(node, T) \
mutex_spinunlock(&NODE_PG_DATA(node).pg_freelst_lock, (T))
#ifdef DEBUG
#define PAGE_FREELIST_ISLOCKED(node) \
spinlock_islocked(&NODE_PG_DATA(node).pg_freelst_lock)
#endif /* DEBUG */
/*
* Return a pointer to the pg_free_t for a specific node (passed as
* compact node id).
*/
#define GET_NODE_PFL(cnode) ((pg_free_t *)&(nodepda->node_pg_data.\
pg_freelst[(cnode)]))
#define PFD_LOW(cnode) ((GET_NODE_PFL(cnode))->pfd_low)
#define PFD_HIGH(cnode) ((GET_NODE_PFL(cnode))->pfd_high)
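/*
* PFD_LOW()/PFD_HIGH() bracket the pfdats owned by a node, so a
* hypothetical whole-node scan (illustration only, assuming the node's
* pfdats are contiguous) could be written as:
*
*	pfd_t *pfd;
*
*	for (pfd = PFD_LOW(cnode); pfd <= PFD_HIGH(cnode); pfd++)
*		(void) page_validate_pfdat(pfd);
*/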
/*
* Define the per node variables.
* For non-NUMA systems this defaults to referencing the global
* Nodepda.
*/
#define NODE_FREEMEM(cnode) \
(NODEPDA(cnode)->node_pg_data.node_freemem)
#define NODE_TOTALMEM(cnode) \
(NODEPDA(cnode)->node_pg_data.node_total_mem)
#define NODE_FREEMEM_REL(cnode) \
( NODE_TOTALMEM(cnode) ? \
((NODE_FREEMEM(cnode) * 100) / NODE_TOTALMEM(cnode)) : 0 )
#define NODE_FUTURE_FREEMEM(cnode) \
(NODEPDA(cnode)->node_pg_data.node_future_freemem)
#define NODE_FUTURE_FREEMEM_REL(cnode) \
( NODE_TOTALMEM(cnode) ? \
((NODE_FUTURE_FREEMEM(cnode) * 100) / NODE_TOTALMEM(cnode)) : 0 )
#define NODE_MIN_FUTURE_FREEMEM(cnode) \
(( NODE_FREEMEM(cnode) < NODE_FUTURE_FREEMEM(cnode) ) ? \
NODE_FREEMEM(cnode) : NODE_FUTURE_FREEMEM(cnode))
#define NODE_MIN_FUTURE_FREEMEM_REL(cnode) \
(( NODE_FREEMEM(cnode) < NODE_FUTURE_FREEMEM(cnode) ) ? \
NODE_FREEMEM_REL(cnode) : NODE_FUTURE_FREEMEM_REL(cnode))
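/*
* The *_REL() forms express free memory as an integer percentage of the
* node's total, guarding against a zero total.  For example, a node with
* 8192 base pages of which 2048 are free reports (2048 * 100) / 8192 == 25.
*/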
#define NODE_EMPTYMEM(cnode) \
(NODEPDA(cnode)->node_pg_data.node_emptymem)
/*
* Operations done on node_future_freemem only (memsched)
*/
#define ADD_NODE_FUTURE_FREEMEM_ATOMIC(cnode, val) \
(atomicAddInt(&(NODE_FUTURE_FREEMEM(cnode)), (val)))
#define SUB_NODE_FUTURE_FREEMEM_ATOMIC(cnode, val) \
(atomicAddInt(&(NODE_FUTURE_FREEMEM(cnode)), -(val)))
#define SET_NODE_FUTURE_FREEMEM_ATOMIC(cnode, val) \
(atomicSetInt(&(NODE_FUTURE_FREEMEM(cnode)), (val)))
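/*
* A rough sketch of the reservation scheme these atomics support (the
* real call sites live in memsched, not in this header): when memsched
* places an MLD expected to need n pages on a node it could reserve them
* with
*
*	SUB_NODE_FUTURE_FREEMEM_ATOMIC(cnode, n);
*
* and return them with ADD_NODE_FUTURE_FREEMEM_ATOMIC(cnode, n) once the
* MLD goes away or the pages are actually allocated.
*/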
/*
* Operations on node_freemem & node_future_freemem
*/
#define ADD_NODE_FREEMEM(cnode, val) \
(NODE_FREEMEM(cnode) += (val))
#define SUB_NODE_FREEMEM(cnode, val) \
(NODE_FREEMEM(cnode) -= (val))
#if (!defined(EVEREST) || !defined(SN0))
#define INC_GLOBAL_FREEMEM(val) (GLOBAL_FREEMEM_VAR += (val))
#define DEC_GLOBAL_FREEMEM(val) (GLOBAL_FREEMEM_VAR -= (val))
#endif
/*
* Operations on node_emptymem
*/
#define ADD_NODE_EMPTYMEM(cnode, val) \
(atomicAddInt(&(NODE_EMPTYMEM(cnode)), (val)))
#define SUB_NODE_EMPTYMEM(cnode, val) \
(atomicAddInt(&(NODE_EMPTYMEM(cnode)), -(val)))
extern void pagedequeue(pfd_t *, phead_t *);
extern int _pagefree_size(pfd_t *, size_t, uint);
extern pfd_t * _pagealloc_size_node(cnodeid_t, uint, int, size_t);
#ifdef DEBUG
extern int check_freemem_node(cnodeid_t);
extern int freelist_sanity(cnodeid_t);
extern int freelist_sanity_nolock(cnodeid_t,int);
#endif
#ifdef NUMA_BASE
pgno_t contmemall_node(cnodeid_t, int, int, int);
#else
#define contmemall_node(node, npgs, align, flags) contmemall(npgs, align, flags)
#endif
/*
* Declarations and macros to manage the vm system's relaxed global variables.
* We keep a relaxed freemem variable, updated only
* once a second, or whenever an accurate value is needed (for now,
* when vhand is deciding whether to continue or not).
*/
extern pfn_t relaxed_global_freemem;
extern pfn_t global_freemem_calculate(int);
extern pfn_t global_freemem_init(void);
#define GLOBAL_FREEMEM_UPDATE() { \
relaxed_global_freemem = \
global_freemem_calculate(1); \
}
#define GLOBAL_FREEMEM_INIT() { \
relaxed_global_freemem = \
global_freemem_init(); \
}
#define GLOBAL_FREEMEM_GET() global_freemem_calculate(1) /* update */
#define GLOBAL_FREEMEM_SNAP() global_freemem_calculate(0) /* no update */
#define GLOBAL_FREEMEM() (relaxed_global_freemem)
#define GLOBAL_FREEMEM_VAR (relaxed_global_freemem)
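/*
* A rough illustration of the intended use (not lifted from the actual
* callers): ordinary readers take the relaxed once-a-second value, while
* vhand, deciding whether to keep going, asks for an accurate recount:
*
*	pfn_t roughly = GLOBAL_FREEMEM();
*	pfn_t exactly = GLOBAL_FREEMEM_GET();
*	pfn_t peeked  = GLOBAL_FREEMEM_SNAP();
*
* GET recalculates and publishes the new value; SNAP recalculates without
* updating relaxed_global_freemem.
*/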
#ifdef NUMA_BASE
extern void memfit_init(void);
extern void memfit_master_update(pfn_t cm_freemem);
#define MEMFIT_INIT() memfit_init()
#define MEMFIT_MASTER_UPDATE(mf) memfit_master_update((mf))
#else
#define MEMFIT_INIT()
#define MEMFIT_MASTER_UPDATE(mf)
#endif
/*
* Per node low-freemem threshold used to refresh the relaxed global
* freemem, so that the next clock tick accurately checks
* whether vhand needs to be woken up or not.
*/
#ifdef NUMA_BASE
extern int numa_paging_node_freemem_low_threshold;
#define NODE_FREEMEM_LOW_THRESHOLD() numa_paging_node_freemem_low_threshold
#else /* !NUMA_BASE */
#define NODE_FREEMEM_LOW_THRESHOLD() tune.t_gpgslo
#endif /* !NUMA_BASE*/
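/*
* A hypothetical use of the threshold above, matching the comment: when a
* node's free memory dips below it, the relaxed global value is refreshed
* so that the next clock tick works from an accurate figure when deciding
* whether to wake vhand:
*
*	if (NODE_FREEMEM(cnode) < NODE_FREEMEM_LOW_THRESHOLD())
*		GLOBAL_FREEMEM_UPDATE();
*/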
/*
* Relaxed emptymem
*/
extern int relaxed_global_emptymem;
extern pfn_t global_emptymem_calculate(void);
#define GLOBAL_EMPTYMEM_GET() global_emptymem_calculate()
extern lock_t sxbrk_lock; /* sxbrk/sched synch. lock */
#define sxbrk_lock() mutex_spinlock(&sxbrk_lock)
#define sxbrk_lockspl(SPL) mutex_spinlock_spl(&sxbrk_lock, SPL)
#define sxbrk_unlock(T) mutex_spinunlock(&sxbrk_lock, T)
#ifdef NUMA_BASE
extern void page_poison(pfd_t*);
extern void page_unpoison(pfd_t*);
extern void phead_unpoison(cnodeid_t, phead_t*, pfd_t *);
#endif
extern int page_discard(paddr_t, int, int);
extern int page_isdiscarded(paddr_t);
extern int page_ispoison(paddr_t);
extern int page_error_clean(pfd_t *);
extern void page_discard_enqueue(pfd_t *);
extern int page_validate_pfdat(pfd_t *);
#endif /* _KERNEL */
#endif /* !__SYS_PAGE_H__ */