diff --git a/.gitignore b/.gitignore index de6344e15706280de410cd83372f997720d0947a..a2939fc10b225d3727d6501831fad647553f11c8 100644 --- a/.gitignore +++ b/.gitignore @@ -34,13 +34,18 @@ modules.builtin # # Top-level generic files # -tags -TAGS -vmlinux -vmlinuz -System.map -Module.markers -Module.symvers +/tags +/TAGS +/linux +/vmlinux +/vmlinuz +/System.map +/Module.markers +/Module.symvers + +# +# git files that we don't want to ignore even if they are dot-files +# !.gitignore !.mailmap diff --git a/Documentation/ABI/stable/sysfs-devices-node b/Documentation/ABI/stable/sysfs-devices-node new file mode 100644 index 0000000000000000000000000000000000000000..49b82cad70030baa89e70775e2e619dd3a98665b --- /dev/null +++ b/Documentation/ABI/stable/sysfs-devices-node @@ -0,0 +1,7 @@ +What: /sys/devices/system/node/nodeX +Date: October 2002 +Contact: Linux Memory Management list +Description: + When CONFIG_NUMA is enabled, this is a directory containing + information on node X such as what CPUs are local to the + node. diff --git a/Documentation/ABI/testing/ima_policy b/Documentation/ABI/testing/ima_policy index 6434f0df012e3cd72d8e373bc30c406b7f0b3873..6cd6daefaaedeb160a6f1ac1d616de7871b38965 100644 --- a/Documentation/ABI/testing/ima_policy +++ b/Documentation/ABI/testing/ima_policy @@ -20,7 +20,7 @@ Description: lsm: [[subj_user=] [subj_role=] [subj_type=] [obj_user=] [obj_role=] [obj_type=]] - base: func:= [BPRM_CHECK][FILE_MMAP][INODE_PERMISSION] + base: func:= [BPRM_CHECK][FILE_MMAP][FILE_CHECK] mask:= [MAY_READ] [MAY_WRITE] [MAY_APPEND] [MAY_EXEC] fsmagic:= hex value uid:= decimal value @@ -40,11 +40,11 @@ Description: measure func=BPRM_CHECK measure func=FILE_MMAP mask=MAY_EXEC - measure func=INODE_PERM mask=MAY_READ uid=0 + measure func=FILE_CHECK mask=MAY_READ uid=0 The default policy measures all executables in bprm_check, all files mmapped executable in file_mmap, and all files - open for read by root in inode_permission. + open for read by root in do_filp_open. Examples of LSM specific definitions: @@ -54,8 +54,8 @@ Description: dont_measure obj_type=var_log_t dont_measure obj_type=auditd_log_t - measure subj_user=system_u func=INODE_PERM mask=MAY_READ - measure subj_role=system_r func=INODE_PERM mask=MAY_READ + measure subj_user=system_u func=FILE_CHECK mask=MAY_READ + measure subj_role=system_r func=FILE_CHECK mask=MAY_READ Smack: - measure subj_user=_ func=INODE_PERM mask=MAY_READ + measure subj_user=_ func=FILE_CHECK mask=MAY_READ diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block index d2f90334bb93f90af2986e96d3cfd9710180eca7..4873c759d535a7549d5eecf62a7125b29d6c2dea 100644 --- a/Documentation/ABI/testing/sysfs-block +++ b/Documentation/ABI/testing/sysfs-block @@ -128,3 +128,17 @@ Description: preferred request size for workloads where sustained throughput is desired. If no optimal I/O size is reported this file contains 0. + +What: /sys/block/<disk>/queue/nomerges +Date: January 2010 +Contact: +Description: + Standard I/O elevator operations include attempts to + merge contiguous I/Os. For known random I/O loads these + attempts will always fail and result in extra cycles + being spent in the kernel. This allows one to turn off + this behavior in one of two ways: When set to 1, complex + merge checks are disabled, but the simple one-shot merges + with the previous I/O request are enabled. When set to 2, + all merge tries are disabled. The default value is 0 - + which enables all types of merge tries.
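As a quick illustration of the nomerges semantics documented above (not part of the patch itself), here is a minimal userspace sketch in C; the disk name "sda" is an assumption for the example:

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		const char *path = "/sys/block/sda/queue/nomerges";
		int fd = open(path, O_WRONLY);

		if (fd < 0) {
			perror(path);
			return 1;
		}
		/* "0", "1" or "2", with the meanings described above. */
		if (write(fd, "2\n", 2) != 2)
			perror("write");
		close(fd);
		return 0;
	}

The same effect can be had from a shell with: echo 2 > /sys/block/sda/queue/nomerges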
diff --git a/Documentation/ABI/testing/sysfs-bus-usb b/Documentation/ABI/testing/sysfs-bus-usb index a07c0f366f91827fa869ec4751d3586280aff2d4..a986e9bbba3d2972770f9e2a8196bea6eb119a0f 100644 --- a/Documentation/ABI/testing/sysfs-bus-usb +++ b/Documentation/ABI/testing/sysfs-bus-usb @@ -159,3 +159,14 @@ Description: device. This is useful to ensure auto probing won't match the driver to the device. For example: # echo "046d c315" > /sys/bus/usb/drivers/foo/remove_id + +What: /sys/bus/usb/device/.../avoid_reset +Date: December 2009 +Contact: Oliver Neukum +Description: + Writing 1 to this file tells the kernel that this + device will morph into another mode when it is reset. + Drivers will not use reset for error handling for + such devices. +Users: + usb_modeswitch diff --git a/Documentation/ABI/testing/sysfs-devices-power b/Documentation/ABI/testing/sysfs-devices-power new file mode 100644 index 0000000000000000000000000000000000000000..6123c523bfd7961c8cdc235aee2b96367062faf5 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-devices-power @@ -0,0 +1,79 @@ +What: /sys/devices/.../power/ +Date: January 2009 +Contact: Rafael J. Wysocki +Description: + The /sys/devices/.../power directory contains attributes + allowing the user space to check and modify some power + management related properties of the given device. + +What: /sys/devices/.../power/wakeup +Date: January 2009 +Contact: Rafael J. Wysocki +Description: + The /sys/devices/.../power/wakeup attribute allows the user + space to check if the device is enabled to wake up the system + from sleep states, such as the memory sleep state (suspend to + RAM) and hibernation (suspend to disk), and to enable or disable + it to do that as desired. + + Some devices support "wakeup" events, which are hardware signals + used to activate the system from a sleep state. Such devices + have one of the following two values for the sysfs power/wakeup + file: + + + "enabled\n" to issue the events; + + "disabled\n" not to do so; + + In that case the user space can change the setting represented + by the contents of this file by writing either "enabled", or + "disabled" to it. + + For the devices that are not capable of generating system wakeup + events this file contains "\n". In that case the user space + cannot modify the contents of this file and the device cannot be + enabled to wake up the system. + +What: /sys/devices/.../power/control +Date: January 2009 +Contact: Rafael J. Wysocki +Description: + The /sys/devices/.../power/control attribute allows the user + space to control the run-time power management of the device. + + All devices have one of the following two values for the + power/control file: + + + "auto\n" to allow the device to be power managed at run time; + + "on\n" to prevent the device from being power managed; + + The default for all devices is "auto", which means that they may + be subject to automatic power management, depending on their + drivers. Changing this attribute to "on" prevents the driver + from power managing the device at run time. Doing that while + the device is suspended causes it to be woken up. + +What: /sys/devices/.../power/async +Date: January 2009 +Contact: Rafael J. Wysocki +Description: + The /sys/devices/.../power/async attribute allows the user space to + enable or disable the device's suspend and resume callbacks to + be executed asynchronously (i.e. in separate threads, in parallel + with the main suspend/resume thread) during system-wide power + transitions (e.g. suspend to RAM, hibernation).
+ + All devices have one of the following two values for the + power/async file: + + + "enabled\n" to permit the asynchronous suspend/resume; + + "disabled\n" to forbid it; + + The value of this attribute may be changed by writing either + "enabled", or "disabled" to it. + + It generally is unsafe to permit the asynchronous suspend/resume + of a device unless it is certain that all of the PM dependencies + of the device are known to the PM core. However, for some + devices this attribute is set to "enabled" by bus type code or + device drivers and in that case it should be safe to leave the + default value. diff --git a/Documentation/ABI/testing/sysfs-platform-asus-laptop b/Documentation/ABI/testing/sysfs-platform-asus-laptop index a1cb660c50cfb4fcc8b685853a2b599e39d75f85..1d775390e8562dae8e4eb53d1db9959edff4b943 100644 --- a/Documentation/ABI/testing/sysfs-platform-asus-laptop +++ b/Documentation/ABI/testing/sysfs-platform-asus-laptop @@ -1,4 +1,4 @@ -What: /sys/devices/platform/asus-laptop/display +What: /sys/devices/platform/asus_laptop/display Date: January 2007 KernelVersion: 2.6.20 Contact: "Corentin Chary" @@ -13,7 +13,7 @@ Description: Ex: - 0 (0000b) means no display - 3 (0011b) CRT+LCD. -What: /sys/devices/platform/asus-laptop/gps +What: /sys/devices/platform/asus_laptop/gps Date: January 2007 KernelVersion: 2.6.20 Contact: "Corentin Chary" @@ -21,7 +21,7 @@ Description: Control the gps device. 1 means on, 0 means off. Users: Lapsus -What: /sys/devices/platform/asus-laptop/ledd +What: /sys/devices/platform/asus_laptop/ledd Date: January 2007 KernelVersion: 2.6.20 Contact: "Corentin Chary" @@ -29,11 +29,11 @@ Description: Some models like the W1N have a LED display that can be used to display several pieces of information. To control the LED display, use the following: - echo 0x0T000DDD > /sys/devices/platform/asus-laptop/ + echo 0x0T000DDD > /sys/devices/platform/asus_laptop/ where T controls the 3-letter display, and DDD the 3-digit display. The DDD table can be found in Documentation/laptops/asus-laptop.txt -What: /sys/devices/platform/asus-laptop/bluetooth +What: /sys/devices/platform/asus_laptop/bluetooth Date: January 2007 KernelVersion: 2.6.20 Contact: "Corentin Chary" @@ -42,7 +42,7 @@ Description: This may control the led, the device or both. Users: Lapsus -What: /sys/devices/platform/asus-laptop/wlan +What: /sys/devices/platform/asus_laptop/wlan Date: January 2007 KernelVersion: 2.6.20 Contact: "Corentin Chary" diff --git a/Documentation/ABI/testing/sysfs-platform-eeepc-laptop b/Documentation/ABI/testing/sysfs-platform-eeepc-laptop index 7445dfb321b5bf6019daacd2c4eb63cfb6da3012..5b026c69587ac881efbcdf3fff697722f9df2301 100644 --- a/Documentation/ABI/testing/sysfs-platform-eeepc-laptop +++ b/Documentation/ABI/testing/sysfs-platform-eeepc-laptop @@ -1,4 +1,4 @@ -What: /sys/devices/platform/eeepc-laptop/disp +What: /sys/devices/platform/eeepc/disp Date: May 2008 KernelVersion: 2.6.26 Contact: "Corentin Chary" @@ -9,21 +9,21 @@ Description: - 3 = LCD+CRT If you run X11, you should use xrandr instead. -What: /sys/devices/platform/eeepc-laptop/camera +What: /sys/devices/platform/eeepc/camera Date: May 2008 KernelVersion: 2.6.26 Contact: "Corentin Chary" Description: Control the camera. 1 means on, 0 means off. -What: /sys/devices/platform/eeepc-laptop/cardr +What: /sys/devices/platform/eeepc/cardr Date: May 2008 KernelVersion: 2.6.26 Contact: "Corentin Chary" Description: Control the card reader. 1 means on, 0 means off.
-What: /sys/devices/platform/eeepc-laptop/cpufv +What: /sys/devices/platform/eeepc/cpufv Date: Jun 2009 KernelVersion: 2.6.31 Contact: "Corentin Chary" @@ -42,7 +42,7 @@ Description: `------------ Available modes For example, 0x301 means: mode 1 selected, 3 available modes. -What: /sys/devices/platform/eeepc-laptop/available_cpufv +What: /sys/devices/platform/eeepc/available_cpufv Date: Jun 2009 KernelVersion: 2.6.31 Contact: "Corentin Chary" diff --git a/Documentation/ABI/testing/sysfs-power b/Documentation/ABI/testing/sysfs-power index dcff4d0623add0e7708c642d0cfe210b0c1f48ab..d6a801f45b484e4df433ff09e85a6f3bc07707d5 100644 --- a/Documentation/ABI/testing/sysfs-power +++ b/Documentation/ABI/testing/sysfs-power @@ -101,3 +101,16 @@ Description: CAUTION: Using it will cause your machine's real-time (CMOS) clock to be set to a random invalid time after a resume. + +What: /sys/power/pm_async +Date: January 2009 +Contact: Rafael J. Wysocki +Description: + The /sys/power/pm_async file controls the switch allowing the + user space to enable or disable asynchronous suspend and resume + of devices. If enabled, this feature will cause some device + drivers' suspend and resume callbacks to be executed in parallel + with each other and with the main suspend thread. It is enabled + if this file contains "1", which is the default. It may be + disabled by writing "0" to this file, in which case all devices + will be suspended and resumed synchronously. diff --git a/Documentation/DMA-API.txt b/Documentation/DMA-API.txt index 5aceb88b3f8b622c8c23ed288f37f42de705c904..05e2ae236865edf6b56e2357c6089432793be077 100644 --- a/Documentation/DMA-API.txt +++ b/Documentation/DMA-API.txt @@ -4,20 +4,18 @@ James E.J. Bottomley This document describes the DMA API. For a more gentle introduction -phrased in terms of the pci_ equivalents (and actual examples) see -Documentation/PCI/PCI-DMA-mapping.txt. +of the API (and actual examples) see +Documentation/DMA-API-HOWTO.txt. -This API is split into two pieces. Part I describes the API and the -corresponding pci_ API. Part II describes the extensions to the API -for supporting non-consistent memory machines. Unless you know that -your driver absolutely has to support non-consistent platforms (this -is usually only legacy platforms) you should only use the API -described in part I. +This API is split into two pieces. Part I describes the API. Part II +describes the extensions to the API for supporting non-consistent +memory machines. Unless you know that your driver absolutely has to +support non-consistent platforms (this is usually only legacy +platforms) you should only use the API described in part I. -Part I - pci_ and dma_ Equivalent API +Part I - dma_ API ------------------------------------- -To get the pci_ API, you must #include <linux/pci.h> To get the dma_ API, you must #include <linux/dma-mapping.h> @@ -27,9 +25,6 @@ Part Ia - Using large dma-coherent buffers void * dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flag) -void * -pci_alloc_consistent(struct pci_dev *dev, size_t size, - dma_addr_t *dma_handle) Consistent memory is memory for which a write by either the device or the processor can immediately be read by the processor or device @@ -53,15 +48,11 @@ The simplest way to do that is to use the dma_pool calls (see below). The flag parameter (dma_alloc_coherent only) allows the caller to specify the GFP_ flags (see kmalloc) for the allocation (the implementation may choose to ignore flags that affect the location of
For pci_alloc_consistent, you -must assume GFP_ATOMIC behaviour. +the returned memory, like GFP_DMA). void dma_free_coherent(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_handle) -void -pci_free_consistent(struct pci_dev *dev, size_t size, void *cpu_addr, - dma_addr_t dma_handle) Free the region of consistent memory you previously allocated. dev, size and dma_handle must all be the same as those passed into the @@ -89,10 +80,6 @@ for alignment, like queue heads needing to be aligned on N-byte boundaries. dma_pool_create(const char *name, struct device *dev, size_t size, size_t align, size_t alloc); - struct pci_pool * - pci_pool_create(const char *name, struct pci_device *dev, - size_t size, size_t align, size_t alloc); - The pool create() routines initialize a pool of dma-coherent buffers for use with a given device. It must be called in a context which can sleep. @@ -108,9 +95,6 @@ from this pool must not cross 4KByte boundaries. void *dma_pool_alloc(struct dma_pool *pool, gfp_t gfp_flags, dma_addr_t *dma_handle); - void *pci_pool_alloc(struct pci_pool *pool, gfp_t gfp_flags, - dma_addr_t *dma_handle); - This allocates memory from the pool; the returned memory will meet the size and alignment requirements specified at creation time. Pass GFP_ATOMIC to prevent blocking, or if it's permitted (not in_interrupt, not holding SMP locks), @@ -122,9 +106,6 @@ pool's device. void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t addr); - void pci_pool_free(struct pci_pool *pool, void *vaddr, - dma_addr_t addr); - This puts memory back into the pool. The pool is what was passed to the pool allocation routine; the cpu (vaddr) and dma addresses are what were returned when that routine allocated the memory being freed. @@ -132,8 +113,6 @@ were returned when that routine allocated the memory being freed. void dma_pool_destroy(struct dma_pool *pool); - void pci_pool_destroy(struct pci_pool *pool); - The pool destroy() routines free the resources of the pool. They must be called in a context which can sleep. Make sure you've freed all allocated memory back to the pool before you destroy it. @@ -144,8 +123,6 @@ Part Ic - DMA addressing limitations int dma_supported(struct device *dev, u64 mask) -int -pci_dma_supported(struct pci_dev *hwdev, u64 mask) Checks to see if the device can support DMA to the memory described by mask. @@ -159,8 +136,14 @@ driver writers. int dma_set_mask(struct device *dev, u64 mask) + +Checks to see if the mask is possible and updates the device +parameters if it is. + +Returns: 0 if successful and a negative error if not. + int -pci_set_dma_mask(struct pci_device *dev, u64 mask) +dma_set_coherent_mask(struct device *dev, u64 mask) Checks to see if the mask is possible and updates the device parameters if it is. @@ -187,9 +170,6 @@ Part Id - Streaming DMA mappings dma_addr_t dma_map_single(struct device *dev, void *cpu_addr, size_t size, enum dma_data_direction direction) -dma_addr_t -pci_map_single(struct pci_dev *hwdev, void *cpu_addr, size_t size, - int direction) Maps a piece of processor virtual memory so it can be accessed by the device and returns the physical handle of the memory. @@ -198,14 +178,10 @@ The direction for both api's may be converted freely by casting. 
However the dma_ API uses a strongly typed enumerator for its direction: -DMA_NONE = PCI_DMA_NONE no direction (used for - debugging) -DMA_TO_DEVICE = PCI_DMA_TODEVICE data is going from the - memory to the device -DMA_FROM_DEVICE = PCI_DMA_FROMDEVICE data is coming from - the device to the - memory -DMA_BIDIRECTIONAL = PCI_DMA_BIDIRECTIONAL direction isn't known +DMA_NONE no direction (used for debugging) +DMA_TO_DEVICE data is going from the memory to the device +DMA_FROM_DEVICE data is coming from the device to the memory +DMA_BIDIRECTIONAL direction isn't known Notes: Not all memory regions in a machine can be mapped by this API. Further, regions that appear to be physically contiguous in @@ -268,9 +244,6 @@ cache lines are updated with data that the device may have changed). void dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size, enum dma_data_direction direction) -void -pci_unmap_single(struct pci_dev *hwdev, dma_addr_t dma_addr, - size_t size, int direction) Unmaps the region previously mapped. All the parameters passed in must be identical to those passed in (and returned) by the mapping @@ -280,15 +253,9 @@ dma_addr_t dma_map_page(struct device *dev, struct page *page, unsigned long offset, size_t size, enum dma_data_direction direction) -dma_addr_t -pci_map_page(struct pci_dev *hwdev, struct page *page, - unsigned long offset, size_t size, int direction) void dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size, enum dma_data_direction direction) -void -pci_unmap_page(struct pci_dev *hwdev, dma_addr_t dma_address, - size_t size, int direction) API for mapping and unmapping pages. All the notes and warnings for the other mapping APIs apply here. Also, although the @@ -299,9 +266,6 @@ cache width is. int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) -int -pci_dma_mapping_error(struct pci_dev *hwdev, dma_addr_t dma_addr) - In some circumstances dma_map_single and dma_map_page will fail to create a mapping. A driver can check for these errors by testing the returned dma address with dma_mapping_error(). A non-zero return value means the mapping @@ -311,9 +275,6 @@ reduce current DMA mapping usage or delay and try again later). int dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction) - int - pci_map_sg(struct pci_dev *hwdev, struct scatterlist *sg, - int nents, int direction) Returns: the number of physical segments mapped (this may be shorter than passed in if some elements of the scatter/gather list are @@ -353,9 +314,6 @@ accessed sg->address and sg->length as shown above. void dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nhwentries, enum dma_data_direction direction) - void - pci_unmap_sg(struct pci_dev *hwdev, struct scatterlist *sg, - int nents, int direction) Unmap the previously mapped scatter/gather list. All the parameters must be the same as those passed in to the scatter/gather mapping @@ -365,21 +323,23 @@ Note: must be the number you passed in, *not* the number of physical entries returned.
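To make the scatter/gather rules above concrete, here is a hedged sketch of the map/program/unmap cycle; hw_fill_desc() is a hypothetical helper standing in for device-specific descriptor setup:

	#include <linux/dma-mapping.h>
	#include <linux/errno.h>
	#include <linux/scatterlist.h>

	extern void hw_fill_desc(int idx, dma_addr_t addr, unsigned int len);

	int my_map_request(struct device *dev, struct scatterlist *sglist,
			   int nents)
	{
		struct scatterlist *sg;
		int i, count;

		count = dma_map_sg(dev, sglist, nents, DMA_TO_DEVICE);
		if (count == 0)
			return -ENOMEM;

		/* Program one descriptor per mapped segment. */
		for_each_sg(sglist, sg, count, i)
			hw_fill_desc(i, sg_dma_address(sg), sg_dma_len(sg));

		/* ... start the DMA and wait for completion ... */

		/* Unmap with the original nents, not the returned count. */
		dma_unmap_sg(dev, sglist, nents, DMA_TO_DEVICE);
		return 0;
	}

Note how dma_unmap_sg takes the original nents, exactly as the note above requires.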
void -dma_sync_single(struct device *dev, dma_addr_t dma_handle, size_t size, - enum dma_data_direction direction) +dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size, + enum dma_data_direction direction) void -pci_dma_sync_single(struct pci_dev *hwdev, dma_addr_t dma_handle, - size_t size, int direction) +dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size, + enum dma_data_direction direction) void -dma_sync_sg(struct device *dev, struct scatterlist *sg, int nelems, - enum dma_data_direction direction) +dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems, + enum dma_data_direction direction) void -pci_dma_sync_sg(struct pci_dev *hwdev, struct scatterlist *sg, - int nelems, int direction) +dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems, + enum dma_data_direction direction) -Synchronise a single contiguous or scatter/gather mapping. All the -parameters must be the same as those passed into the single mapping -API. +Synchronise a single contiguous or scatter/gather mapping for the cpu +and device. With the sync_sg API, all the parameters must be the same +as those passed into the single mapping API. With the sync_single API, +you can use dma_handle and size parameters that aren't identical to +those passed into the single mapping API to do a partial sync. Notes: You must do this: @@ -461,9 +421,9 @@ void whizco_dma_map_sg_attrs(struct device *dev, dma_addr_t dma_addr, Part II - Advanced dma_ usage ----------------------------- -Warning: These pieces of the DMA API have no PCI equivalent. They -should also not be used in the majority of cases, since they cater for -unlikely corner cases that don't belong in usual drivers. +Warning: These pieces of the DMA API should not be used in the +majority of cases, since they cater for unlikely corner cases that +don't belong in usual drivers. If you don't understand how cache line coherency works between a processor and an I/O device, you should not be using this part of the @@ -513,16 +473,6 @@ line, but it will guarantee that one or more cache lines fit exactly into the width returned by this call. It will also always be a power of two for easy alignment. -void -dma_sync_single_range(struct device *dev, dma_addr_t dma_handle, - unsigned long offset, size_t size, - enum dma_data_direction direction) - -Does a partial sync, starting at offset and continuing for size. You -must be careful to observe the cache alignment and width when doing -anything like this. You must also be extra careful about accessing -memory you intend to sync partially. 
- void dma_cache_sync(struct device *dev, void *vaddr, size_t size, enum dma_data_direction direction) diff --git a/Documentation/DocBook/device-drivers.tmpl b/Documentation/DocBook/device-drivers.tmpl index f9a6e2c75f128da4d57dcd331434731d804fbacb..1b2dd4fc3db24b57db4178d92fe2b9828c27b9ff 100644 --- a/Documentation/DocBook/device-drivers.tmpl +++ b/Documentation/DocBook/device-drivers.tmpl @@ -45,7 +45,7 @@ Atomic and pointer manipulation -!Iarch/x86/include/asm/atomic_32.h +!Iarch/x86/include/asm/atomic.h !Iarch/x86/include/asm/unaligned.h diff --git a/Documentation/DocBook/deviceiobook.tmpl b/Documentation/DocBook/deviceiobook.tmpl index 3ed88126ab8f8f360dac4d10933e8202ba3469e5..c1ed6a49e598cd992552fa6e3b0da70f59aa7829 100644 --- a/Documentation/DocBook/deviceiobook.tmpl +++ b/Documentation/DocBook/deviceiobook.tmpl @@ -316,7 +316,7 @@ CPU B: spin_unlock_irqrestore(&dev_lock, flags) Public Functions Provided -!Iarch/x86/include/asm/io_32.h +!Iarch/x86/include/asm/io.h !Elib/iomap.c diff --git a/Documentation/DocBook/mac80211.tmpl b/Documentation/DocBook/mac80211.tmpl index f3f37f141dbdb4a00ab621e07a9c1fa361a1f782..affb15a344a16b5b99fb8830e9736ca166466812 100644 --- a/Documentation/DocBook/mac80211.tmpl +++ b/Documentation/DocBook/mac80211.tmpl @@ -144,7 +144,7 @@ usage should require reading the full document. this though and the recommendation to allow only a single interface in STA mode at first! -!Finclude/net/mac80211.h ieee80211_if_init_conf +!Finclude/net/mac80211.h ieee80211_vif @@ -234,7 +234,6 @@ usage should require reading the full document. Multiple queues and QoS support TBD !Finclude/net/mac80211.h ieee80211_tx_queue_params -!Finclude/net/mac80211.h ieee80211_tx_queue_stats diff --git a/Documentation/DocBook/mtdnand.tmpl b/Documentation/DocBook/mtdnand.tmpl index 5e7d84b48505c16a15e11c4cd2f058ec4a6c6e84..133cd6c3f3c15658986209f65859719d7407aca6 100644 --- a/Documentation/DocBook/mtdnand.tmpl +++ b/Documentation/DocBook/mtdnand.tmpl @@ -488,7 +488,7 @@ static void board_select_chip (struct mtd_info *mtd, int chip) The ECC bytes must be placed immediately after the data bytes in order to make the syndrome generator work. This is contrary to the usual layout used by software ECC. The - seperation of data and out of band area is not longer + separation of data and out of band area is no longer possible. The nand driver code handles this layout and the remaining free bytes in the oob area are managed by the autoplacement code. Provide a matching oob-layout @@ -560,7 +560,7 @@ static void board_select_chip (struct mtd_info *mtd, int chip) bad blocks. They have factory marked good blocks. The marker pattern is erased when the block is erased to be reused. So in case of power loss before writing the pattern back to the chip this block - would be lost and added to the bad blocks. Therefor we scan the + would be lost and added to the bad blocks. Therefore we scan the chip(s) when we detect them the first time for good blocks and store this information in a bad block table before erasing any of the blocks. @@ -1094,7 +1094,7 @@ in this page manufacturers specifications. This applies similarly to the spare area. - Therefor NAND aware filesystems must either write in page size chunks + Therefore NAND aware filesystems must either write in page size chunks or hold a writebuffer to collect smaller writes until they sum up to pagesize. Available NAND aware filesystems: JFFS2, YAFFS.
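The writebuffer idea mentioned above is easy to picture in code. A hedged sketch, with hypothetical names and an assumed page size of at most 4096 bytes; real NAND-aware filesystems such as JFFS2 and YAFFS are far more elaborate:

	#include <linux/kernel.h>
	#include <linux/mtd/mtd.h>
	#include <linux/string.h>

	struct wbuf {
		struct mtd_info *mtd;
		loff_t ofs;	/* page-aligned flash offset */
		size_t used;	/* bytes collected so far */
		u8 buf[4096];
	};

	static int wbuf_flush(struct wbuf *w)
	{
		size_t retlen;
		int ret;

		/* Pad with 0xff (erased state); NAND pages are written whole. */
		memset(w->buf + w->used, 0xff, w->mtd->writesize - w->used);
		ret = w->mtd->write(w->mtd, w->ofs, w->mtd->writesize,
				    &retlen, w->buf);
		w->ofs += w->mtd->writesize;
		w->used = 0;
		return ret;
	}

	static int wbuf_write(struct wbuf *w, const u8 *data, size_t len)
	{
		while (len) {
			size_t n = min(len,
				       (size_t)w->mtd->writesize - w->used);

			memcpy(w->buf + w->used, data, n);
			w->used += n;
			data += n;
			len -= n;
			if (w->used == w->mtd->writesize) {
				int ret = wbuf_flush(w);

				if (ret)
					return ret;
			}
		}
		return 0;
	}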
diff --git a/Documentation/DocBook/v4l/common.xml b/Documentation/DocBook/v4l/common.xml index c65f0ac9b6eec58942562c7e8d1a5386554a4fa9..cea23e1c4fc6f7f517434b861de46fef08a6fd25 100644 --- a/Documentation/DocBook/v4l/common.xml +++ b/Documentation/DocBook/v4l/common.xml @@ -1170,7 +1170,7 @@ frames per second. If less than this number of frames is to be captured or output, applications can request frame skipping or duplicating on the driver side. This is especially useful when using the &func-read; or &func-write;, which are not augmented by timestamps -or sequence counters, and to avoid unneccessary data copying. +or sequence counters, and to avoid unnecessary data copying. Finally these ioctls can be used to determine the number of buffers used internally by a driver in read/write mode. For diff --git a/Documentation/DocBook/v4l/io.xml b/Documentation/DocBook/v4l/io.xml index f92f24323b2aad000a4f6890368c03aec5dcaf7b..e870330cbf772fc757f5c6400ed4cfb61ed72486 100644 --- a/Documentation/DocBook/v4l/io.xml +++ b/Documentation/DocBook/v4l/io.xml @@ -589,7 +589,8 @@ number of a video input as in &v4l2-input; field A place holder for future extensions and custom (driver defined) buffer types -V4L2_BUF_TYPE_PRIVATE and higher. +V4L2_BUF_TYPE_PRIVATE and higher. Applications +should set this to 0. diff --git a/Documentation/DocBook/v4l/vidioc-g-parm.xml b/Documentation/DocBook/v4l/vidioc-g-parm.xml index 78332d365ce907dd2a3d3d413ea2c60f3e0dbe1d..392aa9e5571ec80f2e5ec465a5ccff9acffdcb45 100644 --- a/Documentation/DocBook/v4l/vidioc-g-parm.xml +++ b/Documentation/DocBook/v4l/vidioc-g-parm.xml @@ -55,7 +55,7 @@ captured or output, applications can request frame skipping or duplicating on the driver side. This is especially useful when using the read() or write(), which are not augmented by timestamps or sequence counters, and to avoid -unneccessary data copying. +unnecessary data copying. Further these ioctls can be used to determine the number of buffers used internally by a driver in read/write mode. For diff --git a/Documentation/DocBook/v4l/vidioc-qbuf.xml b/Documentation/DocBook/v4l/vidioc-qbuf.xml index 1870817781548b409889d9ed93220cb26162ec74..b843bd7b389735725bd7d68954560509efe5cc41 100644 --- a/Documentation/DocBook/v4l/vidioc-qbuf.xml +++ b/Documentation/DocBook/v4l/vidioc-qbuf.xml @@ -54,12 +54,10 @@ to enqueue an empty (capturing) or filled (output) buffer in the driver's incoming queue. The semantics depend on the selected I/O method. - To enqueue a memory mapped -buffer applications set the type field of a -&v4l2-buffer; to the same buffer type as previously &v4l2-format; -type and &v4l2-requestbuffers; -type, the memory -field to V4L2_MEMORY_MMAP and the + To enqueue a buffer applications set the type +field of a &v4l2-buffer; to the same buffer type as was previously used +with &v4l2-format; type and &v4l2-requestbuffers; +type. Applications must also set the index field. Valid index numbers range from zero to the number of buffers allocated with &VIDIOC-REQBUFS; (&v4l2-requestbuffers; count) minus one. The @@ -70,8 +68,19 @@ intended for output (type is V4L2_BUF_TYPE_VBI_OUTPUT) applications must also initialize the bytesused, field and -timestamp fields. See for details. When +timestamp fields, see for details. +Applications must also set flags to 0. If a driver +supports capturing from specific video inputs and you want to specify a video +input, then flags should be set to +V4L2_BUF_FLAG_INPUT and the field +input must be initialized to the desired input. 
+The reserved field must be set to 0. + + + To enqueue a memory mapped +buffer applications set the memory +field to V4L2_MEMORY_MMAP. When VIDIOC_QBUF is called with a pointer to this structure the driver sets the V4L2_BUF_FLAG_MAPPED and @@ -81,14 +90,10 @@ structure the driver sets the &EINVAL;. To enqueue a user pointer -buffer applications set the type field of a -&v4l2-buffer; to the same buffer type as previously &v4l2-format; -type and &v4l2-requestbuffers; -type, the memory -field to V4L2_MEMORY_USERPTR and the +buffer applications set the memory +field to V4L2_MEMORY_USERPTR, the m.userptr field to the address of the -buffer and length to its size. When the -buffer is intended for output additional fields must be set as above. +buffer and length to its size. When VIDIOC_QBUF is called with a pointer to this structure the driver sets the V4L2_BUF_FLAG_QUEUED flag and clears the V4L2_BUF_FLAG_MAPPED and @@ -96,13 +101,14 @@ flag and clears the V4L2_BUF_FLAG_MAPPED and flags field, or it returns an error code. This ioctl locks the memory pages of the buffer in physical memory; they cannot be swapped out to disk. Buffers remain locked until -dequeued, until the &VIDIOC-STREAMOFF; or &VIDIOC-REQBUFS; ioctl are +dequeued, until the &VIDIOC-STREAMOFF; or &VIDIOC-REQBUFS; ioctl is called, or until the device is closed. Applications call the VIDIOC_DQBUF ioctl to dequeue a filled (capturing) or displayed (output) buffer from the driver's outgoing queue. They just set the -type and memory +type, memory +and reserved fields of a &v4l2-buffer; as above, when VIDIOC_DQBUF is called with a pointer to this structure the driver fills the remaining fields or returns an error code. diff --git a/Documentation/DocBook/v4l/vidioc-querybuf.xml b/Documentation/DocBook/v4l/vidioc-querybuf.xml index d834993e619172df3dae1f2b05495034c7ce1c73..e649805a4908838d2f4028099a8aa24a234c6db7 100644 --- a/Documentation/DocBook/v4l/vidioc-querybuf.xml +++ b/Documentation/DocBook/v4l/vidioc-querybuf.xml @@ -54,12 +54,13 @@ buffer at any time after buffers have been allocated with the &VIDIOC-REQBUFS; ioctl. Applications set the type field - of a &v4l2-buffer; to the same buffer type as previously + of a &v4l2-buffer; to the same buffer type as was previously used with &v4l2-format; type and &v4l2-requestbuffers; type, and the index field. Valid index numbers range from zero to the number of buffers allocated with &VIDIOC-REQBUFS; (&v4l2-requestbuffers; count) minus one. +The reserved field should be set to 0. After calling VIDIOC_QUERYBUF with a pointer to this structure drivers return an error code or fill the rest of the structure. @@ -68,8 +69,8 @@ the structure. V4L2_BUF_FLAG_MAPPED, V4L2_BUF_FLAG_QUEUED and V4L2_BUF_FLAG_DONE flags will be valid. The -memory field will be set to -V4L2_MEMORY_MMAP, the m.offset +memory field will be set to the current +I/O method, the m.offset contains the offset of the buffer from the start of the device memory, the length field its size. The driver may or may not set the remaining fields and flags, they are meaningless in diff --git a/Documentation/DocBook/v4l/vidioc-reqbufs.xml b/Documentation/DocBook/v4l/vidioc-reqbufs.xml index bab38084454fd0370985a72ff9cd44b4c457bfa0..1c08163720741d7f5fc081f8e0681e957fec0232 100644 --- a/Documentation/DocBook/v4l/vidioc-reqbufs.xml +++ b/Documentation/DocBook/v4l/vidioc-reqbufs.xml @@ -54,23 +54,23 @@ I/O.
Memory mapped buffers are located in device memory and must be allocated with this ioctl before they can be mapped into the application's address space. User buffers are allocated by applications themselves, and this ioctl is merely used to switch the -driver into user pointer I/O mode. +driver into user pointer I/O mode and to set up some internal structures. - To allocate device buffers applications initialize three -fields of a v4l2_requestbuffers structure. + To allocate device buffers applications initialize all +fields of the v4l2_requestbuffers structure. They set the type field to the respective stream or buffer type, the count field to -the desired number of buffers, and memory -must be set to V4L2_MEMORY_MMAP. When the ioctl -is called with a pointer to this structure the driver attempts to -allocate the requested number of buffers and stores the actual number +the desired number of buffers, memory +must be set to the requested I/O method and the reserved array +must be zeroed. When the ioctl +is called with a pointer to this structure the driver will attempt to allocate +the requested number of buffers and stores the actual number allocated in the count field. It can be smaller than the number requested, even zero, when the driver runs out -of free memory. A larger number is possible when the driver requires -more buffers to function correctly. - For example video output requires at least two buffers, +of free memory. A larger number is also possible when the driver requires +more buffers to function correctly. For example, video output requires at least two buffers, one displayed and one filled by the application. - When memory mapping I/O is not supported the ioctl + When the I/O method is not supported the ioctl returns an &EINVAL;. Applications can call VIDIOC_REQBUFS @@ -81,14 +81,6 @@ in progress, an implicit &VIDIOC-STREAMOFF;. - To negotiate user pointer I/O, applications initialize only -the type field and set -memory to -V4L2_MEMORY_USERPTR. When the ioctl is called -with a pointer to this structure the driver prepares for user pointer -I/O, when this I/O method is not supported the ioctl returns an -&EINVAL;. - struct <structname>v4l2_requestbuffers</structname> __u32 count - The number of buffers requested or granted. This -field is only used when memory is set to -V4L2_MEMORY_MMAP. + The number of buffers requested or granted. &v4l2-buf-type; @@ -120,7 +110,7 @@ as the &v4l2-format; type field. See reserved[2] A place holder for future extensions and custom (driver defined) buffer types V4L2_BUF_TYPE_PRIVATE and -higher. +higher. This array should be zeroed by applications. diff --git a/Documentation/HOWTO b/Documentation/HOWTO index 8495fc970391b7d455455e31515705a1f3d8f0f4..f5395af88a41bfc49c987aabe378e51fc04e2b41 100644 --- a/Documentation/HOWTO +++ b/Documentation/HOWTO @@ -221,8 +221,8 @@ branches. These different branches are: - main 2.6.x kernel tree - 2.6.x.y -stable kernel tree - 2.6.x -git kernel patches - - 2.6.x -mm kernel patches - subsystem specific kernel trees and patches + - the 2.6.x -next kernel tree for integration tests 2.6.x kernel tree ----------------- @@ -232,7 +232,7 @@ process is as follows: - As soon as a new kernel is released a two-week window is open, during this period of time maintainers can submit big diffs to Linus, usually the patches that have already been included in the
The preferred way to submit big changes + -next kernel for a few weeks. The preferred way to submit big changes is using git (the kernel's source management tool, more information can be found at http://git.or.cz/) but plain patches are also just fine. @@ -293,84 +293,43 @@ daily and represent the current state of Linus' tree. They are more experimental than -rc kernels since they are generated automatically without even a cursory glance to see if they are sane. -2.6.x -mm kernel patches ------------------------- -These are experimental kernel patches released by Andrew Morton. Andrew -takes all of the different subsystem kernel trees and patches and mushes -them together, along with a lot of patches that have been plucked from -the linux-kernel mailing list. This tree serves as a proving ground for -new features and patches. Once a patch has proved its worth in -mm for -a while Andrew or the subsystem maintainer pushes it on to Linus for -inclusion in mainline. - -It is heavily encouraged that all new patches get tested in the -mm tree -before they are sent to Linus for inclusion in the main kernel tree. Code -which does not make an appearance in -mm before the opening of the merge -window will prove hard to merge into the mainline. - -These kernels are not appropriate for use on systems that are supposed -to be stable and they are more risky to run than any of the other -branches. - -If you wish to help out with the kernel development process, please test -and use these kernel releases and provide feedback to the linux-kernel -mailing list if you have any problems, and if everything works properly. - -In addition to all the other experimental patches, these kernels usually -also contain any changes in the mainline -git kernels available at the -time of release. - -The -mm kernels are not released on a fixed schedule, but usually a few --mm kernels are released in between each -rc kernel (1 to 3 is common). - Subsystem Specific kernel trees and patches ------------------------------------------- -A number of the different kernel subsystem developers expose their -development trees so that others can see what is happening in the -different areas of the kernel. These trees are pulled into the -mm -kernel releases as described above. 
- -Here is a list of some of the different kernel trees available: git trees: - - Kbuild development tree, Sam Ravnborg - git.kernel.org:/pub/scm/linux/kernel/git/sam/kbuild.git - - - ACPI development tree, Len Brown - git.kernel.org:/pub/scm/linux/kernel/git/lenb/linux-acpi-2.6.git - - - Block development tree, Jens Axboe - git.kernel.org:/pub/scm/linux/kernel/git/axboe/linux-2.6-block.git - - - DRM development tree, Dave Airlie - git.kernel.org:/pub/scm/linux/kernel/git/airlied/drm-2.6.git - - - ia64 development tree, Tony Luck - git.kernel.org:/pub/scm/linux/kernel/git/aegl/linux-2.6.git - - - infiniband, Roland Dreier - git.kernel.org:/pub/scm/linux/kernel/git/roland/infiniband.git - - - libata, Jeff Garzik - git.kernel.org:/pub/scm/linux/kernel/git/jgarzik/libata-dev.git - - - network drivers, Jeff Garzik - git.kernel.org:/pub/scm/linux/kernel/git/jgarzik/netdev-2.6.git - - - pcmcia, Dominik Brodowski - git.kernel.org:/pub/scm/linux/kernel/git/brodo/pcmcia-2.6.git - - - SCSI, James Bottomley - git.kernel.org:/pub/scm/linux/kernel/git/jejb/scsi-misc-2.6.git - - - x86, Ingo Molnar - git://git.kernel.org/pub/scm/linux/kernel/git/x86/linux-2.6-x86.git - - quilt trees: - - USB, Driver Core, and I2C, Greg Kroah-Hartman - kernel.org/pub/linux/kernel/people/gregkh/gregkh-2.6/ +The maintainers of the various kernel subsystems --- and also many +kernel subsystem developers --- expose their current state of +development in source repositories. That way, others can see what is +happening in the different areas of the kernel. In areas where +development is rapid, a developer may be asked to base his submissions +on such a subsystem kernel tree so that conflicts between the +submission and other already ongoing work are avoided. + +Most of these repositories are git trees, but there are also other SCMs +in use, or patch queues being published as quilt series. Addresses of +these subsystem repositories are listed in the MAINTAINERS file. Many +of them can be browsed at http://git.kernel.org/. + +Before a proposed patch is committed to such a subsystem tree, it is +subject to review which primarily happens on mailing lists (see the +respective section below). For several kernel subsystems, this review +process is tracked with the tool patchwork. Patchwork offers a web +interface which shows patch postings, any comments on a patch or +revisions to it, and maintainers can mark patches as under review, +accepted, or rejected. Most of these patchwork sites are listed at +http://patchwork.kernel.org/ or http://patchwork.ozlabs.org/. + +2.6.x -next kernel tree for integration tests +--------------------------------------------- +Before updates from subsystem trees are merged into the mainline 2.6.x +tree, they need to be integration-tested. For this purpose, a special +testing repository exists into which virtually all subsystem trees are +pulled on an almost daily basis: + http://git.kernel.org/?p=linux/kernel/git/sfr/linux-next.git + http://linux.f-seidel.de/linux-next/pmwiki/ + +This way, the -next kernel gives a summary outlook of what is +expected to go into the mainline kernel at the next merge period. +Adventurous testers are very welcome to runtime-test the -next kernel. - Other kernel trees can be found listed at http://git.kernel.org/ and in - the MAINTAINERS file.
Bug Reporting ------------- diff --git a/Documentation/IPMI.txt b/Documentation/IPMI.txt index bc38283379f01d107ec710d96c0863109b68622e..69dd29ed824e60c7290fab69f2df7848ddea7427 100644 --- a/Documentation/IPMI.txt +++ b/Documentation/IPMI.txt @@ -365,6 +365,7 @@ You can change this at module load time (for a module) with: regshifts=,,... slave_addrs=,,... force_kipmid=,,... + kipmid_max_busy_us=,,... unload_when_empty=[0|1] Each of these except si_trydefaults is a list, the first item for the @@ -433,6 +434,7 @@ kernel command line as: ipmi_si.regshifts=,,... ipmi_si.slave_addrs=,,... ipmi_si.force_kipmid=,,... + ipmi_si.kipmid_max_busy_us=,,... It works the same as the module parameters of the same names. @@ -450,6 +452,16 @@ force this thread on or off. If you force it off and don't have interrupts, the driver will run VERY slowly. Don't blame me, these interfaces suck. +Unfortunately, this thread can use a lot of CPU depending on the +interface's performance. This can waste CPU cycles and power and +cause various issues with idle-CPU detection. To +avoid this, the kipmid_max_busy_us parameter sets the maximum amount of time, in +microseconds, that kipmid will spin before sleeping for a tick. This +value sets a balance between performance and CPU waste and needs to be +tuned to your needs. Maybe, someday, auto-tuning will be added, but +that's not a simple thing and even the auto-tuning would need to be +tuned to the user's desired performance. + The driver supports a hot add and remove of interfaces. This way, interfaces can be added or removed after the kernel is up and running. This is done using /sys/modules/ipmi_si/parameters/hotmod, which is a diff --git a/Documentation/Makefile b/Documentation/Makefile index 94b9457335344eb6b82fb06759e10f89ab037f25..6fc7ea1d1f9dd22dfe8ba59ad898a1c5b68f879b 100644 --- a/Documentation/Makefile +++ b/Documentation/Makefile @@ -1,3 +1,3 @@ obj-m := DocBook/ accounting/ auxdisplay/ connector/ \ - filesystems/configfs/ ia64/ networking/ \ - pcmcia/ spi/ video4linux/ vm/ watchdog/src/ + filesystems/ filesystems/configfs/ ia64/ laptops/ networking/ \ + pcmcia/ spi/ timers/ video4linux/ vm/ watchdog/src/ diff --git a/Documentation/PCI/PCI-DMA-mapping.txt b/Documentation/PCI/PCI-DMA-mapping.txt index ecad88d9fe5900a86eb4fc44066326caf2639be2..52618ab069ad3bdb677caac1f8ee4f78d0fd9301 100644 --- a/Documentation/PCI/PCI-DMA-mapping.txt +++ b/Documentation/PCI/PCI-DMA-mapping.txt @@ -1,12 +1,12 @@ - Dynamic DMA mapping - =================== + Dynamic DMA mapping Guide + ========================= David S. Miller Richard Henderson Jakub Jelinek -This document describes the DMA mapping system in terms of the pci_ -API. For a similar API that works for generic devices, see +This is a guide for device driver writers on how to use the DMA API +with example pseudo-code. For a concise description of the API, see DMA-API.txt. Most of the 64bit platforms have special hardware that translates bus @@ -26,12 +26,15 @@ mapped only for the time they are actually used and unmapped after the DMA transfer. The following API will work of course even on platforms where no such -hardware exists, see e.g. arch/x86/include/asm/pci.h for how it is implemented on -top of the virt_to_bus interface. +hardware exists. + +Note that the DMA API works with any bus independent of the underlying +microprocessor architecture. You should use the DMA API rather than +the bus-specific DMA API (e.g. pci_dma_*). First of all, you should make sure -#include <linux/pci.h> +#include <linux/dma-mapping.h>
This file will obtain for you the definition of the dma_addr_t (which can hold any valid DMA address for the platform) @@ -78,44 +81,43 @@ for you to DMA from/to. DMA addressing limitations Does your device have any DMA addressing limitations? For example, is -your device only capable of driving the low order 24-bits of address -on the PCI bus for SAC DMA transfers? If so, you need to inform the -PCI layer of this fact. +your device only capable of driving the low order 24-bits of address? +If so, you need to inform the kernel of this fact. By default, the kernel assumes that your device can address the full -32-bits in a SAC cycle. For a 64-bit DAC capable device, this needs -to be increased. And for a device with limitations, as discussed in -the previous paragraph, it needs to be decreased. - -pci_alloc_consistent() by default will return 32-bit DMA addresses. -PCI-X specification requires PCI-X devices to support 64-bit -addressing (DAC) for all transactions. And at least one platform (SGI -SN2) requires 64-bit consistent allocations to operate correctly when -the IO bus is in PCI-X mode. Therefore, like with pci_set_dma_mask(), -it's good practice to call pci_set_consistent_dma_mask() to set the -appropriate mask even if your device only supports 32-bit DMA -(default) and especially if it's a PCI-X device. - -For correct operation, you must interrogate the PCI layer in your -device probe routine to see if the PCI controller on the machine can -properly support the DMA addressing limitation your device has. It is -good style to do this even if your device holds the default setting, +32-bits. For a 64-bit capable device, this needs to be increased. +And for a device with limitations, as discussed in the previous +paragraph, it needs to be decreased. + +Special note about PCI: PCI-X specification requires PCI-X devices to +support 64-bit addressing (DAC) for all transactions. And at least +one platform (SGI SN2) requires 64-bit consistent allocations to +operate correctly when the IO bus is in PCI-X mode. + +For correct operation, you must interrogate the kernel in your device +probe routine to see if the DMA controller on the machine can properly +support the DMA addressing limitation your device has. It is good +style to do this even if your device holds the default setting, because this shows that you did think about these issues wrt. your device. -The query is performed via a call to pci_set_dma_mask(): +The query is performed via a call to dma_set_mask(): - int pci_set_dma_mask(struct pci_dev *pdev, u64 device_mask); + int dma_set_mask(struct device *dev, u64 mask); The query for consistent allocations is performed via a call to -pci_set_consistent_dma_mask(): +dma_set_coherent_mask(): - int pci_set_consistent_dma_mask(struct pci_dev *pdev, u64 device_mask); + int dma_set_coherent_mask(struct device *dev, u64 mask); -Here, pdev is a pointer to the PCI device struct of your device, and -device_mask is a bit mask describing which bits of a PCI address your -device supports. It returns zero if your card can perform DMA -properly on the machine given the address mask you provided. +Here, dev is a pointer to the device struct of your device, and mask +is a bit mask describing which bits of an address your device +supports. It returns zero if your card can perform DMA properly on +the machine given the address mask you provided. In general, the +device struct of your device is embedded in the bus specific device +struct of your device. 
For example, a pointer to the device struct of +your PCI device is pdev->dev (pdev is a pointer to the PCI device +struct of your device). If it returns non-zero, your device cannot perform DMA properly on this platform, and attempting to do so will result in undefined @@ -133,31 +135,30 @@ of your driver reports that performance is bad or that the device is not even detected, you can ask them for the kernel messages to find out exactly why. -The standard 32-bit addressing PCI device would do something like -this: +The standard 32-bit addressing device would do something like this: - if (pci_set_dma_mask(pdev, DMA_BIT_MASK(32))) { + if (dma_set_mask(dev, DMA_BIT_MASK(32))) { printk(KERN_WARNING "mydev: No suitable DMA available.\n"); goto ignore_this_device; } -Another common scenario is a 64-bit capable device. The approach -here is to try for 64-bit DAC addressing, but back down to a -32-bit mask should that fail. The PCI platform code may fail the -64-bit mask not because the platform is not capable of 64-bit -addressing. Rather, it may fail in this case simply because -32-bit SAC addressing is done more efficiently than DAC addressing. -Sparc64 is one platform which behaves in this way. +Another common scenario is a 64-bit capable device. The approach here +is to try for 64-bit addressing, but back down to a 32-bit mask that +should not fail. The kernel may fail the 64-bit mask not because the +platform is not capable of 64-bit addressing. Rather, it may fail in +this case simply because 32-bit addressing is done more efficiently +than 64-bit addressing. For example, Sparc64 PCI SAC addressing is +more efficient than DAC addressing. Here is how you would handle a 64-bit capable device which can drive all 64-bits when accessing streaming DMA: int using_dac; - if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(64))) { + if (!dma_set_mask(dev, DMA_BIT_MASK(64))) { using_dac = 1; - } else if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(32))) { + } else if (!dma_set_mask(dev, DMA_BIT_MASK(32))) { using_dac = 0; } else { printk(KERN_WARNING @@ -170,36 +171,36 @@ the case would look like this: int using_dac, consistent_using_dac; - if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(64))) { + if (!dma_set_mask(dev, DMA_BIT_MASK(64))) { using_dac = 1; consistent_using_dac = 1; - pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); - } else if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(32))) { + dma_set_coherent_mask(dev, DMA_BIT_MASK(64)); + } else if (!dma_set_mask(dev, DMA_BIT_MASK(32))) { using_dac = 0; consistent_using_dac = 0; - pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); + dma_set_coherent_mask(dev, DMA_BIT_MASK(32)); } else { printk(KERN_WARNING "mydev: No suitable DMA available.\n"); goto ignore_this_device; } -pci_set_consistent_dma_mask() will always be able to set the same or a -smaller mask as pci_set_dma_mask(). However for the rare case that a +dma_set_coherent_mask() will always be able to set the same or a +smaller mask as dma_set_mask(). However for the rare case that a device driver only uses consistent allocations, one would have to -check the return value from pci_set_consistent_dma_mask(). +check the return value from dma_set_coherent_mask(). 
Finally, if your device can only drive the low 24-bits of -address during PCI bus mastering you might do something like: +address you might do something like: - if (pci_set_dma_mask(pdev, DMA_BIT_MASK(24))) { + if (dma_set_mask(dev, DMA_BIT_MASK(24))) { printk(KERN_WARNING "mydev: 24-bit DMA addressing not available.\n"); goto ignore_this_device; } -When pci_set_dma_mask() is successful, and returns zero, the PCI layer -saves away this mask you have provided. The PCI layer will use this +When dma_set_mask() is successful, and returns zero, the kernel saves +away this mask you have provided. The kernel will use this information later when you make DMA mappings. There is a case which we are aware of at this time, which is worth @@ -208,7 +209,7 @@ functions (for example a sound card provides playback and record functions) and the various different functions have _different_ DMA addressing limitations, you may wish to probe each mask and only provide the functionality which the machine can handle. It -is important that the last call to pci_set_dma_mask() be for the +is important that the last call to dma_set_mask() be for the most specific mask. Here is pseudo-code showing how this might be done: @@ -217,17 +218,17 @@ Here is pseudo-code showing how this might be done: #define RECORD_ADDRESS_BITS DMA_BIT_MASK(24) struct my_sound_card *card; - struct pci_dev *pdev; + struct device *dev; ... - if (!pci_set_dma_mask(pdev, PLAYBACK_ADDRESS_BITS)) { + if (!dma_set_mask(dev, PLAYBACK_ADDRESS_BITS)) { card->playback_enabled = 1; } else { card->playback_enabled = 0; printk(KERN_WARNING "%s: Playback disabled due to DMA limitations.\n", card->name); } - if (!pci_set_dma_mask(pdev, RECORD_ADDRESS_BITS)) { + if (!dma_set_mask(dev, RECORD_ADDRESS_BITS)) { card->record_enabled = 1; } else { card->record_enabled = 0; @@ -252,8 +253,8 @@ There are two types of DMA mappings: Think of "consistent" as "synchronous" or "coherent". The current default is to return consistent memory in the low 32 - bits of the PCI bus space. However, for future compatibility you - should set the consistent mask even if this default is fine for your + bits of the bus space. However, for future compatibility you should + set the consistent mask even if this default is fine for your driver. Good examples of what to use consistent mappings for are: @@ -285,9 +286,9 @@ There are two types of DMA mappings: found in PCI bridges (such as by reading a register's value after writing it). -- Streaming DMA mappings which are usually mapped for one DMA transfer, - unmapped right after it (unless you use pci_dma_sync_* below) and for which - hardware can optimize for sequential accesses. +- Streaming DMA mappings which are usually mapped for one DMA + transfer, unmapped right after it (unless you use dma_sync_* below) + and for which hardware can optimize for sequential accesses. Think of "streaming" as "asynchronous" or "outside the coherency domain". @@ -302,8 +303,8 @@ There are two types of DMA mappings: optimizations the hardware allows. To this end, when using such mappings you must be explicit about what you want to happen. -Neither type of DMA mapping has alignment restrictions that come -from PCI, although some devices may have such restrictions. +Neither type of DMA mapping has alignment restrictions that come from +the underlying bus, although some devices may have such restrictions. Also, systems with caches that aren't DMA-coherent will work better when the underlying buffers don't share cache lines with other data.
@@ -315,33 +316,27 @@ you should do: dma_addr_t dma_handle; - cpu_addr = pci_alloc_consistent(pdev, size, &dma_handle); - -where pdev is a struct pci_dev *. This may be called in interrupt context. -You should use dma_alloc_coherent (see DMA-API.txt) for buses -where devices don't have struct pci_dev (like ISA, EISA). + cpu_addr = dma_alloc_coherent(dev, size, &dma_handle, gfp); -This argument is needed because the DMA translations may be bus -specific (and often is private to the bus which the device is attached -to). +where device is a struct device *. This may be called in interrupt +context with the GFP_ATOMIC flag. Size is the length of the region you want to allocate, in bytes. This routine will allocate RAM for that region, so it acts similarly to __get_free_pages (but takes size instead of a page order). If your driver needs regions sized smaller than a page, you may prefer using -the pci_pool interface, described below. - -The consistent DMA mapping interfaces, for non-NULL pdev, will by -default return a DMA address which is SAC (Single Address Cycle) -addressable. Even if the device indicates (via PCI dma mask) that it -may address the upper 32-bits and thus perform DAC cycles, consistent -allocation will only return > 32-bit PCI addresses for DMA if the -consistent dma mask has been explicitly changed via -pci_set_consistent_dma_mask(). This is true of the pci_pool interface -as well. - -pci_alloc_consistent returns two values: the virtual address which you +the dma_pool interface, described below. + +The consistent DMA mapping interfaces, for non-NULL dev, will by +default return a DMA address which is 32-bit addressable. Even if the +device indicates (via DMA mask) that it may address the upper 32-bits, +consistent allocation will only return > 32-bit addresses for DMA if +the consistent DMA mask has been explicitly changed via +dma_set_coherent_mask(). This is true of the dma_pool interface as +well. + +dma_alloc_coherent returns two values: the virtual address which you can use to access it from the CPU and dma_handle which you pass to the card. @@ -354,54 +349,54 @@ buffer you receive will not cross a 64K boundary. To unmap and free such a DMA region, you call: - pci_free_consistent(pdev, size, cpu_addr, dma_handle); + dma_free_coherent(dev, size, cpu_addr, dma_handle); -where pdev, size are the same as in the above call and cpu_addr and -dma_handle are the values pci_alloc_consistent returned to you. +where dev, size are the same as in the above call and cpu_addr and +dma_handle are the values dma_alloc_coherent returned to you. This function may not be called in interrupt context. If your driver needs lots of smaller memory regions, you can write -custom code to subdivide pages returned by pci_alloc_consistent, -or you can use the pci_pool API to do that. A pci_pool is like -a kmem_cache, but it uses pci_alloc_consistent not __get_free_pages. +custom code to subdivide pages returned by dma_alloc_coherent, +or you can use the dma_pool API to do that. A dma_pool is like +a kmem_cache, but it uses dma_alloc_coherent not __get_free_pages. Also, it understands common hardware constraints for alignment, like queue heads needing to be aligned on N byte boundaries. 
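A minimal sketch of the dma_alloc_coherent/dma_free_coherent cycle described above, allocating a small descriptor ring; struct mydev_desc and NUM_DESC are hypothetical:

	#include <linux/dma-mapping.h>
	#include <linux/types.h>

	#define NUM_DESC	64

	struct mydev_desc {
		__le32 addr;
		__le32 len_flags;
	};

	static struct mydev_desc *ring;	/* CPU pointer */
	static dma_addr_t ring_dma;	/* handle to program into the device */

	static int mydev_alloc_ring(struct device *dev)
	{
		ring = dma_alloc_coherent(dev, NUM_DESC * sizeof(*ring),
					  &ring_dma, GFP_KERNEL);
		return ring ? 0 : -ENOMEM;
	}

	static void mydev_free_ring(struct device *dev)
	{
		dma_free_coherent(dev, NUM_DESC * sizeof(*ring),
				  ring, ring_dma);
	}

For many small regions, the dma_pool calls described next are usually the better fit.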
-Create a pci_pool like this: +Create a dma_pool like this: - struct pci_pool *pool; + struct dma_pool *pool; - pool = pci_pool_create(name, pdev, size, align, alloc); + pool = dma_pool_create(name, dev, size, align, alloc); -The "name" is for diagnostics (like a kmem_cache name); pdev and size +The "name" is for diagnostics (like a kmem_cache name); dev and size are as above. The device's hardware alignment requirement for this type of data is "align" (which is expressed in bytes, and must be a power of two). If your device has no boundary crossing restrictions, pass 0 for alloc; passing 4096 says memory allocated from this pool must not cross 4KByte boundaries (but in that case it may be better to -go for pci_alloc_consistent directly instead). +go for dma_alloc_coherent directly instead). -Allocate memory from a pci pool like this: +Allocate memory from a dma pool like this: - cpu_addr = pci_pool_alloc(pool, flags, &dma_handle); + cpu_addr = dma_pool_alloc(pool, flags, &dma_handle); flags are SLAB_KERNEL if blocking is permitted (not in_interrupt nor -holding SMP locks), SLAB_ATOMIC otherwise. Like pci_alloc_consistent, +holding SMP locks), SLAB_ATOMIC otherwise. Like dma_alloc_coherent, this returns two values, cpu_addr and dma_handle. -Free memory that was allocated from a pci_pool like this: +Free memory that was allocated from a dma_pool like this: - pci_pool_free(pool, cpu_addr, dma_handle); + dma_pool_free(pool, cpu_addr, dma_handle); -where pool is what you passed to pci_pool_alloc, and cpu_addr and -dma_handle are the values pci_pool_alloc returned. This function +where pool is what you passed to dma_pool_alloc, and cpu_addr and +dma_handle are the values dma_pool_alloc returned. This function may be called in interrupt context. -Destroy a pci_pool by calling: +Destroy a dma_pool by calling: - pci_pool_destroy(pool); + dma_pool_destroy(pool); -Make sure you've called pci_pool_free for all memory allocated +Make sure you've called dma_pool_free for all memory allocated from a pool before you destroy the pool. This function may not be called in interrupt context. @@ -411,15 +406,15 @@ The interfaces described in subsequent portions of this document take a DMA direction argument, which is an integer and takes on one of the following values: - PCI_DMA_BIDIRECTIONAL - PCI_DMA_TODEVICE - PCI_DMA_FROMDEVICE - PCI_DMA_NONE + DMA_BIDIRECTIONAL + DMA_TO_DEVICE + DMA_FROM_DEVICE + DMA_NONE You should provide the exact DMA direction if you know it. -PCI_DMA_TODEVICE means "from main memory to the PCI device" -PCI_DMA_FROMDEVICE means "from the PCI device to main memory" +DMA_TO_DEVICE means "from main memory to the device" +DMA_FROM_DEVICE means "from the device to main memory" It is the direction in which the data moves during the DMA transfer. @@ -427,12 +422,12 @@ You are _strongly_ encouraged to specify this as precisely as you possibly can. If you absolutely cannot know the direction of the DMA transfer, -specify PCI_DMA_BIDIRECTIONAL. It means that the DMA can go in +specify DMA_BIDIRECTIONAL. It means that the DMA can go in either direction. The platform guarantees that you may legally specify this, and that it will work, but this may be at the cost of performance for example. -The value PCI_DMA_NONE is to be used for debugging. One can +The value DMA_NONE is to be used for debugging. One can hold this in a data structure before you come to know the precise direction, and this will help catch cases where your direction tracking logic has failed to set things up properly.
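Pulling the dma_pool calls above into one hedged lifecycle sketch (the
pool name, block size, and alignment are arbitrary example values, and
GFP_KERNEL is the current spelling of the SLAB_KERNEL flag mentioned
above):

	struct dma_pool *pool;
	dma_addr_t dma_handle;
	void *cpu_addr;

	/* 64-byte blocks, 16-byte aligned, no boundary restriction. */
	pool = dma_pool_create("mydev_desc", dev, 64, 16, 0);
	if (!pool)
		goto fail;

	cpu_addr = dma_pool_alloc(pool, GFP_KERNEL, &dma_handle);
	if (!cpu_addr)
		goto fail;

	/* ... hand dma_handle to the device, use cpu_addr locally ... */

	dma_pool_free(pool, cpu_addr, dma_handle);
	dma_pool_destroy(pool);		/* only after all blocks are freed */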
@@ -442,21 +437,21 @@ potential platform-specific optimizations of such) is for debugging. Some platforms actually have a write permission boolean which DMA mappings can be marked with, much like page protections in the user program address space. Such platforms can and do report errors in the -kernel logs when the PCI controller hardware detects violation of the +kernel logs when the DMA controller hardware detects violation of the permission setting. Only streaming mappings specify a direction, consistent mappings implicitly have a direction attribute setting of -PCI_DMA_BIDIRECTIONAL. +DMA_BIDIRECTIONAL. The SCSI subsystem tells you the direction to use in the 'sc_data_direction' member of the SCSI command your driver is working on. For Networking drivers, it's a rather simple affair. For transmit -packets, map/unmap them with the PCI_DMA_TODEVICE direction +packets, map/unmap them with the DMA_TO_DEVICE direction specifier. For receive packets, just the opposite, map/unmap them -with the PCI_DMA_FROMDEVICE direction specifier. +with the DMA_FROM_DEVICE direction specifier. Using Streaming DMA mappings @@ -467,43 +462,43 @@ scatterlist. To map a single region, you do: - struct pci_dev *pdev = mydev->pdev; + struct device *dev = &my_dev->dev; dma_addr_t dma_handle; void *addr = buffer->ptr; size_t size = buffer->len; - dma_handle = pci_map_single(pdev, addr, size, direction); + dma_handle = dma_map_single(dev, addr, size, direction); and to unmap it: - pci_unmap_single(pdev, dma_handle, size, direction); + dma_unmap_single(dev, dma_handle, size, direction); -You should call pci_unmap_single when the DMA activity is finished, e.g. +You should call dma_unmap_single when the DMA activity is finished, e.g. from the interrupt which told you that the DMA transfer is done. Using cpu pointers like this for single mappings has a disadvantage: you cannot reference HIGHMEM memory in this way. Thus, there is a -map/unmap interface pair akin to pci_{map,unmap}_single. These +map/unmap interface pair akin to dma_{map,unmap}_single. These interfaces deal with page/offset pairs instead of cpu pointers. Specifically: - struct pci_dev *pdev = mydev->pdev; + struct device *dev = &my_dev->dev; dma_addr_t dma_handle; struct page *page = buffer->page; unsigned long offset = buffer->offset; size_t size = buffer->len; - dma_handle = pci_map_page(pdev, page, offset, size, direction); + dma_handle = dma_map_page(dev, page, offset, size, direction); ... - pci_unmap_page(pdev, dma_handle, size, direction); + dma_unmap_page(dev, dma_handle, size, direction); Here, "offset" means byte offset within the given page. With scatterlists, you map a region gathered from several regions by: - int i, count = pci_map_sg(pdev, sglist, nents, direction); + int i, count = dma_map_sg(dev, sglist, nents, direction); struct scatterlist *sg; for_each_sg(sglist, sg, count, i) { @@ -527,16 +522,16 @@ accessed sg->address and sg->length as shown above. To unmap a scatterlist, just call: - pci_unmap_sg(pdev, sglist, nents, direction); + dma_unmap_sg(dev, sglist, nents, direction); Again, make sure DMA activity has already finished. -PLEASE NOTE: The 'nents' argument to the pci_unmap_sg call must be - the _same_ one you passed into the pci_map_sg call, +PLEASE NOTE: The 'nents' argument to the dma_unmap_sg call must be + the _same_ one you passed into the dma_map_sg call, it should _NOT_ be the 'count' value _returned_ from the - pci_map_sg call. + dma_map_sg call.
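To make the 'nents' versus 'count' distinction concrete, here is a
hedged sketch; mydev_program_entry() is a stand-in for whatever
programs your device's scatter-gather engine:

	struct scatterlist *sg;
	int i, count;

	count = dma_map_sg(dev, sglist, nents, DMA_TO_DEVICE);
	if (count == 0)
		goto map_failed;

	for_each_sg(sglist, sg, count, i)
		mydev_program_entry(i, sg_dma_address(sg), sg_dma_len(sg));

	/* ... DMA completes ... */

	/* Unmap with the original nents, not the count returned above. */
	dma_unmap_sg(dev, sglist, nents, DMA_TO_DEVICE);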
-Every pci_map_{single,sg} call should have its pci_unmap_{single,sg} +Every dma_map_{single,sg} call should have its dma_unmap_{single,sg} counterpart, because the bus address space is a shared resource (although in some ports the mapping is per each BUS so fewer devices contend for the same bus address space) and you could render the machine unusable by eating all bus addresses. @@ -547,14 +542,14 @@ the data in between the DMA transfers, the buffer needs to be synced properly in order for the cpu and device to see the most up-to-date and correct copy of the DMA buffer. -So, firstly, just map it with pci_map_{single,sg}, and after each DMA +So, firstly, just map it with dma_map_{single,sg}, and after each DMA transfer call either: - pci_dma_sync_single_for_cpu(pdev, dma_handle, size, direction); + dma_sync_single_for_cpu(dev, dma_handle, size, direction); or: - pci_dma_sync_sg_for_cpu(pdev, sglist, nents, direction); + dma_sync_sg_for_cpu(dev, sglist, nents, direction); as appropriate. @@ -562,27 +557,27 @@ Then, if you wish to let the device get at the DMA area again, finish accessing the data with the cpu, and then before actually giving the buffer to the hardware call either: - pci_dma_sync_single_for_device(pdev, dma_handle, size, direction); + dma_sync_single_for_device(dev, dma_handle, size, direction); or: - pci_dma_sync_sg_for_device(dev, sglist, nents, direction); + dma_sync_sg_for_device(dev, sglist, nents, direction); as appropriate. After the last DMA transfer call one of the DMA unmap routines -pci_unmap_{single,sg}. If you don't touch the data from the first pci_map_* -call till pci_unmap_*, then you don't have to call the pci_dma_sync_* +dma_unmap_{single,sg}. If you don't touch the data from the first dma_map_* +call till dma_unmap_*, then you don't have to call the dma_sync_* routines at all. Here is pseudo code which shows a situation in which you would need -to use the pci_dma_sync_*() interfaces. +to use the dma_sync_*() interfaces. my_card_setup_receive_buffer(struct my_card *cp, char *buffer, int len) { dma_addr_t mapping; - mapping = pci_map_single(cp->pdev, buffer, len, PCI_DMA_FROMDEVICE); + mapping = dma_map_single(cp->dev, buffer, len, DMA_FROM_DEVICE); cp->rx_buf = buffer; cp->rx_len = len; @@ -606,25 +601,25 @@ to use the pci_dma_sync_*() interfaces. * the DMA transfer with the CPU first * so that we see updated contents. */ - pci_dma_sync_single_for_cpu(cp->pdev, cp->rx_dma, - cp->rx_len, - PCI_DMA_FROMDEVICE); + dma_sync_single_for_cpu(cp->dev, cp->rx_dma, + cp->rx_len, + DMA_FROM_DEVICE); /* Now it is safe to examine the buffer. */ hp = (struct my_card_header *) cp->rx_buf; if (header_is_ok(hp)) { - pci_unmap_single(cp->pdev, cp->rx_dma, cp->rx_len, - PCI_DMA_FROMDEVICE); + dma_unmap_single(cp->dev, cp->rx_dma, cp->rx_len, + DMA_FROM_DEVICE); pass_to_upper_layers(cp->rx_buf); make_and_setup_new_rx_buf(cp); } else { /* Just sync the buffer and give it back * to the card. */ - pci_dma_sync_single_for_device(cp->pdev, - cp->rx_dma, - cp->rx_len, - PCI_DMA_FROMDEVICE); + dma_sync_single_for_device(cp->dev, + cp->rx_dma, + cp->rx_len, + DMA_FROM_DEVICE); give_rx_buf_to_card(cp); } } @@ -634,19 +629,19 @@ Drivers converted fully to this interface should not use virt_to_bus any longer, nor should they use bus_to_virt.
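Concretely, the conversion usually amounts to recording both views of
a buffer at the time the mapping is made, instead of deriving one from
the other later; a hedged sketch with invented field names:

	struct my_rx_desc {
		void		*cpu_addr;	/* what the driver reads */
		dma_addr_t	dma_addr;	/* what the card is given */
		size_t		len;
	};

	desc->cpu_addr = buffer;
	desc->len = len;
	desc->dma_addr = dma_map_single(dev, buffer, len, DMA_FROM_DEVICE);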
Some drivers have to be changed a little bit, because there is no longer an equivalent to bus_to_virt in the dynamic DMA mapping scheme - you have to always store the DMA addresses -returned by the pci_alloc_consistent, pci_pool_alloc, and pci_map_single -calls (pci_map_sg stores them in the scatterlist itself if the platform +returned by the dma_alloc_coherent, dma_pool_alloc, and dma_map_single +calls (dma_map_sg stores them in the scatterlist itself if the platform supports dynamic DMA mapping in hardware) in your driver structures and/or in the card registers. -All PCI drivers should be using these interfaces with no exceptions. -It is planned to completely remove virt_to_bus() and bus_to_virt() as +All drivers should be using these interfaces with no exceptions. It +is planned to completely remove virt_to_bus() and bus_to_virt() as they are entirely deprecated. Some ports already do not provide these as it is impossible to correctly support them. Optimizing Unmap State Space Consumption -On many platforms, pci_unmap_{single,page}() is simply a nop. +On many platforms, dma_unmap_{single,page}() is simply a nop. Therefore, keeping track of the mapping address and length is a waste of space. Instead of filling your drivers up with ifdefs and the like to "work around" this (which would defeat the whole purpose of a @@ -655,7 +650,7 @@ portable API) the following facilities are provided. Actually, instead of describing the macros one by one, we'll transform some example code. -1) Use DECLARE_PCI_UNMAP_{ADDR,LEN} in state saving structures. +1) Use DEFINE_DMA_UNMAP_{ADDR,LEN} in state saving structures. Example, before: struct ring_state { @@ -668,14 +663,11 @@ transform some example code. struct ring_state { struct sk_buff *skb; - DECLARE_PCI_UNMAP_ADDR(mapping) - DECLARE_PCI_UNMAP_LEN(len) + DEFINE_DMA_UNMAP_ADDR(mapping); + DEFINE_DMA_UNMAP_LEN(len); }; - NOTE: DO NOT put a semicolon at the end of the DECLARE_*() - macro. - -2) Use pci_unmap_{addr,len}_set to set these values. +2) Use dma_unmap_{addr,len}_set to set these values. Example, before: ringp->mapping = FOO; @@ -683,21 +675,21 @@ transform some example code. after: - pci_unmap_addr_set(ringp, mapping, FOO); - pci_unmap_len_set(ringp, len, BAR); + dma_unmap_addr_set(ringp, mapping, FOO); + dma_unmap_len_set(ringp, len, BAR); -3) Use pci_unmap_{addr,len} to access these values. +3) Use dma_unmap_{addr,len} to access these values. Example, before: - pci_unmap_single(pdev, ringp->mapping, ringp->len, - PCI_DMA_FROMDEVICE); + dma_unmap_single(dev, ringp->mapping, ringp->len, + DMA_FROM_DEVICE); after: - pci_unmap_single(pdev, - pci_unmap_addr(ringp, mapping), - pci_unmap_len(ringp, len), - PCI_DMA_FROMDEVICE); + dma_unmap_single(dev, + dma_unmap_addr(ringp, mapping), + dma_unmap_len(ringp, len), + DMA_FROM_DEVICE); It really should be self-explanatory. We treat the ADDR and LEN separately, because it is possible for an implementation to only @@ -732,15 +724,15 @@ to "Closing". 
DMA address space is limited on some architectures and an allocation failure can be determined by: -- checking if pci_alloc_consistent returns NULL or pci_map_sg returns 0 +- checking if dma_alloc_coherent returns NULL or dma_map_sg returns 0 -- checking the returned dma_addr_t of pci_map_single and pci_map_page - by using pci_dma_mapping_error(): +- checking the returned dma_addr_t of dma_map_single and dma_map_page + by using dma_mapping_error(): dma_addr_t dma_handle; - dma_handle = pci_map_single(pdev, addr, size, direction); - if (pci_dma_mapping_error(pdev, dma_handle)) { + dma_handle = dma_map_single(dev, addr, size, direction); + if (dma_mapping_error(dev, dma_handle)) { /* * reduce current DMA mapping usage, * delay and try again later or diff --git a/Documentation/RCU/00-INDEX b/Documentation/RCU/00-INDEX index 9bb62f7b89c3ebaa7375a2fd8c79f65a34674c27..71b6f500ddb9ae3914ba8f31988470c4d8cb8d3c 100644 --- a/Documentation/RCU/00-INDEX +++ b/Documentation/RCU/00-INDEX @@ -6,16 +6,22 @@ checklist.txt - Review Checklist for RCU Patches listRCU.txt - Using RCU to Protect Read-Mostly Linked Lists +lockdep.txt + - RCU and lockdep checking NMI-RCU.txt - Using RCU to Protect Dynamic NMI Handlers +rcubarrier.txt + - RCU and Unloadable Modules +rculist_nulls.txt + - RCU list primitives for use with SLAB_DESTROY_BY_RCU rcuref.txt - Reference-count design for elements of lists/arrays protected by RCU rcu.txt - RCU Concepts -rcubarrier.txt - - Unloading modules that use RCU callbacks RTFP.txt - List of RCU papers (bibliography) going back to 1980. +stallwarn.txt + - RCU CPU stall warnings (CONFIG_RCU_CPU_STALL_DETECTOR) torture.txt - RCU Torture Test Operation (CONFIG_RCU_TORTURE_TEST) trace.txt diff --git a/Documentation/RCU/RTFP.txt b/Documentation/RCU/RTFP.txt index d2b85237c76e9cd4527f66bc5ffef3d40273e6c3..5aea459e3dd6cbbfa87ad224f0faec570ed77875 100644 --- a/Documentation/RCU/RTFP.txt +++ b/Documentation/RCU/RTFP.txt @@ -25,10 +25,10 @@ to be referencing the data structure. However, this mechanism was not optimized for modern computer systems, which is not surprising given that these overheads were not so expensive in the mid-80s. Nonetheless, passive serialization appears to be the first deferred-destruction -mechanism to be used in production. Furthermore, the relevant patent has -lapsed, so this approach may be used in non-GPL software, if desired. -(In contrast, use of RCU is permitted only in software licensed under -GPL. Sorry!!!) +mechanism to be used in production. Furthermore, the relevant patent +has lapsed, so this approach may be used in non-GPL software, if desired. +(In contrast, implementation of RCU is permitted only in software licensed +under either GPL or LGPL. Sorry!!!) In 1990, Pugh [Pugh90] noted that explicitly tracking which threads were reading a given data structure permitted deferred free to operate @@ -150,6 +150,18 @@ preemptible RCU [PaulEMcKenney2007PreemptibleRCU], and the three-part LWN "What is RCU?" series [PaulEMcKenney2007WhatIsRCUFundamentally, PaulEMcKenney2008WhatIsRCUUsage, and PaulEMcKenney2008WhatIsRCUAPI]. +2008 saw a journal paper on real-time RCU [DinakarGuniguntala2008IBMSysJ], +a history of how Linux changed RCU more than RCU changed Linux +[PaulEMcKenney2008RCUOSR], and a design overview of hierarchical RCU +[PaulEMcKenney2008HierarchicalRCU]. + +2009 introduced user-level RCU algorithms [PaulEMcKenney2009MaliciousURCU], +which Mathieu Desnoyers is now maintaining [MathieuDesnoyers2009URCU] +[MathieuDesnoyersPhD]. 
TINY_RCU [PaulEMcKenney2009BloatWatchRCU] made +its appearance, as did expedited RCU [PaulEMcKenney2009expeditedRCU]. +The problem of resizeable RCU-protected hash tables may now be on a path +to a solution [JoshTriplett2009RPHash]. + Bibtex Entries @article{Kung80 @@ -730,6 +742,11 @@ Revised: " } +# +# "What is RCU?" LWN series. +# +######################################################################## + @article{DinakarGuniguntala2008IBMSysJ ,author="D. Guniguntala and P. E. McKenney and J. Triplett and J. Walpole" ,title="The read-copy-update mechanism for supporting real-time applications on shared-memory multiprocessor systems with {Linux}" @@ -820,3 +837,39 @@ Revised: Uniprocessor assumptions allow simplified RCU implementation. " } + +@unpublished{PaulEMcKenney2009expeditedRCU +,Author="Paul E. McKenney" +,Title="[{PATCH} -tip 0/3] expedited 'big hammer' {RCU} grace periods" +,month="June" +,day="25" +,year="2009" +,note="Available: +\url{http://lkml.org/lkml/2009/6/25/306} +[Viewed August 16, 2009]" +,annotation=" + First posting of expedited RCU to be accepted into -tip. +" +} + +@unpublished{JoshTriplett2009RPHash +,Author="Josh Triplett" +,Title="Scalable concurrent hash tables via relativistic programming" +,month="September" +,year="2009" +,note="Linux Plumbers Conference presentation" +,annotation=" + RP fun with hash tables. +" +} + +@phdthesis{MathieuDesnoyersPhD +, title = "Low-Impact Operating System Tracing" +, author = "Mathieu Desnoyers" +, school = "Ecole Polytechnique de Montr\'{e}al" +, month = "December" +, year = 2009 +,note="Available: +\url{http://www.lttng.org/pub/thesis/desnoyers-dissertation-2009-12.pdf} +[Viewed December 9, 2009]" +} diff --git a/Documentation/RCU/checklist.txt b/Documentation/RCU/checklist.txt index 51525a30e8b4015d9c1ca8c3866b274a6a5874f3..cbc180f90194fcb01bc273f8b953d91b28a79fae 100644 --- a/Documentation/RCU/checklist.txt +++ b/Documentation/RCU/checklist.txt @@ -8,13 +8,12 @@ would cause. This list is based on experiences reviewing such patches over a rather long period of time, but improvements are always welcome! 0. Is RCU being applied to a read-mostly situation? If the data - structure is updated more than about 10% of the time, then - you should strongly consider some other approach, unless - detailed performance measurements show that RCU is nonetheless - the right tool for the job. Yes, you might think of RCU - as simply cutting overhead off of the readers and imposing it - on the writers. That is exactly why normal uses of RCU will - do much more reading than updating. + structure is updated more than about 10% of the time, then you + should strongly consider some other approach, unless detailed + performance measurements show that RCU is nonetheless the right + tool for the job. Yes, RCU does reduce read-side overhead by + increasing write-side overhead, which is exactly why normal uses + of RCU will do much more reading than updating. Another exception is where performance is not an issue, and RCU provides a simpler implementation. An example of this situation @@ -35,13 +34,13 @@ over a rather long period of time, but improvements are always welcome! If you choose #b, be prepared to describe how you have handled memory barriers on weakly ordered machines (pretty much all of - them -- even x86 allows reads to be reordered), and be prepared - to explain why this added complexity is worthwhile. 
If you - choose #c, be prepared to explain how this single task does not - become a major bottleneck on big multiprocessor machines (for - example, if the task is updating information relating to itself - that other tasks can read, there by definition can be no - bottleneck). + them -- even x86 allows later loads to be reordered to precede + earlier stores), and be prepared to explain why this added + complexity is worthwhile. If you choose #c, be prepared to + explain how this single task does not become a major bottleneck on + big multiprocessor machines (for example, if the task is updating + information relating to itself that other tasks can read, there + by definition can be no bottleneck). 2. Do the RCU read-side critical sections make proper use of rcu_read_lock() and friends? These primitives are needed @@ -51,8 +50,10 @@ over a rather long period of time, but improvements are always welcome! actuarial risk of your kernel. As a rough rule of thumb, any dereference of an RCU-protected - pointer must be covered by rcu_read_lock() or rcu_read_lock_bh() - or by the appropriate update-side lock. + pointer must be covered by rcu_read_lock(), rcu_read_lock_bh(), + rcu_read_lock_sched(), or by the appropriate update-side lock. + Disabling of preemption can serve as rcu_read_lock_sched(), but + is less readable. 3. Does the update code tolerate concurrent accesses? @@ -62,25 +63,27 @@ over a rather long period of time, but improvements are always welcome! of ways to handle this concurrency, depending on the situation: a. Use the RCU variants of the list and hlist update - primitives to add, remove, and replace elements on an - RCU-protected list. Alternatively, use the RCU-protected - trees that have been added to the Linux kernel. + primitives to add, remove, and replace elements on + an RCU-protected list. Alternatively, use the other + RCU-protected data structures that have been added to + the Linux kernel. This is almost always the best approach. b. Proceed as in (a) above, but also maintain per-element locks (that are acquired by both readers and writers) that guard per-element state. Of course, fields that - the readers refrain from accessing can be guarded by the - update-side lock. + the readers refrain from accessing can be guarded by + some other lock acquired only by updaters, if desired. This works quite well, also. c. Make updates appear atomic to readers. For example, - pointer updates to properly aligned fields will appear - atomic, as will individual atomic primitives. Operations - performed under a lock and sequences of multiple atomic - primitives will -not- appear to be atomic. + pointer updates to properly aligned fields will + appear atomic, as will individual atomic primitives. + Sequences of operations performed under a lock will -not- + appear to be atomic to RCU readers, nor will sequences + of multiple atomic primitives. This can work, but is starting to get a bit tricky. @@ -98,9 +101,9 @@ over a rather long period of time, but improvements are always welcome! a new structure containing updated values. 4. Weakly ordered CPUs pose special challenges. Almost all CPUs - are weakly ordered -- even i386 CPUs allow reads to be reordered. - RCU code must take all of the following measures to prevent - memory-corruption problems: + are weakly ordered -- even x86 CPUs allow later loads to be + reordered to precede earlier stores. RCU code must take all of + the following measures to prevent memory-corruption problems: a.
Readers must maintain proper ordering of their memory accesses. The rcu_dereference() primitive ensures that @@ -113,14 +116,25 @@ over a rather long period of time, but improvements are always welcome! The rcu_dereference() primitive is also an excellent documentation aid, letting the person reading the code know exactly which pointers are protected by RCU. - - The rcu_dereference() primitive is used by the various - "_rcu()" list-traversal primitives, such as the - list_for_each_entry_rcu(). Note that it is perfectly - legal (if redundant) for update-side code to use - rcu_dereference() and the "_rcu()" list-traversal - primitives. This is particularly useful in code - that is common to readers and updaters. + Please note that compilers can also reorder code, and + they are becoming increasingly aggressive about doing + just that. The rcu_dereference() primitive therefore + also prevents destructive compiler optimizations. + + The rcu_dereference() primitive is used by the + various "_rcu()" list-traversal primitives, such + as the list_for_each_entry_rcu(). Note that it is + perfectly legal (if redundant) for update-side code to + use rcu_dereference() and the "_rcu()" list-traversal + primitives. This is particularly useful in code that + is common to readers and updaters. However, lockdep + will complain if you access rcu_dereference() outside + of an RCU read-side critical section. See lockdep.txt + to learn what to do about this. + + Of course, neither rcu_dereference() nor the "_rcu()" + list-traversal primitives can substitute for a good + concurrency design coordinating among multiple updaters. b. If the list macros are being used, the list_add_tail_rcu() and list_add_rcu() primitives must be used in order @@ -135,11 +149,14 @@ over a rather long period of time, but improvements are always welcome! readers. Similarly, if the hlist macros are being used, the hlist_del_rcu() primitive is required. - The list_replace_rcu() primitive may be used to - replace an old structure with a new one in an - RCU-protected list. + The list_replace_rcu() and hlist_replace_rcu() primitives + may be used to replace an old structure with a new one + in their respective types of RCU-protected lists. + + d. Rules similar to (4b) and (4c) apply to the "hlist_nulls" + type of RCU-protected linked lists. - d. Updates must ensure that initialization of a given + e. Updates must ensure that initialization of a given structure happens before pointers to that structure are publicized. Use the rcu_assign_pointer() primitive when publicizing a pointer to a structure that can @@ -151,16 +168,31 @@ over a rather long period of time, but improvements are always welcome! it cannot block. 6. Since synchronize_rcu() can block, it cannot be called from - any sort of irq context. Ditto for synchronize_sched() and - synchronize_srcu(). - -7. If the updater uses call_rcu(), then the corresponding readers - must use rcu_read_lock() and rcu_read_unlock(). If the updater - uses call_rcu_bh(), then the corresponding readers must use - rcu_read_lock_bh() and rcu_read_unlock_bh(). If the updater - uses call_rcu_sched(), then the corresponding readers must - disable preemption. Mixing things up will result in confusion - and broken kernels. + any sort of irq context. The same rule applies for + synchronize_rcu_bh(), synchronize_sched(), synchronize_srcu(), + synchronize_rcu_expedited(), synchronize_rcu_bh_expedited(), + synchronize_sched_expedited(), and synchronize_srcu_expedited().
+ + The expedited forms of these primitives have the same semantics + as the non-expedited forms, but expediting is both expensive + and unfriendly to real-time workloads. Use of the expedited + primitives should be restricted to rare configuration-change + operations that would not normally be undertaken while a real-time + workload is running. + +7. If the updater uses call_rcu() or synchronize_rcu(), then the + corresponding readers must use rcu_read_lock() and + rcu_read_unlock(). If the updater uses call_rcu_bh() or + synchronize_rcu_bh(), then the corresponding readers must + use rcu_read_lock_bh() and rcu_read_unlock_bh(). If the + updater uses call_rcu_sched() or synchronize_sched(), then + the corresponding readers must disable preemption, possibly + by calling rcu_read_lock_sched() and rcu_read_unlock_sched(). + If the updater uses synchronize_srcu(), then the corresponding + readers must use srcu_read_lock() and srcu_read_unlock(), + and with the same srcu_struct. The rules for the expedited + primitives are the same as for their non-expedited counterparts. + Mixing things up will result in confusion and broken kernels. One exception to this rule: rcu_read_lock() and rcu_read_unlock() may be substituted for rcu_read_lock_bh() and rcu_read_unlock_bh() @@ -212,6 +244,8 @@ over a rather long period of time, but improvements are always welcome! e. Periodically invoke synchronize_rcu(), permitting a limited number of updates per grace period. + The same cautions apply to call_rcu_bh() and call_rcu_sched(). + 9. All RCU list-traversal primitives, which include rcu_dereference(), list_for_each_entry_rcu(), list_for_each_continue_rcu(), and list_for_each_safe_rcu(), @@ -219,7 +253,9 @@ over a rather long period of time, but improvements are always welcome! must be protected by appropriate update-side locks. RCU read-side critical sections are delimited by rcu_read_lock() and rcu_read_unlock(), or by similar primitives such as - rcu_read_lock_bh() and rcu_read_unlock_bh(). + rcu_read_lock_bh() and rcu_read_unlock_bh(), in which case + the matching rcu_dereference() primitive must be used in order + to keep lockdep happy, in this case, rcu_dereference_bh(). The reason that it is permissible to use RCU list-traversal primitives when the update-side lock is held is that doing so @@ -229,7 +265,8 @@ over a rather long period of time, but improvements are always welcome! 10. Conversely, if you are in an RCU read-side critical section, and you don't hold the appropriate update-side lock, you -must- use the "_rcu()" variants of the list macros. Failing to do so - will break Alpha and confuse people reading your code. + will break Alpha, cause aggressive compilers to generate bad code, + and confuse people trying to read your code. 11. Note that synchronize_rcu() -only- guarantees to wait until all currently executing rcu_read_lock()-protected RCU read-side @@ -239,15 +276,21 @@ over a rather long period of time, but improvements are always welcome! rcu_read_lock()-protected read-side critical sections, do -not- use synchronize_rcu(). - If you want to wait for some of these other things, you might - instead need to use synchronize_irq() or synchronize_sched(). + Similarly, disabling preemption is not an acceptable substitute + for rcu_read_lock(). Code that attempts to use preemption + disabling where it should be using rcu_read_lock() will break + in real-time kernel builds.
+ + If you want to wait for interrupt handlers, NMI handlers, and + code under the influence of preempt_disable(), you instead + need to use synchronize_irq() or synchronize_sched(). 12. Any lock acquired by an RCU callback must be acquired elsewhere with softirq disabled, e.g., via spin_lock_irqsave(), spin_lock_bh(), etc. Failing to disable irq on a given - acquisition of that lock will result in deadlock as soon as the - RCU callback happens to interrupt that acquisition's critical - section. + acquisition of that lock will result in deadlock as soon as + the RCU softirq handler happens to run your RCU callback while + interrupting that acquisition's critical section. 13. RCU callbacks can be and are executed in parallel. In many cases, the callback code simply wraps kfree(), so that this @@ -265,29 +308,30 @@ over a rather long period of time, but improvements are always welcome! not the case, a self-spawning RCU callback would prevent the victim CPU from ever going offline.) -14. SRCU (srcu_read_lock(), srcu_read_unlock(), and synchronize_srcu()) - may only be invoked from process context. Unlike other forms of - RCU, it -is- permissible to block in an SRCU read-side critical - section (demarked by srcu_read_lock() and srcu_read_unlock()), - hence the "SRCU": "sleepable RCU". Please note that if you - don't need to sleep in read-side critical sections, you should - be using RCU rather than SRCU, because RCU is almost always - faster and easier to use than is SRCU. +14. SRCU (srcu_read_lock(), srcu_read_unlock(), srcu_dereference(), + synchronize_srcu(), and synchronize_srcu_expedited()) may only + be invoked from process context. Unlike other forms of RCU, it + -is- permissible to block in an SRCU read-side critical section + (demarked by srcu_read_lock() and srcu_read_unlock()), hence the + "SRCU": "sleepable RCU". Please note that if you don't need + to sleep in read-side critical sections, you should be using + RCU rather than SRCU, because RCU is almost always faster and + easier to use than is SRCU. Also unlike other forms of RCU, explicit initialization and cleanup is required via init_srcu_struct() and cleanup_srcu_struct(). These are passed a "struct srcu_struct" that defines the scope of a given SRCU domain. Once initialized, the srcu_struct is passed to srcu_read_lock(), srcu_read_unlock() - and synchronize_srcu(). A given synchronize_srcu() waits only - for SRCU read-side critical sections governed by srcu_read_lock() - and srcu_read_unlock() calls that have been passd the same - srcu_struct. This property is what makes sleeping read-side - critical sections tolerable -- a given subsystem delays only - its own updates, not those of other subsystems using SRCU. - Therefore, SRCU is less prone to OOM the system than RCU would - be if RCU's read-side critical sections were permitted to - sleep. + synchronize_srcu(), and synchronize_srcu_expedited(). A given + synchronize_srcu() waits only for SRCU read-side critical + sections governed by srcu_read_lock() and srcu_read_unlock() + calls that have been passed the same srcu_struct. This property + is what makes sleeping read-side critical sections tolerable -- + a given subsystem delays only its own updates, not those of other + subsystems using SRCU. Therefore, SRCU is less prone to OOM the + system than RCU would be if RCU's read-side critical sections + were permitted to sleep. The ability to sleep in read-side critical sections does not come for free.
First, corresponding srcu_read_lock() and @@ -311,12 +355,12 @@ over a rather long period of time, but improvements are always welcome! destructive operation, and -only- -then- invoke call_rcu(), synchronize_rcu(), or friends. - Because these primitives only wait for pre-existing readers, - it is the caller's responsibility to guarantee safety to - any subsequent readers. + Because these primitives only wait for pre-existing readers, it + is the caller's responsibility to guarantee that any subsequent + readers will execute safely. -16. The various RCU read-side primitives do -not- contain memory - barriers. The CPU (and in some cases, the compiler) is free - to reorder code into and out of RCU read-side critical sections. - It is the responsibility of the RCU update-side primitives to - deal with this. +16. The various RCU read-side primitives do -not- necessarily contain + memory barriers. You should therefore plan for the CPU + and the compiler to freely reorder code into and out of RCU + read-side critical sections. It is the responsibility of the + RCU update-side primitives to deal with this. diff --git a/Documentation/RCU/lockdep.txt b/Documentation/RCU/lockdep.txt new file mode 100644 index 0000000000000000000000000000000000000000..fe24b58627bdde8f6a5d0b331436408f1d43d3ea --- /dev/null +++ b/Documentation/RCU/lockdep.txt @@ -0,0 +1,67 @@ +RCU and lockdep checking + +All flavors of RCU have lockdep checking available, so that lockdep is +aware of when each task enters and leaves any flavor of RCU read-side +critical section. Each flavor of RCU is tracked separately (but note +that this is not the case in 2.6.32 and earlier). This allows lockdep's +tracking to include RCU state, which can sometimes help when debugging +deadlocks and the like. + +In addition, RCU provides the following primitives that check lockdep's +state: + + rcu_read_lock_held() for normal RCU. + rcu_read_lock_bh_held() for RCU-bh. + rcu_read_lock_sched_held() for RCU-sched. + srcu_read_lock_held() for SRCU. + +These functions are conservative, and will therefore return 1 if they +aren't certain (for example, if CONFIG_DEBUG_LOCK_ALLOC is not set). +This prevents things like WARN_ON(!rcu_read_lock_held()) from giving false +positives when lockdep is disabled. + +In addition, a separate kernel config parameter CONFIG_PROVE_RCU enables +checking of rcu_dereference() primitives: + + rcu_dereference(p): + Check for RCU read-side critical section. + rcu_dereference_bh(p): + Check for RCU-bh read-side critical section. + rcu_dereference_sched(p): + Check for RCU-sched read-side critical section. + srcu_dereference(p, sp): + Check for SRCU read-side critical section. + rcu_dereference_check(p, c): + Use explicit check expression "c". + rcu_dereference_raw(p): + Don't check. (Use sparingly, if at all.) + +The rcu_dereference_check() check expression can be any boolean +expression, but would normally include one of the rcu_read_lock_held() +family of functions and a lockdep expression. For a moderately ornate +example, consider the following: + + file = rcu_dereference_check(fdt->fd[fd], + rcu_read_lock_held() || + lockdep_is_held(&files->file_lock) || + atomic_read(&files->count) == 1); + +This expression picks up the pointer "fdt->fd[fd]" in an RCU-safe manner, +and, if CONFIG_PROVE_RCU is configured, verifies that this expression +is used in: + +1. An RCU read-side critical section, or +2. with files->file_lock held, or +3. on an unshared files_struct.
+ +In case (1), the pointer is picked up in an RCU-safe manner for vanilla +RCU read-side critical sections, in case (2) the ->file_lock prevents +any change from taking place, and finally, in case (3) the current task +is the only task accessing the files_struct, again preventing any change +from taking place. + +There are currently only "universal" versions of the rcu_assign_pointer() +and RCU list-/tree-traversal primitives, which do not (yet) check for +being in an RCU read-side critical section. In the future, separate +versions of these primitives might be created. diff --git a/Documentation/RCU/rcu.txt b/Documentation/RCU/rcu.txt index 2a23523ce4712d2b972a0d99ea22fcf00912ab8c..31852705b5863063f453fbb500dd6e6dfa456498 100644 --- a/Documentation/RCU/rcu.txt +++ b/Documentation/RCU/rcu.txt @@ -75,6 +75,8 @@ o I hear that RCU is patented? What is with that? search for the string "Patent" in RTFP.txt to find them. Of these, one was allowed to lapse by the assignee, and the others have been contributed to the Linux kernel under GPL. + There are now also LGPL implementations of user-level RCU + available (http://lttng.org/?q=node/18). o I hear that RCU needs work in order to support realtime kernels? @@ -91,48 +93,4 @@ o Where can I find more information on RCU? o What are all these files in this directory? - - NMI-RCU.txt - - Describes how to use RCU to implement dynamic - NMI handlers, which can be revectored on the fly, - without rebooting. - - RTFP.txt - - List of RCU-related publications and web sites. - - UP.txt - - Discussion of RCU usage in UP kernels. - - arrayRCU.txt - - Describes how to use RCU to protect arrays, with - resizeable arrays whose elements reference other - data structures being of the most interest. - - checklist.txt - - Lists things to check for when inspecting code that - uses RCU. - - listRCU.txt - - Describes how to use RCU to protect linked lists. - This is the simplest and most common use of RCU - in the Linux kernel. - - rcu.txt - - You are reading it! - - rcuref.txt - - Describes how to combine use of reference counts - with RCU. - - whatisRCU.txt - - Overview of how the RCU implementation works. Along - the way, presents a conceptual view of RCU. + See 00-INDEX for the list. diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt new file mode 100644 index 0000000000000000000000000000000000000000..1423d2570d78833b8421f51e248f4dabbd3f5bc5 --- /dev/null +++ b/Documentation/RCU/stallwarn.txt @@ -0,0 +1,58 @@ +Using RCU's CPU Stall Detector + +The CONFIG_RCU_CPU_STALL_DETECTOR kernel config parameter enables +RCU's CPU stall detector, which detects conditions that unduly delay +RCU grace periods. The stall detector's idea of what constitutes +"unduly delayed" is controlled by a set of C preprocessor macros: + +RCU_SECONDS_TILL_STALL_CHECK + + This macro defines the period of time that RCU will wait from + the beginning of a grace period until it issues an RCU CPU + stall warning. It is normally ten seconds. + +RCU_SECONDS_TILL_STALL_RECHECK + + This macro defines the period of time that RCU will wait after + issuing a stall warning until it issues another stall warning. + It is normally set to thirty seconds. + +RCU_STALL_RAT_DELAY + + The CPU stall detector tries to make the offending CPU rat on itself, + as this often gives better-quality stack traces. However, if + the offending CPU does not detect its own stall in the number + of jiffies specified by RCU_STALL_RAT_DELAY, then other CPUs will + complain.
This is normally set to two jiffies. + +The following problems can result in an RCU CPU stall warning: + +o A CPU looping in an RCU read-side critical section. + +o A CPU looping with interrupts disabled. + +o A CPU looping with preemption disabled. + +o For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the kernel + without invoking schedule(). + +o A bug in the RCU implementation. + +o A hardware failure. This is quite unlikely, but has occurred + at least once in a former life. A CPU failed in a running system, + becoming unresponsive, but not causing an immediate crash. + This resulted in a series of RCU CPU stall warnings, eventually + leading to the realization that the CPU had failed. + +The RCU, RCU-sched, and RCU-bh implementations have CPU stall warnings. +SRCU does not do so directly, but its calls to synchronize_sched() will +result in RCU-sched detecting any CPU stalls that might be occurring. + +To diagnose the cause of the stall, inspect the stack traces. The offending +function will usually be near the top of the stack. If you have a series +of stall warnings from a single extended stall, comparing the stack traces +can often help determine where the stall is occurring, which will usually +be in the function nearest the top of the stack that stays the same from +trace to trace. + +RCU bugs can often be debugged with the help of CONFIG_RCU_TRACE. diff --git a/Documentation/RCU/torture.txt b/Documentation/RCU/torture.txt index 9dba3bb90e60a4710222d864124453d0d861b8c6..0e50bc2aa1e2e4f4682f5b5540a49987bfb00884 100644 --- a/Documentation/RCU/torture.txt +++ b/Documentation/RCU/torture.txt @@ -30,6 +30,18 @@ MODULE PARAMETERS This module has the following parameters: +fqs_duration Duration (in microseconds) of artificially induced bursts + of force_quiescent_state() invocations. In RCU + implementations having force_quiescent_state(), these + bursts help force races between forcing a given grace + period and that grace period ending on its own. + +fqs_holdoff Holdoff time (in microseconds) between consecutive calls + to force_quiescent_state() within a burst. + +fqs_stutter Wait time (in seconds) between consecutive bursts + of calls to force_quiescent_state(). + irqreaders Says to invoke RCU readers from irq level. This is currently done via timers. Defaults to "1" for variants of RCU that permit this. (Or, more accurately, variants of RCU that do diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt index d542ca243b80d52afcc0ce9a32e7e5faa20a0920..1dc00ee97163261bb3390e21499e45b9a44bde20 100644 --- a/Documentation/RCU/whatisRCU.txt +++ b/Documentation/RCU/whatisRCU.txt @@ -323,14 +323,17 @@ used as follows: Defer Protect a. synchronize_rcu() rcu_read_lock() / rcu_read_unlock() - call_rcu() + call_rcu() rcu_dereference() b. call_rcu_bh() rcu_read_lock_bh() / rcu_read_unlock_bh() + rcu_dereference_bh() -c. synchronize_sched() preempt_disable() / preempt_enable() +c. synchronize_sched() rcu_read_lock_sched() / rcu_read_unlock_sched() + preempt_disable() / preempt_enable() local_irq_save() / local_irq_restore() hardirq enter / hardirq exit NMI enter / NMI exit + rcu_dereference_sched() These three mechanisms are used as follows: @@ -780,9 +783,8 @@ Linux-kernel source code, but it helps to have a full list of the APIs, since there does not appear to be a way to categorize them in docbook. Here is the list, by category.
-RCU pointer/list traversal: +RCU list traversal: - rcu_dereference list_for_each_entry_rcu hlist_for_each_entry_rcu hlist_nulls_for_each_entry_rcu @@ -808,7 +810,7 @@ RCU: Critical sections Grace period Barrier rcu_read_lock synchronize_net rcu_barrier rcu_read_unlock synchronize_rcu - synchronize_rcu_expedited + rcu_dereference synchronize_rcu_expedited call_rcu @@ -816,7 +818,7 @@ bh: Critical sections Grace period Barrier rcu_read_lock_bh call_rcu_bh rcu_barrier_bh rcu_read_unlock_bh synchronize_rcu_bh - synchronize_rcu_bh_expedited + rcu_dereference_bh synchronize_rcu_bh_expedited sched: Critical sections Grace period Barrier @@ -825,12 +827,14 @@ sched: Critical sections Grace period Barrier rcu_read_unlock_sched call_rcu_sched [preempt_disable] synchronize_sched_expedited [and friends] + rcu_dereference_sched SRCU: Critical sections Grace period Barrier srcu_read_lock synchronize_srcu N/A srcu_read_unlock synchronize_srcu_expedited + srcu_dereference SRCU: Initialization/cleanup init_srcu_struct diff --git a/Documentation/SubmitChecklist b/Documentation/SubmitChecklist index 1053a56be3b180c8ce202380ed9e882b5d97914a..8916ca48bc95f943c59adbf30cd0cec9a9e144b6 100644 --- a/Documentation/SubmitChecklist +++ b/Documentation/SubmitChecklist @@ -9,10 +9,14 @@ Documentation/SubmittingPatches and elsewhere regarding submitting Linux kernel patches. -1: Builds cleanly with applicable or modified CONFIG options =y, =m, and +1: If you use a facility then #include the file that defines/declares + that facility. Don't depend on other header files pulling in ones + that you use. + +2: Builds cleanly with applicable or modified CONFIG options =y, =m, and =n. No gcc warnings/errors, no linker warnings/errors. -2: Passes allnoconfig, allmodconfig +2b: Passes allnoconfig, allmodconfig 3: Builds on multiple CPU architectures by using local cross-compile tools or some other build farm. diff --git a/Documentation/arm/Samsung-S3C24XX/CPUfreq.txt b/Documentation/arm/Samsung-S3C24XX/CPUfreq.txt index 76b3a11e90be1a26e77f8741f01a25f99191ed10..fa968aa99d67b4273792cfd35f36804da84235ac 100644 --- a/Documentation/arm/Samsung-S3C24XX/CPUfreq.txt +++ b/Documentation/arm/Samsung-S3C24XX/CPUfreq.txt @@ -14,8 +14,8 @@ Introduction how the clocks are arranged. The first implementation used a single PLL to feed the ARM, memory and peripherals via a series of dividers and muxes and this is the implementation that is documented here. A - newer version where there is a seperate PLL and clock divider for the - ARM core is available as a seperate driver. + newer version where there is a separate PLL and clock divider for the + ARM core is available as a separate driver. Layout diff --git a/Documentation/arm/Samsung/Overview.txt b/Documentation/arm/Samsung/Overview.txt new file mode 100644 index 0000000000000000000000000000000000000000..7cced1fea9c3fdad45a36971806ea01dca0202ae --- /dev/null +++ b/Documentation/arm/Samsung/Overview.txt @@ -0,0 +1,86 @@ + Samsung ARM Linux Overview + ========================== + +Introduction +------------ + + The Samsung range of ARM SoCs spans many similar devices, from the initial + ARM9 through to the newest ARM cores. This document shows an overview of + the current kernel support, how to use it and where to find the code + that supports this.
+ + The currently supported SoCs are: + + - S3C24XX: See Documentation/arm/Samsung-S3C24XX/Overview.txt for full list + - S3C64XX: S3C6400 and S3C6410 + - S5P6440 + + S5PC100 and S5PC110 support is currently being merged + + +S3C24XX Systems +--------------- + + There is still documentation in Documentation/arm/Samsung-S3C24XX/ which + deals with the architecture and drivers specific to these devices. + + See Documentation/arm/Samsung-S3C24XX/Overview.txt for more information + on the implementation details and specific support. + + +Configuration +------------- + + A number of configurations are supplied, as there is no current way of + unifying all the SoCs into one kernel. + + s5p6440_defconfig - S5P6440 specific default configuration + s5pc100_defconfig - S5PC100 specific default configuration + + +Layout +------ + + The directory layout is currently being restructured, and consists of + several platform directories and then the machine specific directories + of the CPUs being built for. + + plat-samsung provides the base for all the implementations, and is the + last in the line of include directories that are processed for the build + specific information. It contains the base clock, GPIO and device definitions + to get the system running. + + plat-s3c is the s3c24xx/s3c64xx platform directory; although it is currently + involved in other builds, this will be phased out once the relevant code is + moved elsewhere. + + plat-s3c24xx is for s3c24xx specific builds, see the S3C24XX docs. + + plat-s3c64xx is for the s3c64xx specific bits, see the S3C24XX docs. + + plat-s5p is for s5p specific builds, more to be added. + + + [ to finish ] + + +Port Contributors +----------------- + + Ben Dooks (BJD) + Vincent Sanders + Herbert Potzl + Arnaud Patard (RTP) + Roc Wu + Klaus Fetscher + Dimitry Andric + Shannon Holland + Guillaume Gourat (NexVision) + Christer Weinigel (wingel) (Acer N30) + Lucas Correia Villa Real (S3C2400 port) + + +Document Author +--------------- + +Copyright 2009-2010 Ben Dooks diff --git a/Documentation/arm/Samsung/clksrc-change-registers.awk b/Documentation/arm/Samsung/clksrc-change-registers.awk new file mode 100755 index 0000000000000000000000000000000000000000..0c50220851fbd3ea718eb0964825083ba8a42ffa --- /dev/null +++ b/Documentation/arm/Samsung/clksrc-change-registers.awk @@ -0,0 +1,167 @@ +#!/usr/bin/awk -f +# +# Copyright 2010 Ben Dooks +# +# Released under GPLv2 + +# example usage +# ./clksrc-change-registers.awk arch/arm/plat-s5pc1xx/include/plat/regs-clock.h < src > dst + +function extract_value(s) +{ + eqat = index(s, "=") + comat = index(s, ",") + return substr(s, eqat+2, (comat-eqat)-2) +} + +function remove_brackets(b) +{ + return substr(b, 2, length(b)-2) +} + +function splitdefine(l, p) +{ + r = split(l, tp) + + p[0] = tp[2] + p[1] = remove_brackets(tp[3]) +} + +function find_length(f) +{ + if (0) + printf "find_length " f "\n" > "/dev/stderr" + + if (f ~ /0x1/) + return 1 + else if (f ~ /0x3/) + return 2 + else if (f ~ /0x7/) + return 3 + else if (f ~ /0xf/) + return 4 + + printf "unknown length " f "\n" > "/dev/stderr" + exit +} + +function find_shift(s) +{ + id = index(s, "<") + if (id <= 0) { + printf "cannot find shift " s "\n" > "/dev/stderr" + exit + } + + return substr(s, id+2) +} + + +BEGIN { + if (ARGC < 2) { + print "too few arguments" > "/dev/stderr" + exit + } + +# read the header file and find the mask values that we will need +# to replace and create an associative array of values + + while (getline line < ARGV[1] > 0) { + if (line ~
/\#define.*_MASK/ && + !(line ~ /S5PC100_EPLL_MASK/) && + !(line ~ /USB_SIG_MASK/)) { + splitdefine(line, fields) + name = fields[0] + if (0) + printf "MASK " line "\n" > "/dev/stderr" + dmask[name,0] = find_length(fields[1]) + dmask[name,1] = find_shift(fields[1]) + if (0) + printf "=> '" name "' LENGTH=" dmask[name,0] " SHIFT=" dmask[name,1] "\n" > "/dev/stderr" + } else { + } + } + + delete ARGV[1] +} + +/clksrc_clk.*=.*{/ { + shift="" + mask="" + divshift="" + reg_div="" + reg_src="" + indent=1 + + print $0 + + for(; indent >= 1;) { + if ((getline line) <= 0) { + printf "unexpected end of file" > "/dev/stderr" + exit 1; + } + + if (line ~ /\.shift/) { + shift = extract_value(line) + } else if (line ~ /\.mask/) { + mask = extract_value(line) + } else if (line ~ /\.reg_divider/) { + reg_div = extract_value(line) + } else if (line ~ /\.reg_source/) { + reg_src = extract_value(line) + } else if (line ~ /\.divider_shift/) { + divshift = extract_value(line) + } else if (line ~ /{/) { + indent++ + print line + } else if (line ~ /}/) { + indent-- + + if (indent == 0) { + if (0) { + printf "shift '" shift "' ='" dmask[shift,0] "'\n" > "/dev/stderr" + printf "mask '" mask "'\n" > "/dev/stderr" + printf "dshft '" divshift "'\n" > "/dev/stderr" + printf "rdiv '" reg_div "'\n" > "/dev/stderr" + printf "rsrc '" reg_src "'\n" > "/dev/stderr" + } + + generated = mask + sub(reg_src, reg_div, generated) + + if (0) { + printf "/* rsrc " reg_src " */\n" + printf "/* rdiv " reg_div " */\n" + printf "/* shift " shift " */\n" + printf "/* mask " mask " */\n" + printf "/* generated " generated " */\n" + } + + if (reg_div != "") { + printf "\t.reg_div = { " + printf ".reg = " reg_div ", " + printf ".shift = " dmask[generated,1] ", " + printf ".size = " dmask[generated,0] ", " + printf "},\n" + } + + printf "\t.reg_src = { " + printf ".reg = " reg_src ", " + printf ".shift = " dmask[mask,1] ", " + printf ".size = " dmask[mask,0] ", " + + printf "},\n" + + } + + print line + } else { + print line + } + + if (0) + printf indent ":" line "\n" > "/dev/stderr" + } +} + +// && ! /clksrc_clk.*=.*{/ { print $0 } diff --git a/Documentation/arm/memory.txt b/Documentation/arm/memory.txt index 9d58c7c5eddd082b9a22c087c7e0023e02fa4b02..eb0fae18ffb12a805bb81eb2c5005a773d499d2b 100644 --- a/Documentation/arm/memory.txt +++ b/Documentation/arm/memory.txt @@ -59,7 +59,11 @@ PAGE_OFFSET high_memory-1 Kernel direct-mapped RAM region. This maps the platforms RAM, and typically maps all platform RAM in a 1:1 relationship. -TASK_SIZE PAGE_OFFSET-1 Kernel module space +PKMAP_BASE PAGE_OFFSET-1 Permanent kernel mappings + One way of mapping HIGHMEM pages into kernel + space. + +MODULES_VADDR MODULES_END-1 Kernel module space Kernel modules inserted via insmod are placed here using dynamic mappings. diff --git a/Documentation/block/queue-sysfs.txt b/Documentation/block/queue-sysfs.txt index e164403f60e19b92e01050b3800826acbe27448a..f65274081c8d19a1c2f6f8b53712a1cf9cbee54d 100644 --- a/Documentation/block/queue-sysfs.txt +++ b/Documentation/block/queue-sysfs.txt @@ -25,11 +25,11 @@ size allowed by the hardware. nomerges (RW) ------------- -This enables the user to disable the lookup logic involved with IO merging -requests in the block layer. Merging may still occur through a direct -1-hit cache, since that comes for (almost) free. The IO scheduler will not -waste cycles doing tree/hash lookups for merges if nomerges is 1. Defaults -to 0, enabling all merges. 
+This enables the user to disable the lookup logic involved with IO +merging requests in the block layer. By default (0) all merges are +enabled. When set to 1 only simple one-hit merges will be tried. When +set to 2 no merge algorithms will be tried (including one-hit or more +complex tree/hash lookups). nr_requests (RW) ---------------- diff --git a/Documentation/cachetlb.txt b/Documentation/cachetlb.txt index da42ab414c4864db4bb9d35b0cca9005b27670d2..2b5f823abd035fd180d94732c523671378c2ef9d 100644 --- a/Documentation/cachetlb.txt +++ b/Documentation/cachetlb.txt @@ -88,12 +88,12 @@ changes occur: This is used primarily during fault processing. 5) void update_mmu_cache(struct vm_area_struct *vma, - unsigned long address, pte_t pte) + unsigned long address, pte_t *ptep) At the end of every page fault, this routine is invoked to tell the architecture specific code that a translation - described by "pte" now exists at virtual address "address" - for address space "vma->vm_mm", in the software page tables. + now exists at virtual address "address" for address space + "vma->vm_mm", in the software page tables. A port may use this information in any way it so chooses. For example, it could use this event to pre-load TLB @@ -377,3 +377,27 @@ maps this page at its virtual address. All the functionality of flush_icache_page can be implemented in flush_dcache_page and update_mmu_cache. In 2.7 the hope is to remove this interface completely. + +The final category of APIs is for I/O to deliberately aliased address +ranges inside the kernel. Such aliases are set up by use of the +vmap/vmalloc API. Since kernel I/O goes via physical pages, the I/O +subsystem assumes that the user mapping and kernel offset mapping are +the only aliases. This isn't true for vmap aliases, so anything in +the kernel trying to do I/O to vmap areas must manually manage +coherency. It must do this by flushing the vmap range before doing +I/O and invalidating it after the I/O returns. + + void flush_kernel_vmap_range(void *vaddr, int size) + flushes the kernel cache for a given virtual address range in + the vmap area. This is to make sure that any data the kernel + modified in the vmap range is made visible to the physical + page. The design is to make this area safe to perform I/O on. + Note that this API does *not* also flush the offset map alias + of the area. + + void invalidate_kernel_vmap_range(void *vaddr, int size) invalidates + the cache for a given virtual address range in the vmap area + which prevents the processor from making the cache stale by + speculatively reading data while the I/O was occurring to the + physical pages. This is only necessary for data reads into the + vmap area. diff --git a/Documentation/cdrom/ide-cd b/Documentation/cdrom/ide-cd index 2c558cd6c1ef605f11dfe2a1dff1be292946d57e..f4dc9de2694e90019ea8377537dfe40aeef1df8e 100644 --- a/Documentation/cdrom/ide-cd +++ b/Documentation/cdrom/ide-cd @@ -159,42 +159,7 @@ two arguments: the CDROM device, and the slot number to which you wish to change. If the slot number is -1, the drive is unloaded. -4. Compilation options ----------------------- - -There are a few additional options which can be set when compiling the -driver. Most people should not need to mess with any of these; they -are listed here simply for completeness. A compilation option can be -enabled by adding a line of the form `#define