Skip to content
  1. Feb 21, 2012
    • David Howells's avatar
      bitops: Optimise get_order() · d66acc39
      David Howells authored
      
      
      Optimise get_order() to use bit scanning instructions if such exist rather than
      a loop.  Also, make it possible to use get_order() in static initialisations
      too by building it on top of ilog2() in the constant parameter case.
      
      This has been tested for i386 and x86_64 using the following userspace program,
      and for FRV by making appropriate substitutions for fls() and fls64().  It will
      abort if the result of get_order() deviates from the original except for a
      size of 0, for which get_order() produces an undefined result.  This program
      tests both dynamic and static parameters.
      
      	#include <stdlib.h>
      	#include <stdio.h>
      
      	#ifdef __x86_64__
      	#define BITS_PER_LONG 64
      	#else
      	#define BITS_PER_LONG 32
      	#endif
      
      	#define PAGE_SHIFT 12
      
      	typedef unsigned long long __u64, u64;
      	typedef unsigned int __u32, u32;
      	#define noinline	__attribute__((noinline))
      
      	static inline int fls(int x)
      	{
      		int bitpos = -1;
      
      		asm("bsrl %1,%0"
      		    : "+r" (bitpos)
      		    : "rm" (x));
      		return bitpos + 1;
      	}
      
      	static __always_inline int fls64(__u64 x)
      	{
      	#if BITS_PER_LONG == 64
      		long bitpos = -1;
      
      		asm("bsrq %1,%0"
      		    : "+r" (bitpos)
      		    : "rm" (x));
      		return bitpos + 1;
      	#else
      		__u32 h = x >> 32, l = x;
      		int bitpos = -1;
      
      		asm("bsrl	%1,%0	\n"
      		    "subl	%2,%0	\n"
      		    "bsrl	%3,%0	\n"
      		    : "+r" (bitpos)
      		    : "rm" (l), "i"(32), "rm" (h));
      
      		return bitpos + 33;
      	#endif
      	}
      
      	static inline __attribute__((const))
      	int __ilog2_u32(u32 n)
      	{
      		return fls(n) - 1;
      	}
      
      	static inline __attribute__((const))
      	int __ilog2_u64(u64 n)
      	{
      		return fls64(n) - 1;
      	}
      
      	extern __attribute__((const, noreturn))
      	int ____ilog2_NaN(void);
      
      	#define ilog2(n)				\
      	(						\
      		__builtin_constant_p(n) ? (		\
      			(n) < 1 ? ____ilog2_NaN() :	\
      			(n) & (1ULL << 63) ? 63 :	\
      			(n) & (1ULL << 62) ? 62 :	\
      			(n) & (1ULL << 61) ? 61 :	\
      			(n) & (1ULL << 60) ? 60 :	\
      			(n) & (1ULL << 59) ? 59 :	\
      			(n) & (1ULL << 58) ? 58 :	\
      			(n) & (1ULL << 57) ? 57 :	\
      			(n) & (1ULL << 56) ? 56 :	\
      			(n) & (1ULL << 55) ? 55 :	\
      			(n) & (1ULL << 54) ? 54 :	\
      			(n) & (1ULL << 53) ? 53 :	\
      			(n) & (1ULL << 52) ? 52 :	\
      			(n) & (1ULL << 51) ? 51 :	\
      			(n) & (1ULL << 50) ? 50 :	\
      			(n) & (1ULL << 49) ? 49 :	\
      			(n) & (1ULL << 48) ? 48 :	\
      			(n) & (1ULL << 47) ? 47 :	\
      			(n) & (1ULL << 46) ? 46 :	\
      			(n) & (1ULL << 45) ? 45 :	\
      			(n) & (1ULL << 44) ? 44 :	\
      			(n) & (1ULL << 43) ? 43 :	\
      			(n) & (1ULL << 42) ? 42 :	\
      			(n) & (1ULL << 41) ? 41 :	\
      			(n) & (1ULL << 40) ? 40 :	\
      			(n) & (1ULL << 39) ? 39 :	\
      			(n) & (1ULL << 38) ? 38 :	\
      			(n) & (1ULL << 37) ? 37 :	\
      			(n) & (1ULL << 36) ? 36 :	\
      			(n) & (1ULL << 35) ? 35 :	\
      			(n) & (1ULL << 34) ? 34 :	\
      			(n) & (1ULL << 33) ? 33 :	\
      			(n) & (1ULL << 32) ? 32 :	\
      			(n) & (1ULL << 31) ? 31 :	\
      			(n) & (1ULL << 30) ? 30 :	\
      			(n) & (1ULL << 29) ? 29 :	\
      			(n) & (1ULL << 28) ? 28 :	\
      			(n) & (1ULL << 27) ? 27 :	\
      			(n) & (1ULL << 26) ? 26 :	\
      			(n) & (1ULL << 25) ? 25 :	\
      			(n) & (1ULL << 24) ? 24 :	\
      			(n) & (1ULL << 23) ? 23 :	\
      			(n) & (1ULL << 22) ? 22 :	\
      			(n) & (1ULL << 21) ? 21 :	\
      			(n) & (1ULL << 20) ? 20 :	\
      			(n) & (1ULL << 19) ? 19 :	\
      			(n) & (1ULL << 18) ? 18 :	\
      			(n) & (1ULL << 17) ? 17 :	\
      			(n) & (1ULL << 16) ? 16 :	\
      			(n) & (1ULL << 15) ? 15 :	\
      			(n) & (1ULL << 14) ? 14 :	\
      			(n) & (1ULL << 13) ? 13 :	\
      			(n) & (1ULL << 12) ? 12 :	\
      			(n) & (1ULL << 11) ? 11 :	\
      			(n) & (1ULL << 10) ? 10 :	\
      			(n) & (1ULL <<  9) ?  9 :	\
      			(n) & (1ULL <<  8) ?  8 :	\
      			(n) & (1ULL <<  7) ?  7 :	\
      			(n) & (1ULL <<  6) ?  6 :	\
      			(n) & (1ULL <<  5) ?  5 :	\
      			(n) & (1ULL <<  4) ?  4 :	\
      			(n) & (1ULL <<  3) ?  3 :	\
      			(n) & (1ULL <<  2) ?  2 :	\
      			(n) & (1ULL <<  1) ?  1 :	\
      			(n) & (1ULL <<  0) ?  0 :	\
      			____ilog2_NaN()			\
      					   ) :		\
      		(sizeof(n) <= 4) ?			\
      		__ilog2_u32(n) :			\
      		__ilog2_u64(n)				\
      	 )
      
      	static noinline __attribute__((const))
      	int old_get_order(unsigned long size)
      	{
      		int order;
      
      		size = (size - 1) >> (PAGE_SHIFT - 1);
      		order = -1;
      		do {
      			size >>= 1;
      			order++;
      		} while (size);
      		return order;
      	}
      
      	static noinline __attribute__((const))
      	int __get_order(unsigned long size)
      	{
      		int order;
      		size--;
      		size >>= PAGE_SHIFT;
      	#if BITS_PER_LONG == 32
      		order = fls(size);
      	#else
      		order = fls64(size);
      	#endif
      		return order;
      	}
      
      	#define get_order(n)						\
      	(								\
      		__builtin_constant_p(n) ? (				\
      			(n == 0UL) ? BITS_PER_LONG - PAGE_SHIFT :	\
      			((n < (1UL << PAGE_SHIFT)) ? 0 :		\
      			 ilog2((n) - 1) - PAGE_SHIFT + 1)		\
      		) :							\
      		__get_order(n)						\
      	)
      
      	#define order(N) \
      		{ (1UL << N) - 1,	get_order((1UL << N) - 1)	},	\
      		{ (1UL << N),		get_order((1UL << N))		},	\
      		{ (1UL << N) + 1,	get_order((1UL << N) + 1)	}
      
      	struct order {
      		unsigned long n, order;
      	};
      
      	static const struct order order_table[] = {
      		order(0),
      		order(1),
      		order(2),
      		order(3),
      		order(4),
      		order(5),
      		order(6),
      		order(7),
      		order(8),
      		order(9),
      		order(10),
      		order(11),
      		order(12),
      		order(13),
      		order(14),
      		order(15),
      		order(16),
      		order(17),
      		order(18),
      		order(19),
      		order(20),
      		order(21),
      		order(22),
      		order(23),
      		order(24),
      		order(25),
      		order(26),
      		order(27),
      		order(28),
      		order(29),
      		order(30),
      		order(31),
      	#if BITS_PER_LONG == 64
      		order(32),
      		order(33),
      		order(34),
      		order(35),
      	#endif
      		{ 0x2929 }
      	};
      
      	void check(int loop, unsigned long n)
      	{
      		unsigned long old, new;
      
      		printf("[%2d]: %09lx | ", loop, n);
      
      		old = old_get_order(n);
      		new = get_order(n);
      
      		printf("%3ld, %3ld\n", old, new);
      		if (n != 0 && old != new)
      			abort();
      	}
      
      	int main(int argc, char **argv)
      	{
      		const struct order *p;
      		unsigned long n;
      		int loop;
      
      		for (loop = 0; loop <= BITS_PER_LONG - 1; loop++) {
      			n = 1UL << loop;
      			check(loop, n - 1);
      			check(loop, n);
      			check(loop, n + 1);
      		}
      
      		for (p = order_table; p->n != 0x2929; p++) {
      			unsigned long old, new;
      
      			old = old_get_order(p->n);
      			new = p->order;
      			printf("%09lx\t%3ld, %3ld\n", p->n, old, new);
      			if (p->n != 0 && old != new)
      				abort();
      		}
      
      		return 0;
      	}
      
      Disassembling the x86_64 version of the above code shows:
      
      	0000000000400510 <old_get_order>:
      	  400510:       48 83 ef 01             sub    $0x1,%rdi
      	  400514:       b8 ff ff ff ff          mov    $0xffffffff,%eax
      	  400519:       48 c1 ef 0b             shr    $0xb,%rdi
      	  40051d:       0f 1f 00                nopl   (%rax)
      	  400520:       83 c0 01                add    $0x1,%eax
      	  400523:       48 d1 ef                shr    %rdi
      	  400526:       75 f8                   jne    400520 <old_get_order+0x10>
      	  400528:       f3 c3                   repz retq
      	  40052a:       66 0f 1f 44 00 00       nopw   0x0(%rax,%rax,1)
      
      	0000000000400530 <__get_order>:
      	  400530:       48 83 ef 01             sub    $0x1,%rdi
      	  400534:       48 c7 c0 ff ff ff ff    mov    $0xffffffffffffffff,%rax
      	  40053b:       48 c1 ef 0c             shr    $0xc,%rdi
      	  40053f:       48 0f bd c7             bsr    %rdi,%rax
      	  400543:       83 c0 01                add    $0x1,%eax
      	  400546:       c3                      retq
      	  400547:       66 0f 1f 84 00 00 00    nopw   0x0(%rax,%rax,1)
      	  40054e:       00 00
      
      As can be seen, the new __get_order() function is simpler than the
      old_get_order() function.
      
      Signed-off-by: default avatarDavid Howells <dhowells@redhat.com>
      Link: http://lkml.kernel.org/r/20120220223928.16199.29548.stgit@warthog.procyon.org.uk
      Acked-by: default avatarArnd Bergmann <arnd@arndb.de>
      Signed-off-by: default avatarH. Peter Anvin <hpa@zytor.com>
      d66acc39
    • David Howells's avatar
      bitops: Adjust the comment on get_order() to describe the size==0 case · e0891a98
      David Howells authored
      
      
      Adjust the comment on get_order() to note that passing a size of 0 results in
      an undefined value.
      
      Signed-off-by: default avatarDavid Howells <dhowells@redhat.com>
      Link: http://lkml.kernel.org/r/20120220223917.16199.9416.stgit@warthog.procyon.org.uk
      Acked-by: default avatarArnd Bergmann <arnd@arndb.de>
      Signed-off-by: default avatarH. Peter Anvin <hpa@zytor.com>
      e0891a98
  2. Feb 07, 2012
    • Jan Beulich's avatar
      x86/spinlocks: Eliminate TICKET_MASK · 7931d493
      Jan Beulich authored
      
      
      The definition of it being questionable already (unnecessarily
      including a cast), and it being used in a single place that can
      be written shorter without it, remove this #define.
      
      Along the same lines, simplify __ticket_spin_is_locked()'s main
      expression, which was the more convoluted way because of needs
      that went away with the recent type changes by Jeremy.
      
      This is pure cleanup, no functional change intended.
      
      Signed-off-by: default avatarJan Beulich <jbeulich@suse.com>
      Acked-by: default avatarJeremy Fitzhardinge <jeremy@goop.org>
      Cc: Linus Torvalds <torvalds@linux-foundation.org>
      Cc: Andrew Morton <akpm@linux-foundation.org>
      Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
      Link: http://lkml.kernel.org/r/4F2C06020200007800071066@nat28.tlf.novell.com
      Signed-off-by: default avatarIngo Molnar <mingo@elte.hu>
      7931d493
  3. Jan 27, 2012
    • Jan Beulich's avatar
      x86-64: Handle byte-wise tail copying in memcpy() without a loop · 9d8e2277
      Jan Beulich authored
      
      
      While hard to measure, reducing the number of possibly/likely
      mis-predicted branches can generally be expected to be slightly
      better.
      
      Contrary to how it may appear at first glance, this also doesn't grow
      the function size (the alignment gap to the next function just
      gets smaller).
      
      Signed-off-by: default avatarJan Beulich <jbeulich@suse.com>
      Cc: Linus Torvalds <torvalds@linux-foundation.org>
      Link: http://lkml.kernel.org/r/4F218584020000780006F422@nat28.tlf.novell.com
      Signed-off-by: default avatarIngo Molnar <mingo@elte.hu>
      9d8e2277
    • Jan Beulich's avatar
      x86-64: Fix memcpy() to support sizes of 4Gb and above · 2ab56091
      Jan Beulich authored
      
      
      While currently there doesn't appear to be any reachable in-tree
      case where such large memory blocks may be passed to memcpy(),
      we already had hit the problem in our Xen kernels. Just like
      done recently for memset(), rather than working around it,
      prevent others from falling into the same trap by fixing this
      long standing limitation.
      
      Signed-off-by: default avatarJan Beulich <jbeulich@suse.com>
      Cc: Linus Torvalds <torvalds@linux-foundation.org>
      Link: http://lkml.kernel.org/r/4F21846F020000780006F3FA@nat28.tlf.novell.com
      Signed-off-by: default avatarIngo Molnar <mingo@elte.hu>
      2ab56091
  4. Jan 26, 2012
    • Jan Beulich's avatar
      x86-64: Fix memset() to support sizes of 4Gb and above · 5d7244e7
      Jan Beulich authored
      
      
      While currently there doesn't appear to be any reachable in-tree
      case where such large memory blocks may be passed to memset()
      (alloc_bootmem() being the primary non-reachable one, as it gets
      called with suitably large sizes in FLATMEM configurations), we
      have recently hit the problem a second time in our Xen kernels.
      
      Rather than working around it a second time, prevent others from
      falling into the same trap by fixing this long standing
      limitation.
      
      Signed-off-by: default avatarJan Beulich <jbeulich@suse.com>
      Cc: Linus Torvalds <torvalds@linux-foundation.org>
      Cc: Andrew Morton <akpm@linux-foundation.org>
      Link: http://lkml.kernel.org/r/4F05D992020000780006AA09@nat28.tlf.novell.com
      Signed-off-by: default avatarIngo Molnar <mingo@elte.hu>
      5d7244e7
  5. Jan 06, 2012
  6. Jan 04, 2012
    • Eric Dumazet's avatar
      x86: Fix atomic64_xxx_cx8() functions · ceb7b40b
      Eric Dumazet authored
      
      
      It appears about all functions in arch/x86/lib/atomic64_cx8_32.S
      are wrong in case cmpxchg8b must be restarted, because
      LOCK_PREFIX macro defines a label "1" clashing with other local
      labels :
      
      1:
      	some_instructions
      	LOCK_PREFIX
      	cmpxchg8b (%ebp)
      	jne 1b  / jumps to beginning of LOCK_PREFIX !
      
      A possible fix is to use a magic label "672" in LOCK_PREFIX asm
      definition, similar to the "671" one we defined in
      LOCK_PREFIX_HERE.
      
      Signed-off-by: default avatarEric Dumazet <eric.dumazet@gmail.com>
      Acked-by: default avatarJan Beulich <JBeulich@suse.com>
      Cc: Christoph Lameter <cl@linux.com>
      Cc: Linus Torvalds <torvalds@linux-foundation.org>
      Cc: Andrew Morton <akpm@linux-foundation.org>
      Link: http://lkml.kernel.org/r/1325608540.2320.103.camel@edumazet-HP-Compaq-6005-Pro-SFF-PC
      Signed-off-by: default avatarIngo Molnar <mingo@elte.hu>
      ceb7b40b
    • Jan Beulich's avatar
      x86: Fix and improve cmpxchg_double{,_local}() · cdcd6298
      Jan Beulich authored
      
      
      Just like the per-CPU ones they had several
      problems/shortcomings:
      
      Only the first memory operand was mentioned in the asm()
      operands, and the 2x64-bit version didn't have a memory clobber
      while the 2x32-bit one did. The former allowed the compiler to
      not recognize the need to re-load the data in case it had it
      cached in some register, while the latter was overly
      destructive.
      
      The types of the local copies of the old and new values were
      incorrect (the types of the pointed-to variables should be used
      here, to make sure the respective old/new variable types are
      compatible).
      
      The __dummy/__junk variables were pointless, given that local
      copies of the inputs already existed (and can hence be used for
      discarded outputs).
      
      The 32-bit variant of cmpxchg_double_local() referenced
      cmpxchg16b_local().
      
      At once also:
      
       - change the return value type to what it really is: 'bool'
       - unify 32- and 64-bit variants
       - abstract out the common part of the 'normal' and 'local' variants
      
      Signed-off-by: default avatarJan Beulich <jbeulich@suse.com>
      Cc: Christoph Lameter <cl@linux.com>
      Cc: Linus Torvalds <torvalds@linux-foundation.org>
      Cc: Andrew Morton <akpm@linux-foundation.org>
      Link: http://lkml.kernel.org/r/4F01F12A020000780006A19B@nat28.tlf.novell.com
      Signed-off-by: default avatarIngo Molnar <mingo@elte.hu>
      cdcd6298
    • Ingo Molnar's avatar
      Merge commit 'v3.2-rc7' into x86/asm · adaf4ed2
      Ingo Molnar authored
      
      
      Merge reason: Update from -rc4 to -rc7.
      
      Signed-off-by: default avatarIngo Molnar <mingo@elte.hu>
      adaf4ed2
  7. Dec 24, 2011
  8. Dec 23, 2011
    • Florian Westphal's avatar
      netfilter: xt_connbytes: handle negation correctly · 0354b48f
      Florian Westphal authored
      
      
      "! --connbytes 23:42" should match if the packet/byte count is not in range.
      
      As there is no explicit "invert match" toggle in the match structure,
      userspace swaps the from and to arguments
      (i.e., as if "--connbytes 42:23" were given).
      
      However, "what <= 23 && what >= 42" will always be false.
      
      Change things so we use "||" in case "from" is larger than "to".
      
      This change may look like it breaks backwards compatibility when "to" is 0.
      However, older iptables binaries will refuse "connbytes 42:0",
      and current releases treat it to mean "! --connbytes 0:42",
      so we should be fine.
      
      Signed-off-by: default avatarFlorian Westphal <fw@strlen.de>
      Signed-off-by: default avatarPablo Neira Ayuso <pablo@netfilter.org>
      0354b48f
    • Al Viro's avatar
      Btrfs: call d_instantiate after all ops are setup · 08c422c2
      Al Viro authored
      
      
      This closes races where btrfs is calling d_instantiate too soon during
      inode creation.  All of the callers of btrfs_add_nondir are updated to
      instantiate after the inode is fully setup in memory.
      
      Signed-off-by: default avatarAl Viro <viro@zeniv.linux.org.uk>
      Signed-off-by: default avatarChris Mason <chris.mason@oracle.com>
      08c422c2
    • Chris Mason's avatar
      Btrfs: fix worker lock misuse in find_worker · 8d532b2a
      Chris Mason authored
      
      
      Dan Carpenter noticed that we were doing a double unlock on the worker
      lock, and sometimes picking a worker thread without the lock held.
      
      This fixes both errors.
      
      Signed-off-by: default avatarChris Mason <chris.mason@oracle.com>
      Reported-by: default avatarDan Carpenter <dan.carpenter@oracle.com>
      8d532b2a
    • Eric Dumazet's avatar
      net: relax rcvbuf limits · 0fd7bac6
      Eric Dumazet authored
      skb->truesize might be big even for a small packet.
      
      It's even bigger after commit 87fb4b7b (net: more accurate skb
      truesize) and big MTU.
      
      We should allow queueing at least one packet per receiver, even with a
      low RCVBUF setting.
      
      Reported-by: default avatarMichal Simek <monstr@monstr.eu>
      Signed-off-by: default avatarEric Dumazet <eric.dumazet@gmail.com>
      Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
      0fd7bac6
    • Xi Wang's avatar
      rps: fix insufficient bounds checking in store_rps_dev_flow_table_cnt() · a0a129f8
      Xi Wang authored
      
      
      Setting a large rps_flow_cnt like (1 << 30) on 32-bit platform will
      cause a kernel oops due to insufficient bounds checking.
      
      	if (count > 1<<30) {
      		/* Enforce a limit to prevent overflow */
      		return -EINVAL;
      	}
      	count = roundup_pow_of_two(count);
      	table = vmalloc(RPS_DEV_FLOW_TABLE_SIZE(count));
      
      Note that the macro RPS_DEV_FLOW_TABLE_SIZE(count) is defined as:
      
      	... + (count * sizeof(struct rps_dev_flow))
      
      where sizeof(struct rps_dev_flow) is 8.  (1 << 30) * 8 will overflow
      32 bits.
      
      This patch replaces the magic number (1 << 30) with a symbolic bound.
      
      Suggested-by: default avatarEric Dumazet <eric.dumazet@gmail.com>
      Signed-off-by: default avatarXi Wang <xi.wang@gmail.com>
      Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
      a0a129f8
    • Eric Dumazet's avatar
      net: introduce DST_NOPEER dst flag · e688a604
      Eric Dumazet authored
      Chris Boot reported crashes occurring in ipv6_select_ident().
      
      [  461.457562] RIP: 0010:[<ffffffff812dde61>]  [<ffffffff812dde61>]
      ipv6_select_ident+0x31/0xa7
      
      [  461.578229] Call Trace:
      [  461.580742] <IRQ>
      [  461.582870]  [<ffffffff812efa7f>] ? udp6_ufo_fragment+0x124/0x1a2
      [  461.589054]  [<ffffffff812dbfe0>] ? ipv6_gso_segment+0xc0/0x155
      [  461.595140]  [<ffffffff812700c6>] ? skb_gso_segment+0x208/0x28b
      [  461.601198]  [<ffffffffa03f236b>] ? ipv6_confirm+0x146/0x15e
      [nf_conntrack_ipv6]
      [  461.608786]  [<ffffffff81291c4d>] ? nf_iterate+0x41/0x77
      [  461.614227]  [<ffffffff81271d64>] ? dev_hard_start_xmit+0x357/0x543
      [  461.620659]  [<ffffffff81291cf6>] ? nf_hook_slow+0x73/0x111
      [  461.626440]  [<ffffffffa0379745>] ? br_parse_ip_options+0x19a/0x19a
      [bridge]
      [  461.633581]  [<ffffffff812722ff>] ? dev_queue_xmit+0x3af/0x459
      [  461.639577]  [<ffffffffa03747d2>] ? br_dev_queue_push_xmit+0x72/0x76
      [bridge]
      [  461.646887]  [<ffffffffa03791e3>] ? br_nf_post_routing+0x17d/0x18f
      [bridge]
      [  461.653997]  [<ffffffff81291c4d>] ? nf_iterate+0x41/0x77
      [  461.659473]  [<ffffffffa0374760>] ? br_flood+0xfa/0xfa [bridge]
      [  461.665485]  [<ffffffff81291cf6>] ? nf_hook_slow+0x73/0x111
      [  461.671234]  [<ffffffffa0374760>] ? br_flood+0xfa/0xfa [bridge]
      [  461.677299]  [<ffffffffa0379215>] ?
      nf_bridge_update_protocol+0x20/0x20 [bridge]
      [  461.684891]  [<ffffffffa03bb0e5>] ? nf_ct_zone+0xa/0x17 [nf_conntrack]
      [  461.691520]  [<ffffffffa0374760>] ? br_flood+0xfa/0xfa [bridge]
      [  461.697572]  [<ffffffffa0374812>] ? NF_HOOK.constprop.8+0x3c/0x56
      [bridge]
      [  461.704616]  [<ffffffffa0379031>] ?
      nf_bridge_push_encap_header+0x1c/0x26 [bridge]
      [  461.712329]  [<ffffffffa037929f>] ? br_nf_forward_finish+0x8a/0x95
      [bridge]
      [  461.719490]  [<ffffffffa037900a>] ?
      nf_bridge_pull_encap_header+0x1c/0x27 [bridge]
      [  461.727223]  [<ffffffffa0379974>] ? br_nf_forward_ip+0x1c0/0x1d4 [bridge]
      [  461.734292]  [<ffffffff81291c4d>] ? nf_iterate+0x41/0x77
      [  461.739758]  [<ffffffffa03748cc>] ? __br_deliver+0xa0/0xa0 [bridge]
      [  461.746203]  [<ffffffff81291cf6>] ? nf_hook_slow+0x73/0x111
      [  461.751950]  [<ffffffffa03748cc>] ? __br_deliver+0xa0/0xa0 [bridge]
      [  461.758378]  [<ffffffffa037533a>] ? NF_HOOK.constprop.4+0x56/0x56
      [bridge]
      
      This is caused by bridge netfilter special dst_entry (fake_rtable), a
      special shared entry, where attaching an inetpeer makes no sense.
      
      Problem is present since commit 87c48fa3 (ipv6: make fragment
      identifications less predictable)
      
      Introduce DST_NOPEER dst flag and make sure ipv6_select_ident() and
      __ip_select_ident() fallback to the 'no peer attached' handling.
      
      Reported-by: default avatarChris Boot <bootc@bootc.net>
      Tested-by: default avatarChris Boot <bootc@bootc.net>
      Signed-off-by: default avatarEric Dumazet <eric.dumazet@gmail.com>
      Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
      e688a604
    • Thomas Graf's avatar
      mqprio: Avoid panic if no options are provided · 7838f2ce
      Thomas Graf authored
      
      
      Userspace may not provide TCA_OPTIONS; in fact tc currently does
      not do so if no arguments are specified on the command line.
      Return EINVAL instead of panicking.
      
      Signed-off-by: default avatarThomas Graf <tgraf@redhat.com>
      Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
      7838f2ce
    • Eric Dumazet's avatar
      bridge: provide a mtu() method for fake_dst_ops · a13861a2
      Eric Dumazet authored
      Commit 618f9bc7 (net: Move mtu handling down to the protocol
      depended handlers) forgot the bridge netfilter case, adding a NULL
      dereference in ip_fragment().
      
      Reported-by: default avatarChris Boot <bootc@bootc.net>
      CC: Steffen Klassert <steffen.klassert@secunet.com>
      Signed-off-by: default avatarEric Dumazet <eric.dumazet@gmail.com>
      Acked-by: default avatarSteffen Klassert <steffen.klassert@secunet.com>
      Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
      a13861a2
    • Linus Torvalds's avatar
      Merge branch 'for-linus' of git://neil.brown.name/md · ad1fca20
      Linus Torvalds authored
      * 'for-linus' of git://neil.brown.name/md:
        md/bitmap: It is OK to clear bits during recovery.
        md: don't give up looking for spares on first failure-to-add
        md/raid5: ensure correct assessment of drives during degraded reshape.
        md/linear: fix hot-add of devices to linear arrays.
      ad1fca20
    • NeilBrown's avatar
      md/bitmap: It is OK to clear bits during recovery. · 961902c0
      NeilBrown authored
      Commit d0a4bb49 introduced a regression which is annoying but
      fairly harmless.
      
      When writing to an array that is undergoing recovery (a spare
      is being integrated into the array), writing to the array will
      set bits in the bitmap, but they will not be cleared when the
      write completes.
      
      For bits covering areas that have not been recovered yet this is not a
      problem as the recovery will clear the bits.  However bits set in
      already-recovered region will stay set and never be cleared.
      This doesn't risk data integrity.  The only negatives are:
       - next time there is a crash, more resyncing than necessary will
         be done.
       - the bitmap doesn't look clean, which is confusing.
      
      While an array is recovering we don't want to update the
      'events_cleared' setting in the bitmap but we do still want to clear
      bits that have very recently been set - providing they were written to
      the recovering device.
      
      So split those two needs - which previously both depended on 'success' -
      and always clear the bit if the write went to all devices.
      
      Signed-off-by: default avatarNeilBrown <neilb@suse.de>
      961902c0
    • NeilBrown's avatar
      md: don't give up looking for spares on first failure-to-add · 60fc1370
      NeilBrown authored
      
      
      Before performing a recovery we try to remove any spares that
      might not be working, then add any that might have become relevant.
      
      Currently we abort on the first spare that cannot be added.
      This is a false optimisation.
      It is conceivable that - depending on rules in the personality - a
      subsequent spare might be accepted.
      Also the loop does other things like count the available spares and
      reset the 'recovery_offset' value.
      
      If we abort early these might not happen properly.
      
      So remove the early abort.
      
      In particular if you have an array that is undergoing recovery and
      which has extra spares, then the recovery may not restart after a
      reboot as the count of 'spares' might end up as zero.
      
      Reported-by: default avatarAnssi Hannula <anssi.hannula@iki.fi>
      Signed-off-by: default avatarNeilBrown <neilb@suse.de>
      60fc1370
    • NeilBrown's avatar
      md/raid5: ensure correct assessment of drives during degraded reshape. · 30d7a483
      NeilBrown authored
      
      
      While reshaping a degraded array (as when reshaping a RAID0 by first
      converting it to a degraded RAID4) we currently get confused about
      which devices are in_sync.  In most cases we get it right, but in the
      region that is being reshaped we need to treat non-failed devices as
      in-sync when we have the data but haven't actually written it out yet.
      
      Reported-by: default avatarAdam Kwolek <adam.kwolek@intel.com>
      Signed-off-by: default avatarNeilBrown <neilb@suse.de>
      30d7a483
    • NeilBrown's avatar
      md/linear: fix hot-add of devices to linear arrays. · 09cd9270
      NeilBrown authored
      Commit d70ed2e4 broke hot-add to a linear array.
      After that commit, metadata is not written to devices until they
      have been fully integrated into the array as determined by
      saved_raid_disk.  That patch arranged to clear that field after
      a recovery completed.
      
      However for linear arrays, there is no recovery - the integration is
      instantaneous.  So we need to explicitly clear the saved_raid_disk
      field.
      
      Signed-off-by: default avatarNeilBrown <neilb@suse.de>
      09cd9270
    • David S. Miller's avatar
      sparc64: Fix MSIQ HV call ordering in pci_sun4v_msiq_build_irq(). · 7cc85833
      David S. Miller authored
      
      
      This silently was working for many years and stopped working on
      Niagara-T3 machines.
      
      We need to set the MSIQ to VALID before we can set its state to IDLE.
      
      On Niagara-T3, setting the state to IDLE first was causing HV_EINVAL
      errors.  The hypervisor documentation says, rather ambiguously, that
      the MSIQ must be "initialized" before one can set the state.
      
      I previously understood this to mean merely that a successful setconf()
      operation has been performed on the MSIQ, which we have done at this
      point.  But it seems to also mean that it has been set VALID too.
      
      Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
      7cc85833
    • Linus Torvalds's avatar
      Merge branch 'usb-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/usb · b3b1b70e
      Linus Torvalds authored
      * 'usb-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/usb:
        USB: Fix usb/isp1760 build on sparc
        usb: gadget: epautoconf: do not change number of streams
        usb: dwc3: core: fix cached revision on our structure
        usb: musb: fix reset issue with full speed device
      b3b1b70e
    • Linus Torvalds's avatar
      Merge branch 'upstream-linus' of git://github.com/jgarzik/libata-dev · abe8809c
      Linus Torvalds authored
      * 'upstream-linus' of git://github.com/jgarzik/libata-dev:
        pata_of_platform: Add missing CONFIG_OF_IRQ dependency.
      abe8809c
    • David Miller's avatar
    • Stephen Rothwell's avatar
  9. Dec 22, 2011
    • Srivatsa S. Bhat's avatar
      VFS: Fix race between CPU hotplug and lglocks · e30e2fdf
      Srivatsa S. Bhat authored
      
      
      Currently, the *_global_[un]lock_online() routines are not at all synchronized
      with CPU hotplug. Soft-lockups detected as a consequence of this race were
      reported earlier at https://lkml.org/lkml/2011/8/24/185. (Thanks to Cong Meng
      for finding out that the root-cause of this issue is the race condition
      between br_write_[un]lock() and CPU hotplug, which results in the lock states
      getting messed up).
      
      Fixing this race by just adding {get,put}_online_cpus() at appropriate places
      in *_global_[un]lock_online() is not a good option, because, then suddenly
      br_write_[un]lock() would become blocking, whereas they have been kept as
      non-blocking all this time, and we would want to keep them that way.
      
      So, overall, we want to ensure 3 things:
      1. br_write_lock() and br_write_unlock() must remain as non-blocking.
      2. The corresponding lock and unlock of the per-cpu spinlocks must not happen
         for different sets of CPUs.
      3. Either prevent any new CPU online operation in between this lock-unlock, or
         ensure that the newly onlined CPU does not proceed with its corresponding
         per-cpu spinlock unlocked.
      
      To achieve all this:
      (a) We introduce a new spinlock that is taken by the *_global_lock_online()
          routine and released by the *_global_unlock_online() routine.
      (b) We register a callback for CPU hotplug notifications, and this callback
          takes the same spinlock as above.
      (c) We maintain a bitmap which is close to the cpu_online_mask, and once it is
          initialized in the lock_init() code, all future updates to it are done in
          the callback, under the above spinlock.
      (d) The above bitmap is used (instead of cpu_online_mask) while locking and
          unlocking the per-cpu locks.
      
      The callback takes the spinlock upon the CPU_UP_PREPARE event. So, if the
      br_write_lock-unlock sequence is in progress, the callback keeps spinning,
      thus preventing the CPU online operation till the lock-unlock sequence is
      complete. This takes care of requirement (3).
      
      The bitmap that we maintain remains unmodified throughout the lock-unlock
      sequence, since all updates to it are managed by the callback, which takes
      the same spinlock as the one taken by the lock code and released only by the
      unlock routine. Combining this with (d) above, satisfies requirement (2).
      
      Overall, since we use a spinlock (mentioned in (a)) to prevent CPU hotplug
      operations from racing with br_write_lock-unlock, requirement (1) is also
      taken care of.
      
      By the way, it is to be noted that a CPU offline operation can actually run
      in parallel with our lock-unlock sequence, because our callback doesn't react
      to notifications earlier than CPU_DEAD (in order to maintain our bitmap
      properly). And this means, since we use our own bitmap (which is stale, on
      purpose) during the lock-unlock sequence, we could end up unlocking the
      per-cpu lock of an offline CPU (because we had locked it earlier, when the
      CPU was online), in order to satisfy requirement (2). But this is harmless,
      though it looks a bit awkward.
      
      Debugged-by: Cong Meng <mc@linux.vnet.ibm.com>
      Signed-off-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
      Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
      Cc: stable@vger.kernel.org
      e30e2fdf
    • Linus Torvalds's avatar
      Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net · ecefc36b
      Linus Torvalds authored
      * git://git.kernel.org/pub/scm/linux/kernel/git/davem/net:
        net: Add a flow_cache_flush_deferred function
        ipv4: reintroduce route cache garbage collector
        net: have ipconfig not wait if no dev is available
        sctp: Do not account for sizeof(struct sk_buff) in estimated rwnd
        asix: new device id
        davinci-cpdma: fix locking issue in cpdma_chan_stop
        sctp: fix incorrect overflow check on autoclose
        r8169: fix Config2 MSIEnable bit setting.
        llc: llc_cmsg_rcv was getting called after sk_eat_skb.
        net: bpf_jit: fix an off-one bug in x86_64 cond jump target
        iwlwifi: update SCD BC table for all SCD queues
        Revert "Bluetooth: Revert: Fix L2CAP connection establishment"
        Bluetooth: Clear RFCOMM session timer when disconnecting last channel
        Bluetooth: Prevent uninitialized data access in L2CAP configuration
        iwlwifi: allow to switch to HT40 if not associated
        iwlwifi: tx_sync only on PAN context
        mwifiex: avoid double list_del in command cancel path
        ath9k: fix max phy rate at rate control init
        nfc: signedness bug in __nci_request()
        iwlwifi: do not set the sequence control bit is not needed
      ecefc36b
    • Linus Torvalds's avatar
      Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tiwai/sound · d5ed5e48
      Linus Torvalds authored
      * 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tiwai/sound:
        ALSA: atmel/ac97c: using software reset instead hardware reset if not available
      d5ed5e48