From 1ca3683cc6d2c2ce4204df519c4e4730d037905a Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Wed, 4 Oct 2023 16:49:42 +0200 Subject: [PATCH 0001/1519] x86/percpu: Enable named address spaces with known compiler version Enable named address spaces with known compiler versions (GCC 12.1 and later) in order to avoid possible issues with named address spaces with older compilers. Set CC_HAS_NAMED_AS when the compiler satisfies version requirements and set USE_X86_SEG_SUPPORT to signal when segment qualifiers could be used. Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Cc: Andy Lutomirski Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Josh Poimboeuf Link: https://lore.kernel.org/r/20231004145137.86537-3-ubizjak@gmail.com --- arch/x86/Kconfig | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 982b777eadc7a..ecb256954351d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2388,6 +2388,13 @@ source "kernel/livepatch/Kconfig" endmenu +config CC_HAS_NAMED_AS + def_bool CC_IS_GCC && GCC_VERSION >= 120100 + +config USE_X86_SEG_SUPPORT + def_bool y + depends on CC_HAS_NAMED_AS && SMP + config CC_HAS_SLS def_bool $(cc-option,-mharden-sls=all) -- GitLab From 9a462b9eafa6dda16ea8429b151edb1fb535d744 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Wed, 4 Oct 2023 16:49:43 +0200 Subject: [PATCH 0002/1519] x86/percpu: Use compiler segment prefix qualifier Using a segment prefix qualifier is cleaner than using a segment prefix in the inline assembly, and provides the compiler with more information, telling it that __seg_gs:[addr] is different than [addr] when it analyzes data dependencies. It also enables various optimizations that will be implemented in the next patches. Use segment prefix qualifiers when they are supported. Unfortunately, gcc does not provide a way to remove segment qualifiers, which is needed to use typeof() to create local instances of the per-CPU variable. For this reason, do not use the segment qualifier for per-CPU variables, and do casting using the segment qualifier instead. Uros: Improve compiler support detection and update the patch to the current mainline. Signed-off-by: Nadav Amit Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Cc: Andy Lutomirski Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Josh Poimboeuf Link: https://lore.kernel.org/r/20231004145137.86537-4-ubizjak@gmail.com --- arch/x86/include/asm/percpu.h | 68 +++++++++++++++++++++++----------- arch/x86/include/asm/preempt.h | 2 +- 2 files changed, 47 insertions(+), 23 deletions(-) diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 20624b80f8904..da451202a1b9b 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -28,26 +28,50 @@ #include #ifdef CONFIG_SMP + +#ifdef CONFIG_CC_HAS_NAMED_AS + +#ifdef CONFIG_X86_64 +#define __percpu_seg_override __seg_gs +#else +#define __percpu_seg_override __seg_fs +#endif + +#define __percpu_prefix "" + +#else /* CONFIG_CC_HAS_NAMED_AS */ + +#define __percpu_seg_override #define __percpu_prefix "%%"__stringify(__percpu_seg)":" + +#endif /* CONFIG_CC_HAS_NAMED_AS */ + +#define __force_percpu_prefix "%%"__stringify(__percpu_seg)":" #define __my_cpu_offset this_cpu_read(this_cpu_off) /* * Compared to the generic __my_cpu_offset version, the following * saves one instruction and avoids clobbering a temp register. 
*/ -#define arch_raw_cpu_ptr(ptr) \ -({ \ - unsigned long tcp_ptr__; \ - asm ("add " __percpu_arg(1) ", %0" \ - : "=r" (tcp_ptr__) \ - : "m" (this_cpu_off), "0" (ptr)); \ - (typeof(*(ptr)) __kernel __force *)tcp_ptr__; \ +#define arch_raw_cpu_ptr(ptr) \ +({ \ + unsigned long tcp_ptr__; \ + asm ("add " __percpu_arg(1) ", %0" \ + : "=r" (tcp_ptr__) \ + : "m" (__my_cpu_var(this_cpu_off)), "0" (ptr)); \ + (typeof(*(ptr)) __kernel __force *)tcp_ptr__; \ }) -#else +#else /* CONFIG_SMP */ +#define __percpu_seg_override #define __percpu_prefix "" -#endif +#define __force_percpu_prefix "" +#endif /* CONFIG_SMP */ +#define __my_cpu_type(var) typeof(var) __percpu_seg_override +#define __my_cpu_ptr(ptr) (__my_cpu_type(*ptr) *)(uintptr_t)(ptr) +#define __my_cpu_var(var) (*__my_cpu_ptr(&var)) #define __percpu_arg(x) __percpu_prefix "%" #x +#define __force_percpu_arg(x) __force_percpu_prefix "%" #x /* * Initialized pointers to per-cpu variables needed for the boot @@ -107,14 +131,14 @@ do { \ (void)pto_tmp__; \ } \ asm qual(__pcpu_op2_##size(op, "%[val]", __percpu_arg([var])) \ - : [var] "+m" (_var) \ + : [var] "+m" (__my_cpu_var(_var)) \ : [val] __pcpu_reg_imm_##size(pto_val__)); \ } while (0) #define percpu_unary_op(size, qual, op, _var) \ ({ \ asm qual (__pcpu_op1_##size(op, __percpu_arg([var])) \ - : [var] "+m" (_var)); \ + : [var] "+m" (__my_cpu_var(_var))); \ }) /* @@ -144,14 +168,14 @@ do { \ __pcpu_type_##size pfo_val__; \ asm qual (__pcpu_op2_##size(op, __percpu_arg([var]), "%[val]") \ : [val] __pcpu_reg_##size("=", pfo_val__) \ - : [var] "m" (_var)); \ + : [var] "m" (__my_cpu_var(_var))); \ (typeof(_var))(unsigned long) pfo_val__; \ }) #define percpu_stable_op(size, op, _var) \ ({ \ __pcpu_type_##size pfo_val__; \ - asm(__pcpu_op2_##size(op, __percpu_arg(P[var]), "%[val]") \ + asm(__pcpu_op2_##size(op, __force_percpu_arg(P[var]), "%[val]") \ : [val] __pcpu_reg_##size("=", pfo_val__) \ : [var] "p" (&(_var))); \ (typeof(_var))(unsigned long) pfo_val__; \ @@ -166,7 +190,7 @@ do { \ asm qual (__pcpu_op2_##size("xadd", "%[tmp]", \ __percpu_arg([var])) \ : [tmp] __pcpu_reg_##size("+", paro_tmp__), \ - [var] "+m" (_var) \ + [var] "+m" (__my_cpu_var(_var)) \ : : "memory"); \ (typeof(_var))(unsigned long) (paro_tmp__ + _val); \ }) @@ -187,7 +211,7 @@ do { \ __percpu_arg([var])) \ "\n\tjnz 1b" \ : [oval] "=&a" (pxo_old__), \ - [var] "+m" (_var) \ + [var] "+m" (__my_cpu_var(_var)) \ : [nval] __pcpu_reg_##size(, pxo_new__) \ : "memory"); \ (typeof(_var))(unsigned long) pxo_old__; \ @@ -204,7 +228,7 @@ do { \ asm qual (__pcpu_op2_##size("cmpxchg", "%[nval]", \ __percpu_arg([var])) \ : [oval] "+a" (pco_old__), \ - [var] "+m" (_var) \ + [var] "+m" (__my_cpu_var(_var)) \ : [nval] __pcpu_reg_##size(, pco_new__) \ : "memory"); \ (typeof(_var))(unsigned long) pco_old__; \ @@ -221,7 +245,7 @@ do { \ CC_SET(z) \ : CC_OUT(z) (success), \ [oval] "+a" (pco_old__), \ - [var] "+m" (_var) \ + [var] "+m" (__my_cpu_var(_var)) \ : [nval] __pcpu_reg_##size(, pco_new__) \ : "memory"); \ if (unlikely(!success)) \ @@ -244,7 +268,7 @@ do { \ \ asm qual (ALTERNATIVE("call this_cpu_cmpxchg8b_emu", \ "cmpxchg8b " __percpu_arg([var]), X86_FEATURE_CX8) \ - : [var] "+m" (_var), \ + : [var] "+m" (__my_cpu_var(_var)), \ "+a" (old__.low), \ "+d" (old__.high) \ : "b" (new__.low), \ @@ -276,7 +300,7 @@ do { \ "cmpxchg8b " __percpu_arg([var]), X86_FEATURE_CX8) \ CC_SET(z) \ : CC_OUT(z) (success), \ - [var] "+m" (_var), \ + [var] "+m" (__my_cpu_var(_var)), \ "+a" (old__.low), \ "+d" (old__.high) \ : "b" (new__.low), \ @@ -313,7 +337,7 @@ do 
{ \ \ asm qual (ALTERNATIVE("call this_cpu_cmpxchg16b_emu", \ "cmpxchg16b " __percpu_arg([var]), X86_FEATURE_CX16) \ - : [var] "+m" (_var), \ + : [var] "+m" (__my_cpu_var(_var)), \ "+a" (old__.low), \ "+d" (old__.high) \ : "b" (new__.low), \ @@ -345,7 +369,7 @@ do { \ "cmpxchg16b " __percpu_arg([var]), X86_FEATURE_CX16) \ CC_SET(z) \ : CC_OUT(z) (success), \ - [var] "+m" (_var), \ + [var] "+m" (__my_cpu_var(_var)), \ "+a" (old__.low), \ "+d" (old__.high) \ : "b" (new__.low), \ @@ -494,7 +518,7 @@ static inline bool x86_this_cpu_variable_test_bit(int nr, asm volatile("btl "__percpu_arg(2)",%1" CC_SET(c) : CC_OUT(c) (oldbit) - : "m" (*(unsigned long __percpu *)addr), "Ir" (nr)); + : "m" (*__my_cpu_ptr((unsigned long __percpu *)(addr))), "Ir" (nr)); return oldbit; } diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h index 4527e1430c6dc..4b2a35d8d56a8 100644 --- a/arch/x86/include/asm/preempt.h +++ b/arch/x86/include/asm/preempt.h @@ -92,7 +92,7 @@ static __always_inline void __preempt_count_sub(int val) */ static __always_inline bool __preempt_count_dec_and_test(void) { - return GEN_UNARY_RMWcc("decl", pcpu_hot.preempt_count, e, + return GEN_UNARY_RMWcc("decl", __my_cpu_var(pcpu_hot.preempt_count), e, __percpu_arg([var])); } -- GitLab From ca4256348660cb2162668ec3d13d1f921d05374a Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Wed, 4 Oct 2023 21:23:08 +0200 Subject: [PATCH 0003/1519] x86/percpu: Use C for percpu read/write accessors The percpu code mostly uses inline assembly. Using segment qualifiers allows to use C code instead, which enables the compiler to perform various optimizations (e.g. propagation of memory arguments). Convert percpu read and write accessors to C code, so the memory argument can be propagated to the instruction that uses this argument. Some examples of propagations: a) into sign/zero extensions: the code improves from: 65 8a 05 00 00 00 00 mov %gs:0x0(%rip),%al 0f b6 c0 movzbl %al,%eax to: 65 0f b6 05 00 00 00 movzbl %gs:0x0(%rip),%eax 00 and in a similar way for: movzbl %gs:0x0(%rip),%edx movzwl %gs:0x0(%rip),%esi movzbl %gs:0x78(%rbx),%eax movslq %gs:0x0(%rip),%rdx movslq %gs:(%rdi),%rbx b) into compares: the code improves from: 65 8b 05 00 00 00 00 mov %gs:0x0(%rip),%eax a9 00 00 0f 00 test $0xf0000,%eax to: 65 f7 05 00 00 00 00 testl $0xf0000,%gs:0x0(%rip) 00 00 0f 00 and in a similar way for: testl $0xf0000,%gs:0x0(%rip) testb $0x1,%gs:0x0(%rip) testl $0xff00,%gs:0x0(%rip) cmpb $0x0,%gs:0x0(%rip) cmp %gs:0x0(%rip),%r14d cmpw $0x8,%gs:0x0(%rip) cmpb $0x0,%gs:(%rax) c) into other insns: the code improves from: 1a355: 83 fa ff cmp $0xffffffff,%edx 1a358: 75 07 jne 1a361 <...> 1a35a: 65 8b 15 00 00 00 00 mov %gs:0x0(%rip),%edx 1a361: to: 1a35a: 83 fa ff cmp $0xffffffff,%edx 1a35d: 65 0f 44 15 00 00 00 cmove %gs:0x0(%rip),%edx 1a364: 00 The above propagations result in the following code size improvements for current mainline kernel (with the default config), compiled with: # gcc (GCC) 12.3.1 20230508 (Red Hat 12.3.1-1) text data bss dec filename 25508862 4386540 808388 30703790 vmlinux-vanilla.o 25500922 4386532 808388 30695842 vmlinux-new.o Co-developed-by: Nadav Amit Signed-off-by: Nadav Amit Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Cc: Andy Lutomirski Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Josh Poimboeuf Link: https://lore.kernel.org/r/20231004192404.31733-1-ubizjak@gmail.com --- arch/x86/include/asm/percpu.h | 65 +++++++++++++++++++++++++++++------ 1 file changed, 54 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index da451202a1b9b..60ea7755c0fea 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -400,13 +400,66 @@ do { \ #define this_cpu_read_stable_8(pcp) percpu_stable_op(8, "mov", pcp) #define this_cpu_read_stable(pcp) __pcpu_size_call_return(this_cpu_read_stable_, pcp) +#ifdef CONFIG_USE_X86_SEG_SUPPORT + +#define __raw_cpu_read(qual, pcp) \ +({ \ + *(qual __my_cpu_type(pcp) *)__my_cpu_ptr(&(pcp)); \ +}) + +#define __raw_cpu_write(qual, pcp, val) \ +do { \ + *(qual __my_cpu_type(pcp) *)__my_cpu_ptr(&(pcp)) = (val); \ +} while (0) + +#define raw_cpu_read_1(pcp) __raw_cpu_read(, pcp) +#define raw_cpu_read_2(pcp) __raw_cpu_read(, pcp) +#define raw_cpu_read_4(pcp) __raw_cpu_read(, pcp) +#define raw_cpu_write_1(pcp, val) __raw_cpu_write(, pcp, val) +#define raw_cpu_write_2(pcp, val) __raw_cpu_write(, pcp, val) +#define raw_cpu_write_4(pcp, val) __raw_cpu_write(, pcp, val) + +#define this_cpu_read_1(pcp) __raw_cpu_read(volatile, pcp) +#define this_cpu_read_2(pcp) __raw_cpu_read(volatile, pcp) +#define this_cpu_read_4(pcp) __raw_cpu_read(volatile, pcp) +#define this_cpu_write_1(pcp, val) __raw_cpu_write(volatile, pcp, val) +#define this_cpu_write_2(pcp, val) __raw_cpu_write(volatile, pcp, val) +#define this_cpu_write_4(pcp, val) __raw_cpu_write(volatile, pcp, val) + +#ifdef CONFIG_X86_64 +#define raw_cpu_read_8(pcp) __raw_cpu_read(, pcp) +#define raw_cpu_write_8(pcp, val) __raw_cpu_write(, pcp, val) + +#define this_cpu_read_8(pcp) __raw_cpu_read(volatile, pcp) +#define this_cpu_write_8(pcp, val) __raw_cpu_write(volatile, pcp, val) +#endif + +#else /* CONFIG_USE_X86_SEG_SUPPORT */ + #define raw_cpu_read_1(pcp) percpu_from_op(1, , "mov", pcp) #define raw_cpu_read_2(pcp) percpu_from_op(2, , "mov", pcp) #define raw_cpu_read_4(pcp) percpu_from_op(4, , "mov", pcp) - #define raw_cpu_write_1(pcp, val) percpu_to_op(1, , "mov", (pcp), val) #define raw_cpu_write_2(pcp, val) percpu_to_op(2, , "mov", (pcp), val) #define raw_cpu_write_4(pcp, val) percpu_to_op(4, , "mov", (pcp), val) + +#define this_cpu_read_1(pcp) percpu_from_op(1, volatile, "mov", pcp) +#define this_cpu_read_2(pcp) percpu_from_op(2, volatile, "mov", pcp) +#define this_cpu_read_4(pcp) percpu_from_op(4, volatile, "mov", pcp) +#define this_cpu_write_1(pcp, val) percpu_to_op(1, volatile, "mov", (pcp), val) +#define this_cpu_write_2(pcp, val) percpu_to_op(2, volatile, "mov", (pcp), val) +#define this_cpu_write_4(pcp, val) percpu_to_op(4, volatile, "mov", (pcp), val) + +#ifdef CONFIG_X86_64 +#define raw_cpu_read_8(pcp) percpu_from_op(8, , "mov", pcp) +#define raw_cpu_write_8(pcp, val) percpu_to_op(8, , "mov", (pcp), val) + +#define this_cpu_read_8(pcp) percpu_from_op(8, volatile, "mov", pcp) +#define this_cpu_write_8(pcp, val) percpu_to_op(8, volatile, "mov", (pcp), val) +#endif + +#endif /* CONFIG_USE_X86_SEG_SUPPORT */ + #define raw_cpu_add_1(pcp, val) percpu_add_op(1, , (pcp), val) #define raw_cpu_add_2(pcp, val) percpu_add_op(2, , (pcp), val) #define raw_cpu_add_4(pcp, val) percpu_add_op(4, , (pcp), val) @@ -432,12 +485,6 @@ do { \ #define raw_cpu_xchg_2(pcp, val) raw_percpu_xchg_op(pcp, val) #define raw_cpu_xchg_4(pcp, val) raw_percpu_xchg_op(pcp, val) -#define 
this_cpu_read_1(pcp) percpu_from_op(1, volatile, "mov", pcp) -#define this_cpu_read_2(pcp) percpu_from_op(2, volatile, "mov", pcp) -#define this_cpu_read_4(pcp) percpu_from_op(4, volatile, "mov", pcp) -#define this_cpu_write_1(pcp, val) percpu_to_op(1, volatile, "mov", (pcp), val) -#define this_cpu_write_2(pcp, val) percpu_to_op(2, volatile, "mov", (pcp), val) -#define this_cpu_write_4(pcp, val) percpu_to_op(4, volatile, "mov", (pcp), val) #define this_cpu_add_1(pcp, val) percpu_add_op(1, volatile, (pcp), val) #define this_cpu_add_2(pcp, val) percpu_add_op(2, volatile, (pcp), val) #define this_cpu_add_4(pcp, val) percpu_add_op(4, volatile, (pcp), val) @@ -476,8 +523,6 @@ do { \ * 32 bit must fall back to generic operations. */ #ifdef CONFIG_X86_64 -#define raw_cpu_read_8(pcp) percpu_from_op(8, , "mov", pcp) -#define raw_cpu_write_8(pcp, val) percpu_to_op(8, , "mov", (pcp), val) #define raw_cpu_add_8(pcp, val) percpu_add_op(8, , (pcp), val) #define raw_cpu_and_8(pcp, val) percpu_to_op(8, , "and", (pcp), val) #define raw_cpu_or_8(pcp, val) percpu_to_op(8, , "or", (pcp), val) @@ -486,8 +531,6 @@ do { \ #define raw_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(8, , pcp, oval, nval) #define raw_cpu_try_cmpxchg_8(pcp, ovalp, nval) percpu_try_cmpxchg_op(8, , pcp, ovalp, nval) -#define this_cpu_read_8(pcp) percpu_from_op(8, volatile, "mov", pcp) -#define this_cpu_write_8(pcp, val) percpu_to_op(8, volatile, "mov", (pcp), val) #define this_cpu_add_8(pcp, val) percpu_add_op(8, volatile, (pcp), val) #define this_cpu_and_8(pcp, val) percpu_to_op(8, volatile, "and", (pcp), val) #define this_cpu_or_8(pcp, val) percpu_to_op(8, volatile, "or", (pcp), val) -- GitLab From e29aad08b1da7772b362537be32335c0394e65fe Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 9 Oct 2023 17:13:48 +0200 Subject: [PATCH 0004/1519] x86/percpu: Disable named address spaces for KASAN -fsanitize=kernel-address (KASAN) is at the moment incompatible with named address spaces - see GCC PR sanitizer/111736: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111736 GCC is doing a KASAN check on a percpu address which it shouldn't do, and didn't used to do because we did the access using inline asm. But now that GCC does the accesses as normal (albeit special address space) memory accesses, the KASAN code triggers on them too, and it all goes to hell in a handbasket very quickly. Those percpu accessor functions need to disable any KASAN checking or other sanitizer checking. Not on the percpu address, because that's not a "real" address, it's obviously just the offset from the segment register. And GCC should probably not have generated such code in the first place, so arguably this is a bug with -fsanitize=kernel-address. The patch also removes a stale dependency on CONFIG_SMP. Reported-by: kernel test robot Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Cc: Andy Lutomirski Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Josh Poimboeuf Link: https://lore.kernel.org/r/20231009151409.53656-1-ubizjak@gmail.com Closes: https://lore.kernel.org/oe-lkp/202310071301.a5113890-oliver.sang@intel.com --- arch/x86/Kconfig | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ecb256954351d..54e79d3061f96 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2393,7 +2393,12 @@ config CC_HAS_NAMED_AS config USE_X86_SEG_SUPPORT def_bool y - depends on CC_HAS_NAMED_AS && SMP + depends on CC_HAS_NAMED_AS + # + # -fsanitize=kernel-address (KASAN) is at the moment incompatible + # with named address spaces - see GCC PR sanitizer/111736. + # + depends on !KASAN config CC_HAS_SLS def_bool $(cc-option,-mharden-sls=all) -- GitLab From a048d3abae7c33f0a3f4575fab15ac5504d443f7 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Sun, 15 Oct 2023 22:24:39 +0200 Subject: [PATCH 0005/1519] x86/percpu: Rewrite arch_raw_cpu_ptr() to be easier for compilers to optimize Implement arch_raw_cpu_ptr() as a load from this_cpu_off and then add the ptr value to the base. This way, the compiler can propagate the addend to the following instruction and simplify address calculation. E.g.: address calculation in amd_pmu_enable_virt() improves from: 48 c7 c0 00 00 00 00 mov $0x0,%rax 87b7: R_X86_64_32S cpu_hw_events 65 48 03 05 00 00 00 add %gs:0x0(%rip),%rax 00 87bf: R_X86_64_PC32 this_cpu_off-0x4 48 c7 80 28 13 00 00 movq $0x0,0x1328(%rax) 00 00 00 00 to: 65 48 8b 05 00 00 00 mov %gs:0x0(%rip),%rax 00 8798: R_X86_64_PC32 this_cpu_off-0x4 48 c7 80 00 00 00 00 movq $0x0,0x0(%rax) 00 00 00 00 87a6: R_X86_64_32S cpu_hw_events+0x1328 The compiler also eliminates additional redundant loads from this_cpu_off, reducing the number of percpu offset reads from 1668 to 1646 on a test build, a -1.3% reduction. Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Cc: Andy Lutomirski Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Josh Poimboeuf Cc: Uros Bizjak Cc: Sean Christopherson Link: https://lore.kernel.org/r/20231015202523.189168-1-ubizjak@gmail.com --- arch/x86/include/asm/percpu.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 60ea7755c0fea..915675f3ad605 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -56,9 +56,11 @@ #define arch_raw_cpu_ptr(ptr) \ ({ \ unsigned long tcp_ptr__; \ - asm ("add " __percpu_arg(1) ", %0" \ + asm ("mov " __percpu_arg(1) ", %0" \ : "=r" (tcp_ptr__) \ - : "m" (__my_cpu_var(this_cpu_off)), "0" (ptr)); \ + : "m" (__my_cpu_var(this_cpu_off))); \ + \ + tcp_ptr__ += (unsigned long)(ptr); \ (typeof(*(ptr)) __kernel __force *)tcp_ptr__; \ }) #else /* CONFIG_SMP */ -- GitLab From 1d10f3aec2bb734b4b594afe8c1bd0aa656a7e4d Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Sun, 15 Oct 2023 22:24:40 +0200 Subject: [PATCH 0006/1519] x86/percpu: Use C for arch_raw_cpu_ptr(), to improve code generation Implement arch_raw_cpu_ptr() in C to allow the compiler to perform better optimizations, such as setting an appropriate base to compute the address. The compiler is free to choose either MOV or ADD from the this_cpu_off address to construct the optimal final address. There are some other issues when memory access to the percpu area is implemented with an asm.
Compilers can not eliminate asm common subexpressions over basic block boundaries, but are extremely good at optimizing memory access. By implementing arch_raw_cpu_ptr() in C, the compiler can eliminate additional redundant loads from this_cpu_off, further reducing the number of percpu offset reads from 1646 to 1631 on a test build, a -0.9% reduction. Co-developed-by: Nadav Amit Signed-off-by: Nadav Amit Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Cc: Andy Lutomirski Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Josh Poimboeuf Cc: Uros Bizjak Cc: Sean Christopherson Link: https://lore.kernel.org/r/20231015202523.189168-2-ubizjak@gmail.com --- arch/x86/include/asm/percpu.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 915675f3ad605..54746903b8c3c 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -49,6 +49,21 @@ #define __force_percpu_prefix "%%"__stringify(__percpu_seg)":" #define __my_cpu_offset this_cpu_read(this_cpu_off) +#ifdef CONFIG_USE_X86_SEG_SUPPORT +/* + * Efficient implementation for cases in which the compiler supports + * named address spaces. Allows the compiler to perform additional + * optimizations that can save more instructions. + */ +#define arch_raw_cpu_ptr(ptr) \ +({ \ + unsigned long tcp_ptr__; \ + tcp_ptr__ = __raw_cpu_read(, this_cpu_off); \ + \ + tcp_ptr__ += (unsigned long)(ptr); \ + (typeof(*(ptr)) __kernel __force *)tcp_ptr__; \ +}) +#else /* CONFIG_USE_X86_SEG_SUPPORT */ /* * Compared to the generic __my_cpu_offset version, the following * saves one instruction and avoids clobbering a temp register. @@ -63,6 +78,8 @@ tcp_ptr__ += (unsigned long)(ptr); \ (typeof(*(ptr)) __kernel __force *)tcp_ptr__; \ }) +#endif /* CONFIG_USE_X86_SEG_SUPPORT */ + #else /* CONFIG_SMP */ #define __percpu_seg_override #define __percpu_prefix "" -- GitLab From e39828d2c1c0781ccfcf742791daf88fdfa481ea Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 16 Oct 2023 22:07:30 +0200 Subject: [PATCH 0007/1519] x86/percpu: Use the correct asm operand modifier in percpu_stable_op() The "P" asm operand modifier is a x86 target-specific modifier. When used for a constant, it drops all syntax-specific prefixes and issues the bare constant. This modifier is not correct for address handling, in this case a generic "a" operand modifier should be used. The "a" asm operand modifier substitutes a memory reference, with the actual operand treated as address. For x86_64, when a symbol is provided, the "a" modifier emits "sym(%rip)" instead of "sym", enabling shorter %rip-relative addressing. Clang allows only "i" and "r" operand constraints with an "a" modifier, so the patch normalizes the modifier/constraint pair to "a"/"i" which is consistent between both compilers. The patch reduces code size of a test build by 4072 bytes: text data bss dec hex filename 25523268 4388300 808452 30720020 1d4c014 vmlinux-old.o 25519196 4388300 808452 30715948 1d4b02c vmlinux-new.o [ mingo: Changelog clarity. ] Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Cc: Andy Lutomirski Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Josh Poimboeuf Cc: Uros Bizjak Cc: Sean Christopherson Link: https://lore.kernel.org/r/20231016200755.287403-1-ubizjak@gmail.com --- arch/x86/include/asm/percpu.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 54746903b8c3c..ac3220aeb779a 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -194,9 +194,9 @@ do { \ #define percpu_stable_op(size, op, _var) \ ({ \ __pcpu_type_##size pfo_val__; \ - asm(__pcpu_op2_##size(op, __force_percpu_arg(P[var]), "%[val]") \ + asm(__pcpu_op2_##size(op, __force_percpu_arg(a[var]), "%[val]") \ : [val] __pcpu_reg_##size("=", pfo_val__) \ - : [var] "p" (&(_var))); \ + : [var] "i" (&(_var))); \ (typeof(_var))(unsigned long) pfo_val__; \ }) -- GitLab From 24b8a23638cbf92449c353f828b1d309548c78f4 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 18 Oct 2023 20:41:58 +0200 Subject: [PATCH 0008/1519] x86/fpu: Clean up FPU switching in the middle of task switching It happens to work, but it's very very wrong, because our 'current' macro is magic that is supposedly loading a stable value. It just happens to be not quite stable enough and the compilers re-load the value enough for this code to work. But it's wrong. The whole struct fpu *prev_fpu = &prev->fpu; thing in __switch_to() is pretty ugly. There's no reason why we should look at that 'prev_fpu' pointer there, or pass it down. And it only generates worse code, in how it loads 'current' when __switch_to() has the right task pointers. The attached patch not only cleans this up, it actually generates better code too: (a) it removes one push/pop pair at entry/exit because there's one less register used (no 'current') (b) it removes that pointless load of 'current' because it just uses the right argument: - movq %gs:pcpu_hot(%rip), %r12 - testq $16384, (%r12) + testq $16384, (%rdi) Signed-off-by: Linus Torvalds Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20231018184227.446318-1-ubizjak@gmail.com --- arch/x86/include/asm/fpu/sched.h | 10 ++++++---- arch/x86/kernel/process_32.c | 7 +++---- arch/x86/kernel/process_64.c | 7 +++---- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/arch/x86/include/asm/fpu/sched.h b/arch/x86/include/asm/fpu/sched.h index ca6e5e5f16b2e..c485f1944c5f8 100644 --- a/arch/x86/include/asm/fpu/sched.h +++ b/arch/x86/include/asm/fpu/sched.h @@ -37,10 +37,12 @@ extern void fpu_flush_thread(void); * The FPU context is only stored/restored for a user task and * PF_KTHREAD is used to distinguish between kernel and user threads. */ -static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu) +static inline void switch_fpu_prepare(struct task_struct *old, int cpu) { if (cpu_feature_enabled(X86_FEATURE_FPU) && - !(current->flags & (PF_KTHREAD | PF_USER_WORKER))) { + !(old->flags & (PF_KTHREAD | PF_USER_WORKER))) { + struct fpu *old_fpu = &old->thread.fpu; + save_fpregs_to_fpstate(old_fpu); /* * The save operation preserved register state, so the @@ -60,10 +62,10 @@ static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu) * Delay loading of the complete FPU state until the return to userland. * PKRU is handled separately. 
*/ -static inline void switch_fpu_finish(void) +static inline void switch_fpu_finish(struct task_struct *new) { if (cpu_feature_enabled(X86_FEATURE_FPU)) - set_thread_flag(TIF_NEED_FPU_LOAD); + set_tsk_thread_flag(new, TIF_NEED_FPU_LOAD); } #endif /* _ASM_X86_FPU_SCHED_H */ diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 708c87b88cc15..0917c7f25720b 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -156,13 +156,12 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) { struct thread_struct *prev = &prev_p->thread, *next = &next_p->thread; - struct fpu *prev_fpu = &prev->fpu; int cpu = smp_processor_id(); /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ - if (!test_thread_flag(TIF_NEED_FPU_LOAD)) - switch_fpu_prepare(prev_fpu, cpu); + if (!test_tsk_thread_flag(prev_p, TIF_NEED_FPU_LOAD)) + switch_fpu_prepare(prev_p, cpu); /* * Save away %gs. No need to save %fs, as it was saved on the @@ -209,7 +208,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) raw_cpu_write(pcpu_hot.current_task, next_p); - switch_fpu_finish(); + switch_fpu_finish(next_p); /* Load the Intel cache allocation PQR MSR. */ resctrl_sched_in(next_p); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 33b268747bb7b..1553e19904e05 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -562,14 +562,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) { struct thread_struct *prev = &prev_p->thread; struct thread_struct *next = &next_p->thread; - struct fpu *prev_fpu = &prev->fpu; int cpu = smp_processor_id(); WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) && this_cpu_read(pcpu_hot.hardirq_stack_inuse)); - if (!test_thread_flag(TIF_NEED_FPU_LOAD)) - switch_fpu_prepare(prev_fpu, cpu); + if (!test_tsk_thread_flag(prev_p, TIF_NEED_FPU_LOAD)) + switch_fpu_prepare(prev_p, cpu); /* We must save %fs and %gs before load_TLS() because * %fs and %gs may be cleared by load_TLS(). @@ -623,7 +622,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) raw_cpu_write(pcpu_hot.current_task, next_p); raw_cpu_write(pcpu_hot.top_of_stack, task_top_of_stack(next_p)); - switch_fpu_finish(); + switch_fpu_finish(next_p); /* Reload sp0. */ update_task_stack(next_p); -- GitLab From 39d64ee59ceee0fb61243eab3c4b7b4492f80df2 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Tue, 17 Oct 2023 18:27:32 +0200 Subject: [PATCH 0009/1519] x86/percpu: Correct PER_CPU_VAR() usage to include symbol and its addend The PER_CPU_VAR() macro should be applied to a symbol and its addend. Inconsistent usage is currently harmless, but needs to be corrected before %rip-relative addressing is introduced to the PER_CPU_VAR() macro. No functional changes intended. Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Cc: linux-kernel@vger.kernel.org Cc: Brian Gerst Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Sean Christopherson --- arch/x86/entry/calling.h | 2 +- arch/x86/entry/entry_32.S | 2 +- arch/x86/entry/entry_64.S | 2 +- arch/x86/kernel/head_64.S | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index f6907627172ba..47368ab0bda0e 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -173,7 +173,7 @@ For 32-bit we have the following conventions - kernel is built with .endm #define THIS_CPU_user_pcid_flush_mask \ - PER_CPU_VAR(cpu_tlbstate) + TLB_STATE_user_pcid_flush_mask + PER_CPU_VAR(cpu_tlbstate + TLB_STATE_user_pcid_flush_mask) .macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 6e6af42e044a2..d4e094b2c877f 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -305,7 +305,7 @@ .macro CHECK_AND_APPLY_ESPFIX #ifdef CONFIG_X86_ESPFIX32 #define GDT_ESPFIX_OFFSET (GDT_ENTRY_ESPFIX_SS * 8) -#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + GDT_ESPFIX_OFFSET +#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page + GDT_ESPFIX_OFFSET) ALTERNATIVE "jmp .Lend_\@", "", X86_BUG_ESPFIX diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index b940e928c8088..6d236652fceb6 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -252,7 +252,7 @@ SYM_FUNC_START(__switch_to_asm) #ifdef CONFIG_STACKPROTECTOR movq TASK_stack_canary(%rsi), %rbx - movq %rbx, PER_CPU_VAR(fixed_percpu_data) + FIXED_stack_canary + movq %rbx, PER_CPU_VAR(fixed_percpu_data + FIXED_stack_canary) #endif /* diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index ea6995920b7aa..bfe5ec2f4f83f 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -449,7 +449,7 @@ SYM_CODE_START(soft_restart_cpu) UNWIND_HINT_END_OF_STACK /* Find the idle task stack */ - movq PER_CPU_VAR(pcpu_hot) + X86_current_task, %rcx + movq PER_CPU_VAR(pcpu_hot + X86_current_task), %rcx movq TASK_threadsp(%rcx), %rsp jmp .Ljump_to_C_code -- GitLab From aa47f90cd4331e1809d56c72fcbdbbe0a85e5992 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Tue, 17 Oct 2023 18:27:33 +0200 Subject: [PATCH 0010/1519] x86/percpu, xen: Correct PER_CPU_VAR() usage to include symbol and its addend The PER_CPU_VAR() macro should be applied to a symbol and its addend. Inconsistent usage is currently harmless, but needs to be corrected before %rip-relative addressing is introduced to the PER_CPU_VAR() macro. No functional changes intended. Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Reviewed-by: Boris Ostrovsky Cc: linux-kernel@vger.kernel.org Cc: Brian Gerst Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Sean Christopherson --- arch/x86/xen/xen-asm.S | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S index 9e5e680087853..448958ddbaf87 100644 --- a/arch/x86/xen/xen-asm.S +++ b/arch/x86/xen/xen-asm.S @@ -28,7 +28,7 @@ * non-zero. 
*/ SYM_FUNC_START(xen_irq_disable_direct) - movb $1, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask + movb $1, PER_CPU_VAR(xen_vcpu_info + XEN_vcpu_info_mask) RET SYM_FUNC_END(xen_irq_disable_direct) @@ -69,7 +69,7 @@ SYM_FUNC_END(check_events) SYM_FUNC_START(xen_irq_enable_direct) FRAME_BEGIN /* Unmask events */ - movb $0, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask + movb $0, PER_CPU_VAR(xen_vcpu_info + XEN_vcpu_info_mask) /* * Preempt here doesn't matter because that will deal with any @@ -78,7 +78,7 @@ SYM_FUNC_START(xen_irq_enable_direct) */ /* Test for pending */ - testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending + testb $0xff, PER_CPU_VAR(xen_vcpu_info + XEN_vcpu_info_pending) jz 1f call check_events @@ -97,7 +97,7 @@ SYM_FUNC_END(xen_irq_enable_direct) * x86 use opposite senses (mask vs enable). */ SYM_FUNC_START(xen_save_fl_direct) - testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask + testb $0xff, PER_CPU_VAR(xen_vcpu_info + XEN_vcpu_info_mask) setz %ah addb %ah, %ah RET @@ -113,7 +113,7 @@ SYM_FUNC_END(xen_read_cr2); SYM_FUNC_START(xen_read_cr2_direct) FRAME_BEGIN - _ASM_MOV PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_arch_cr2, %_ASM_AX + _ASM_MOV PER_CPU_VAR(xen_vcpu_info + XEN_vcpu_info_arch_cr2), %_ASM_AX FRAME_END RET SYM_FUNC_END(xen_read_cr2_direct); -- GitLab From 59bec00ace28d565ae0a68b23063ef3b961d82d5 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Tue, 17 Oct 2023 18:27:34 +0200 Subject: [PATCH 0011/1519] x86/percpu: Introduce %rip-relative addressing to PER_CPU_VAR() Introduce x86_64 %rip-relative addressing to the PER_CPU_VAR() macro. Instructions using %rip-relative address operand are one byte shorter than their absolute address counterparts and are also compatible with position independent executable (-fpie) builds. The patch reduces code size of a test kernel build by 150 bytes. The PER_CPU_VAR() macro is intended to be applied to a symbol and should not be used with register operands. Introduce the new __percpu macro and use it in cmpxchg{8,16}b_emu.S instead. Also add a missing function comment to this_cpu_cmpxchg8b_emu(). No functional changes intended. Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Cc: linux-kernel@vger.kernel.org Cc: Brian Gerst Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Sean Christopherson --- arch/x86/include/asm/percpu.h | 12 ++++++++---- arch/x86/lib/cmpxchg16b_emu.S | 12 ++++++------ arch/x86/lib/cmpxchg8b_emu.S | 30 +++++++++++++++++++++--------- 3 files changed, 35 insertions(+), 19 deletions(-) diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index ac3220aeb779a..bbcc1ca737f05 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -4,17 +4,21 @@ #ifdef CONFIG_X86_64 #define __percpu_seg gs +#define __percpu_rel (%rip) #else #define __percpu_seg fs +#define __percpu_rel #endif #ifdef __ASSEMBLY__ #ifdef CONFIG_SMP -#define PER_CPU_VAR(var) %__percpu_seg:var -#else /* ! 
SMP */ -#define PER_CPU_VAR(var) var -#endif /* SMP */ +#define __percpu %__percpu_seg: +#else +#define __percpu +#endif + +#define PER_CPU_VAR(var) __percpu(var)__percpu_rel #ifdef CONFIG_X86_64_SMP #define INIT_PER_CPU_VAR(var) init_per_cpu__##var diff --git a/arch/x86/lib/cmpxchg16b_emu.S b/arch/x86/lib/cmpxchg16b_emu.S index 6962df3157938..4fb44894ad875 100644 --- a/arch/x86/lib/cmpxchg16b_emu.S +++ b/arch/x86/lib/cmpxchg16b_emu.S @@ -23,14 +23,14 @@ SYM_FUNC_START(this_cpu_cmpxchg16b_emu) cli /* if (*ptr == old) */ - cmpq PER_CPU_VAR(0(%rsi)), %rax + cmpq __percpu (%rsi), %rax jne .Lnot_same - cmpq PER_CPU_VAR(8(%rsi)), %rdx + cmpq __percpu 8(%rsi), %rdx jne .Lnot_same /* *ptr = new */ - movq %rbx, PER_CPU_VAR(0(%rsi)) - movq %rcx, PER_CPU_VAR(8(%rsi)) + movq %rbx, __percpu (%rsi) + movq %rcx, __percpu 8(%rsi) /* set ZF in EFLAGS to indicate success */ orl $X86_EFLAGS_ZF, (%rsp) @@ -42,8 +42,8 @@ SYM_FUNC_START(this_cpu_cmpxchg16b_emu) /* *ptr != old */ /* old = *ptr */ - movq PER_CPU_VAR(0(%rsi)), %rax - movq PER_CPU_VAR(8(%rsi)), %rdx + movq __percpu (%rsi), %rax + movq __percpu 8(%rsi), %rdx /* clear ZF in EFLAGS to indicate failure */ andl $(~X86_EFLAGS_ZF), (%rsp) diff --git a/arch/x86/lib/cmpxchg8b_emu.S b/arch/x86/lib/cmpxchg8b_emu.S index 49805257b1255..8632d7dd1f004 100644 --- a/arch/x86/lib/cmpxchg8b_emu.S +++ b/arch/x86/lib/cmpxchg8b_emu.S @@ -24,12 +24,12 @@ SYM_FUNC_START(cmpxchg8b_emu) pushfl cli - cmpl 0(%esi), %eax + cmpl (%esi), %eax jne .Lnot_same cmpl 4(%esi), %edx jne .Lnot_same - movl %ebx, 0(%esi) + movl %ebx, (%esi) movl %ecx, 4(%esi) orl $X86_EFLAGS_ZF, (%esp) @@ -38,7 +38,7 @@ SYM_FUNC_START(cmpxchg8b_emu) RET .Lnot_same: - movl 0(%esi), %eax + movl (%esi), %eax movl 4(%esi), %edx andl $(~X86_EFLAGS_ZF), (%esp) @@ -53,18 +53,30 @@ EXPORT_SYMBOL(cmpxchg8b_emu) #ifndef CONFIG_UML +/* + * Emulate 'cmpxchg8b %fs:(%rsi)' + * + * Inputs: + * %esi : memory location to compare + * %eax : low 32 bits of old value + * %edx : high 32 bits of old value + * %ebx : low 32 bits of new value + * %ecx : high 32 bits of new value + * + * Notably this is not LOCK prefixed and is not safe against NMIs + */ SYM_FUNC_START(this_cpu_cmpxchg8b_emu) pushfl cli - cmpl PER_CPU_VAR(0(%esi)), %eax + cmpl __percpu (%esi), %eax jne .Lnot_same2 - cmpl PER_CPU_VAR(4(%esi)), %edx + cmpl __percpu 4(%esi), %edx jne .Lnot_same2 - movl %ebx, PER_CPU_VAR(0(%esi)) - movl %ecx, PER_CPU_VAR(4(%esi)) + movl %ebx, __percpu (%esi) + movl %ecx, __percpu 4(%esi) orl $X86_EFLAGS_ZF, (%esp) @@ -72,8 +84,8 @@ SYM_FUNC_START(this_cpu_cmpxchg8b_emu) RET .Lnot_same2: - movl PER_CPU_VAR(0(%esi)), %eax - movl PER_CPU_VAR(4(%esi)), %edx + movl __percpu (%esi), %eax + movl __percpu 4(%esi), %edx andl $(~X86_EFLAGS_ZF), (%esp) -- GitLab From ed2f752e0e0a21d941ca0ee539ef3d4cd576bc5e Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Fri, 20 Oct 2023 18:19:20 +0200 Subject: [PATCH 0012/1519] x86/percpu: Introduce const-qualified const_pcpu_hot to micro-optimize code generation Some variables in pcpu_hot, currently current_task and top_of_stack are actually per-thread variables implemented as per-CPU variables and thus stable for the duration of the respective task. There is already an attempt to eliminate redundant reads from these variables using this_cpu_read_stable() asm macro, which hides the dependency on the read memory address. However, the compiler has limited ability to eliminate asm common subexpressions, so this approach results in a limited success. 
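A minimal sketch of the limitation (illustrative snippet only; the reads mirror what get_current() does with the accessors defined earlier in this series):

	/* Two stable reads of the same per-CPU slot: */
	struct task_struct *a = this_cpu_read_stable(pcpu_hot.current_task);
	struct task_struct *b = this_cpu_read_stable(pcpu_hot.current_task);
	/*
	 * Each use expands to its own asm() block, and the compiler rarely
	 * proves two asm() blocks equivalent, so both %gs-prefixed loads
	 * usually survive in the generated code.
	 */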
The solution is to allow more aggressive elimination by aliasing pcpu_hot into a const-qualified const_pcpu_hot, and to read stable per-CPU variables from this constant copy. The current per-CPU infrastructure does not support reads from const-qualified variables. However, when the compiler supports segment qualifiers, it is possible to declare the const-aliased variable in the relevant named address space. The compiler considers access to the variable, declared in this way, as a read from a constant location, and will optimize reads from the variable accordingly. By implementing constant-qualified const_pcpu_hot, the compiler can eliminate redundant reads from the constant variables, reducing the number of loads from current_task from 3766 to 3217 on a test build, a -14.6% reduction. The reduction of loads translates to the following code savings: text data bss dec hex filename 25,477,353 4389456 808452 30675261 1d4113d vmlinux-old.o 25,476,074 4389440 808452 30673966 1d40c2e vmlinux-new.o representing a code size reduction of -1279 bytes. [ mingo: Updated the changelog, EXPORT(const_pcpu_hot). ] Co-developed-by: Nadav Amit Signed-off-by: Nadav Amit Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20231020162004.135244-1-ubizjak@gmail.com --- arch/x86/include/asm/current.h | 7 +++++++ arch/x86/include/asm/percpu.h | 6 +++--- arch/x86/include/asm/processor.h | 3 +++ arch/x86/kernel/cpu/common.c | 1 + arch/x86/kernel/vmlinux.lds.S | 1 + include/linux/compiler.h | 2 +- 6 files changed, 16 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/current.h b/arch/x86/include/asm/current.h index a1168e7b69e5b..0538d24366733 100644 --- a/arch/x86/include/asm/current.h +++ b/arch/x86/include/asm/current.h @@ -36,8 +36,15 @@ static_assert(sizeof(struct pcpu_hot) == 64); DECLARE_PER_CPU_ALIGNED(struct pcpu_hot, pcpu_hot); +/* const-qualified alias to pcpu_hot, aliased by linker. */ +DECLARE_PER_CPU_ALIGNED(const struct pcpu_hot __percpu_seg_override, + const_pcpu_hot); + static __always_inline struct task_struct *get_current(void) { + if (IS_ENABLED(CONFIG_USE_X86_SEG_SUPPORT)) + return const_pcpu_hot.current_task; + return this_cpu_read_stable(pcpu_hot.current_task); } diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index bbcc1ca737f05..b86b27d15e528 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -413,9 +413,9 @@ do { \ * accessed while this_cpu_read_stable() allows the value to be cached. * this_cpu_read_stable() is more efficient and can be used if its value * is guaranteed to be valid across cpus. The current users include - * get_current() and get_thread_info() both of which are actually - * per-thread variables implemented as per-cpu variables and thus - * stable for the duration of the respective task. + * pcpu_hot.current_task and pcpu_hot.top_of_stack, both of which are + * actually per-thread variables implemented as per-CPU variables and + * thus stable for the duration of the respective task. 
*/ #define this_cpu_read_stable_1(pcp) percpu_stable_op(1, "mov", pcp) #define this_cpu_read_stable_2(pcp) percpu_stable_op(2, "mov", pcp) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 0086920cda061..b47a9975233f6 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -518,6 +518,9 @@ static __always_inline unsigned long current_top_of_stack(void) * and around vm86 mode and sp0 on x86_64 is special because of the * entry trampoline. */ + if (IS_ENABLED(CONFIG_USE_X86_SEG_SUPPORT)) + return pcpu_hot.top_of_stack; + return this_cpu_read_stable(pcpu_hot.top_of_stack); } diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 382d4e6b848d2..4cc0ab0dfbb54 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -2051,6 +2051,7 @@ DEFINE_PER_CPU_ALIGNED(struct pcpu_hot, pcpu_hot) = { .top_of_stack = TOP_OF_INIT_STACK, }; EXPORT_PER_CPU_SYMBOL(pcpu_hot); +EXPORT_PER_CPU_SYMBOL(const_pcpu_hot); #ifdef CONFIG_X86_64 DEFINE_PER_CPU_FIRST(struct fixed_percpu_data, diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 83d41c2601d7b..e701f2dcea295 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -46,6 +46,7 @@ ENTRY(phys_startup_64) #endif jiffies = jiffies_64; +const_pcpu_hot = pcpu_hot; #if defined(CONFIG_X86_64) /* diff --git a/include/linux/compiler.h b/include/linux/compiler.h index d7779a18b24fc..bf9815eaf4aab 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -212,7 +212,7 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, */ #define ___ADDRESSABLE(sym, __attrs) \ static void * __used __attrs \ - __UNIQUE_ID(__PASTE(__addressable_,sym)) = (void *)&sym; + __UNIQUE_ID(__PASTE(__addressable_,sym)) = (void *)(uintptr_t)&sym; #define __ADDRESSABLE(sym) \ ___ADDRESSABLE(sym, __section(".discard.addressable")) -- GitLab From 0548eb067ed664b93043e033295ca71e3e706245 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Tue, 24 Oct 2023 16:28:14 +0200 Subject: [PATCH 0013/1519] x86/percpu: Return correct variable from current_top_of_stack() current_top_of_stack() should return a variable from the __seg_gs-qualified named address space when CONFIG_USE_X86_SEG_SUPPORT=y is enabled. Fixes: ed2f752e0e0a ("x86/percpu: Introduce const-qualified const_pcpu_hot to micro-optimize code generation") Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20231024142830.3226-1-ubizjak@gmail.com Cc: Andy Lutomirski Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Josh Poimboeuf Cc: Sean Christopherson --- arch/x86/include/asm/processor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index b47a9975233f6..f20e876326690 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -518,7 +518,7 @@ static __always_inline unsigned long current_top_of_stack(void) * entry trampoline.
*/ if (IS_ENABLED(CONFIG_USE_X86_SEG_SUPPORT)) - return pcpu_hot.top_of_stack; + return const_pcpu_hot.top_of_stack; return this_cpu_read_stable(pcpu_hot.top_of_stack); } -- GitLab From 8e5647a723c49d73b9f108a8bb38e8c29d3948ea Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Wed, 22 Nov 2023 10:37:00 -0600 Subject: [PATCH 0014/1519] x86/mm: Ensure input to pfn_to_kaddr() is treated as a 64-bit type On 64-bit platforms, the pfn_to_kaddr() macro requires that the input value is 64 bits in order to ensure that valid address bits don't get lost when shifting that input by PAGE_SHIFT to calculate the physical address to provide a virtual address for. One such example is in pvalidate_pages() (used by SEV-SNP guests), where the GFN in the struct used for page-state change requests is a 40-bit bit-field, so attempts to pass this GFN field directly into pfn_to_kaddr() ends up causing guest crashes when dealing with addresses above the 1TB range due to the above. Fix this issue with SEV-SNP guests, as well as any similar cases that might cause issues in current/future code, by using an inline function, instead of a macro, so that the input is implicitly cast to the expected 64-bit input type prior to performing the shift operation. While it might be argued that the issue is on the caller side, other archs/macros have taken similar approaches to deal with instances like this, such as ARM explicitly casting the input to phys_addr_t: e48866647b48 ("ARM: 8396/1: use phys_addr_t in pfn_to_kaddr()") A C inline function is even better though. [ mingo: Refined the changelog some more & added __always_inline. ] Fixes: 6c3211796326 ("x86/sev: Add SNP-specific unaccepted memory support") Suggested-by: Dave Hansen Suggested-by: H. Peter Anvin Signed-off-by: Michael Roth Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20231122163700.400507-1-michael.roth@amd.com Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Rik van Riel Cc: Linus Torvalds --- arch/x86/include/asm/page.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h index d18e5c332cb9f..1b93ff80b43bc 100644 --- a/arch/x86/include/asm/page.h +++ b/arch/x86/include/asm/page.h @@ -66,10 +66,14 @@ static inline void copy_user_page(void *to, void *from, unsigned long vaddr, * virt_addr_valid(kaddr) returns true. */ #define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) -#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) extern bool __virt_addr_valid(unsigned long kaddr); #define virt_addr_valid(kaddr) __virt_addr_valid((unsigned long) (kaddr)) +static __always_inline void *pfn_to_kaddr(unsigned long pfn) +{ + return __va(pfn << PAGE_SHIFT); +} + static __always_inline u64 __canonical_address(u64 vaddr, u8 vaddr_bits) { return ((s64)vaddr << (64 - vaddr_bits)) >> (64 - vaddr_bits); -- GitLab From 43bda69ed9e3b86d0ba5ff9256e437d50074d7d5 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Sun, 5 Nov 2023 22:34:35 +0100 Subject: [PATCH 0015/1519] x86/percpu: Define PER_CPU_VAR macro also for !__ASSEMBLY__ Some C source files define 'asm' statements that use PER_CPU_VAR, so make PER_CPU_VAR macro available also without __ASSEMBLY__. 
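A minimal sketch of the kind of C-file use site this enables (the counter name is hypothetical; assumes x86-64 with CONFIG_SMP, where the macro expands to a %gs-prefixed, %rip-relative operand):

	/* The macro can now be stringified into an asm() template from C: */
	asm volatile("incq " __stringify(PER_CPU_VAR(example_counter)));
	/* expands to:  incq %gs:(example_counter)(%rip) */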
Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20231105213731.1878100-2-ubizjak@gmail.com --- arch/x86/include/asm/percpu.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index b86b27d15e528..0f12b2004b942 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -84,10 +84,15 @@ }) #endif /* CONFIG_USE_X86_SEG_SUPPORT */ +#define PER_CPU_VAR(var) %__percpu_seg:(var)__percpu_rel + #else /* CONFIG_SMP */ #define __percpu_seg_override #define __percpu_prefix "" #define __force_percpu_prefix "" + +#define PER_CPU_VAR(var) (var)__percpu_rel + #endif /* CONFIG_SMP */ #define __my_cpu_type(var) typeof(var) __percpu_seg_override -- GitLab From 17bce3b2ae2d52e8c5c12274ce4c3a631ce9e66b Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Sun, 5 Nov 2023 22:34:36 +0100 Subject: [PATCH 0016/1519] x86/callthunks: Handle %rip-relative relocations in call thunk template Contrary to alternatives, relocations are currently not supported in call thunk templates. Re-use the existing infrastructure from alternative.c to allow %rip-relative relocations when copying the call thunk template from its storage location. The patch allows unification of ASM_INCREMENT_CALL_DEPTH, which already uses the PER_CPU_VAR macro, with INCREMENT_CALL_DEPTH, used in the call thunk template, which is currently limited to using an absolute address. Reuse the existing relocation infrastructure from alternative.c, as suggested by Peter Zijlstra. Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20231105213731.1878100-3-ubizjak@gmail.com --- arch/x86/include/asm/text-patching.h | 2 ++ arch/x86/kernel/alternative.c | 3 +-- arch/x86/kernel/callthunks.c | 32 ++++++++++++++++++++++------ 3 files changed, 28 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h index 29832c338cdc5..ba8d900f3ebee 100644 --- a/arch/x86/include/asm/text-patching.h +++ b/arch/x86/include/asm/text-patching.h @@ -18,6 +18,8 @@ static inline void apply_paravirt(struct paravirt_patch_site *start, #define __parainstructions_end NULL #endif +void apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len); /* * Currently, the max observed size in the kernel code is * JUMP_LABEL_NOP_SIZE/RELATIVEJUMP_SIZE, which are 5. diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index a5ead6a6d233b..aa864157f57ea 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -325,8 +325,7 @@ bool need_reloc(unsigned long offset, u8 *src, size_t src_len) return (target < src || target > src + src_len); } -static void __init_or_module noinline -apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len) +void apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len) { int prev, target = 0; diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c index c06bfc086565d..f56fa303d6431 100644 --- a/arch/x86/kernel/callthunks.c +++ b/arch/x86/kernel/callthunks.c @@ -24,6 +24,8 @@ static int __initdata_or_module debug_callthunks; +#define MAX_PATCH_LEN (255-1) #define prdbg(fmt, args...) 
\ do { \ if (debug_callthunks) \ @@ -184,10 +186,15 @@ static const u8 nops[] = { static void *patch_dest(void *dest, bool direct) { unsigned int tsize = SKL_TMPL_SIZE; + u8 insn_buff[MAX_PATCH_LEN]; u8 *pad = dest - tsize; + memcpy(insn_buff, skl_call_thunk_template, tsize); + apply_relocation(insn_buff, tsize, pad, + skl_call_thunk_template, tsize); + /* Already patched? */ - if (!bcmp(pad, skl_call_thunk_template, tsize)) + if (!bcmp(pad, insn_buff, tsize)) return pad; /* Ensure there are nops */ @@ -197,9 +204,9 @@ static void *patch_dest(void *dest, bool direct) } if (direct) - memcpy(pad, skl_call_thunk_template, tsize); + memcpy(pad, insn_buff, tsize); else - text_poke_copy_locked(pad, skl_call_thunk_template, tsize, true); + text_poke_copy_locked(pad, insn_buff, tsize, true); return pad; } @@ -297,20 +304,27 @@ void *callthunks_translate_call_dest(void *dest) static bool is_callthunk(void *addr) { unsigned int tmpl_size = SKL_TMPL_SIZE; - void *tmpl = skl_call_thunk_template; + u8 insn_buff[MAX_PATCH_LEN]; unsigned long dest; + u8 *pad; dest = roundup((unsigned long)addr, CONFIG_FUNCTION_ALIGNMENT); if (!thunks_initialized || skip_addr((void *)dest)) return false; - return !bcmp((void *)(dest - tmpl_size), tmpl, tmpl_size); + pad = (void *)(dest - tmpl_size); + + memcpy(insn_buff, skl_call_thunk_template, tmpl_size); + apply_relocation(insn_buff, tmpl_size, pad, + skl_call_thunk_template, tmpl_size); + + return !bcmp(pad, insn_buff, tmpl_size); } int x86_call_depth_emit_accounting(u8 **pprog, void *func) { unsigned int tmpl_size = SKL_TMPL_SIZE; - void *tmpl = skl_call_thunk_template; + u8 insn_buff[MAX_PATCH_LEN]; if (!thunks_initialized) return 0; @@ -319,7 +333,11 @@ int x86_call_depth_emit_accounting(u8 **pprog, void *func) if (func && is_callthunk(func)) return 0; - memcpy(*pprog, tmpl, tmpl_size); + memcpy(insn_buff, skl_call_thunk_template, tmpl_size); + apply_relocation(insn_buff, tmpl_size, *pprog, + skl_call_thunk_template, tmpl_size); + + memcpy(*pprog, insn_buff, tmpl_size); *pprog += tmpl_size; return tmpl_size; } -- GitLab From 2adeed985a42ff3334e6898c8c0827e7626c1336 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Sun, 5 Nov 2023 22:34:37 +0100 Subject: [PATCH 0017/1519] x86/callthunks: Fix and unify call thunks assembly snippets Currently, the thunk debug macros explicitly define the %gs: segment register prefix for their percpu variables. This is not compatible with !CONFIG_SMP, which requires non-prefixed percpu variables. Fix the call thunk debug macros to use the PER_CPU_VAR macro from percpu.h to conditionally use the %gs: segment register prefix, depending on CONFIG_SMP. Finally, unify the ASM_-prefixed assembly macros with their non-prefixed variants. With support for %rip-relative relocations in place, call thunk templates allow %rip-relative addressing, so the unified assembly snippet can be used everywhere.
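For illustration, a sketch of what the unified INCREMENT_CALL_DEPTH snippet now expands to (derived from the PER_CPU_VAR() definitions introduced earlier in this series; x86-64 assumed):

	sarq $5, PER_CPU_VAR(pcpu_hot + X86_call_depth)
	# CONFIG_SMP:   sarq $5, %gs:(pcpu_hot + X86_call_depth)(%rip)
	# !CONFIG_SMP:  sarq $5, (pcpu_hot + X86_call_depth)(%rip)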
Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20231105213731.1878100-4-ubizjak@gmail.com --- arch/x86/include/asm/nospec-branch.h | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index c55cc243592e9..65fbf6b853afa 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -59,13 +59,13 @@ #ifdef CONFIG_CALL_THUNKS_DEBUG # define CALL_THUNKS_DEBUG_INC_CALLS \ - incq %gs:__x86_call_count; + incq PER_CPU_VAR(__x86_call_count); # define CALL_THUNKS_DEBUG_INC_RETS \ - incq %gs:__x86_ret_count; + incq PER_CPU_VAR(__x86_ret_count); # define CALL_THUNKS_DEBUG_INC_STUFFS \ - incq %gs:__x86_stuffs_count; + incq PER_CPU_VAR(__x86_stuffs_count); # define CALL_THUNKS_DEBUG_INC_CTXSW \ - incq %gs:__x86_ctxsw_count; + incq PER_CPU_VAR(__x86_ctxsw_count); #else # define CALL_THUNKS_DEBUG_INC_CALLS # define CALL_THUNKS_DEBUG_INC_RETS @@ -80,9 +80,6 @@ #define CREDIT_CALL_DEPTH \ movq $-1, PER_CPU_VAR(pcpu_hot + X86_call_depth); -#define ASM_CREDIT_CALL_DEPTH \ - movq $-1, PER_CPU_VAR(pcpu_hot + X86_call_depth); - #define RESET_CALL_DEPTH \ xor %eax, %eax; \ bts $63, %rax; \ @@ -95,20 +92,14 @@ CALL_THUNKS_DEBUG_INC_CALLS #define INCREMENT_CALL_DEPTH \ - sarq $5, %gs:pcpu_hot + X86_call_depth; \ - CALL_THUNKS_DEBUG_INC_CALLS - -#define ASM_INCREMENT_CALL_DEPTH \ sarq $5, PER_CPU_VAR(pcpu_hot + X86_call_depth); \ CALL_THUNKS_DEBUG_INC_CALLS #else #define CREDIT_CALL_DEPTH -#define ASM_CREDIT_CALL_DEPTH #define RESET_CALL_DEPTH -#define INCREMENT_CALL_DEPTH -#define ASM_INCREMENT_CALL_DEPTH #define RESET_CALL_DEPTH_FROM_CALL +#define INCREMENT_CALL_DEPTH #endif /* @@ -158,7 +149,7 @@ jnz 771b; \ /* barrier for jnz misprediction */ \ lfence; \ - ASM_CREDIT_CALL_DEPTH \ + CREDIT_CALL_DEPTH \ CALL_THUNKS_DEBUG_INC_CTXSW #else /* @@ -325,7 +316,7 @@ .macro CALL_DEPTH_ACCOUNT #ifdef CONFIG_CALL_DEPTH_TRACKING ALTERNATIVE "", \ - __stringify(ASM_INCREMENT_CALL_DEPTH), X86_FEATURE_CALL_DEPTH + __stringify(INCREMENT_CALL_DEPTH), X86_FEATURE_CALL_DEPTH #endif .endm -- GitLab From 0978d64f9406122c369d5f46e1eb855646f6c32c Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Fri, 3 Nov 2023 11:48:22 +0100 Subject: [PATCH 0018/1519] x86/acpi: Use %rip-relative addressing in wakeup_64.S This is a "nice-to-have" change with minor code generation benefits: - Instruction with %rip-relative address operand is one byte shorter than its absolute address counterpart, - it is also compatible with position independent executable (-fpie) builds, - it is also consistent with what the compiler emits by default when a symbol is accessed. No functional changes intended. Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Acked-by: Rafael J. 
Wysocki Link: https://lore.kernel.org/r/20231103104900.409470-1-ubizjak@gmail.com --- arch/x86/kernel/acpi/wakeup_64.S | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S index d5d8a352eafaa..94ff83f3d3fe9 100644 --- a/arch/x86/kernel/acpi/wakeup_64.S +++ b/arch/x86/kernel/acpi/wakeup_64.S @@ -17,7 +17,7 @@ * Hooray, we are in Long 64-bit mode (but still running in low memory) */ SYM_FUNC_START(wakeup_long64) - movq saved_magic, %rax + movq saved_magic(%rip), %rax movq $0x123456789abcdef0, %rdx cmpq %rdx, %rax je 2f @@ -33,14 +33,14 @@ SYM_FUNC_START(wakeup_long64) movw %ax, %es movw %ax, %fs movw %ax, %gs - movq saved_rsp, %rsp + movq saved_rsp(%rip), %rsp - movq saved_rbx, %rbx - movq saved_rdi, %rdi - movq saved_rsi, %rsi - movq saved_rbp, %rbp + movq saved_rbx(%rip), %rbx + movq saved_rdi(%rip), %rdi + movq saved_rsi(%rip), %rsi + movq saved_rbp(%rip), %rbp - movq saved_rip, %rax + movq saved_rip(%rip), %rax ANNOTATE_RETPOLINE_SAFE jmp *%rax SYM_FUNC_END(wakeup_long64) @@ -72,11 +72,11 @@ SYM_FUNC_START(do_suspend_lowlevel) movq $.Lresume_point, saved_rip(%rip) - movq %rsp, saved_rsp - movq %rbp, saved_rbp - movq %rbx, saved_rbx - movq %rdi, saved_rdi - movq %rsi, saved_rsi + movq %rsp, saved_rsp(%rip) + movq %rbp, saved_rbp(%rip) + movq %rbx, saved_rbx(%rip) + movq %rdi, saved_rdi(%rip) + movq %rsi, saved_rsi(%rip) addq $8, %rsp movl $3, %edi -- GitLab From 6724ba89e0b03667d56616614f55e1f772d38fdb Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 30 Nov 2023 20:15:51 +0100 Subject: [PATCH 0019/1519] x86/callthunks: Mark apply_relocation() as __init_or_module Do it like the rest of the methods using it. Signed-off-by: Ingo Molnar Cc: Uros Bizjak Link: https://lore.kernel.org/r/20231105213731.1878100-3-ubizjak@gmail.com --- arch/x86/include/asm/text-patching.h | 2 +- arch/x86/kernel/alternative.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h index ba8d900f3ebee..fb338f00d3834 100644 --- a/arch/x86/include/asm/text-patching.h +++ b/arch/x86/include/asm/text-patching.h @@ -18,7 +18,7 @@ static inline void apply_paravirt(struct paravirt_patch_site *start, #define __parainstructions_end NULL #endif -void apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len); +extern void apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len); /* * Currently, the max observed size in the kernel code is diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index aa864157f57ea..50523713989b2 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -325,7 +325,7 @@ bool need_reloc(unsigned long offset, u8 *src, size_t src_len) return (target < src || target > src + src_len); } -void apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len) +void __init_or_module apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len) { int prev, target = 0; -- GitLab From 4604c052b84d66407f5e68045a1939685eac401e Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Thu, 30 Nov 2023 17:27:35 +0100 Subject: [PATCH 0020/1519] x86/percpu: Declare const_pcpu_hot as extern const variable const_pcpu_hot is aliased by linker to pcpu_hot, so there is no need to use the DECLARE_PER_CPU_ALIGNED() macro. Also, declare const_pcpu_hot as extern to avoid allocating storage space for the aliased structure. 
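As a sketch of the resulting pattern (the linker-script line shown here
is illustrative; the actual alias statement lives outside this excerpt):

	/* C sources only reference the symbol ... */
	extern const struct pcpu_hot __percpu_seg_override const_pcpu_hot;

	/* ... and the linker binds it to the existing object: */
	const_pcpu_hot = pcpu_hot;

Both names then refer to the same per-CPU storage, with no second copy
emitted.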
Fixes: ed2f752e0e0a ("x86/percpu: Introduce const-qualified const_pcpu_hot to micro-optimize code generation") Reported-by: kernel test robot Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20231130162949.83518-1-ubizjak@gmail.com Closes: https://lore.kernel.org/oe-kbuild-all/202311302257.tSFtZnly-lkp@intel.com/ --- arch/x86/include/asm/current.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/include/asm/current.h b/arch/x86/include/asm/current.h index 0538d24366733..9fbd7cb2dc866 100644 --- a/arch/x86/include/asm/current.h +++ b/arch/x86/include/asm/current.h @@ -37,8 +37,7 @@ static_assert(sizeof(struct pcpu_hot) == 64); DECLARE_PER_CPU_ALIGNED(struct pcpu_hot, pcpu_hot); /* const-qualified alias to pcpu_hot, aliased by linker. */ -DECLARE_PER_CPU_ALIGNED(const struct pcpu_hot __percpu_seg_override, - const_pcpu_hot); +extern const struct pcpu_hot __percpu_seg_override const_pcpu_hot; static __always_inline struct task_struct *get_current(void) { -- GitLab From 9d1c8f21533729b6ead531b676fa7d327cf00819 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Thu, 23 Nov 2023 21:34:22 +0100 Subject: [PATCH 0021/1519] x86/smp: Move the call to smp_processor_id() after the early exit in native_stop_other_cpus() Improve code generation in native_stop_other_cpus() a tiny bit: smp_processor_id() accesses a per-CPU variable, so the compiler is not able to move the call after the early exit on its own. Also rename the "cpu" variable to a more descriptive "this_cpu", and use 'cpu' as a separate iterator variable later in the function. No functional change intended. Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20231123203605.3474745-1-ubizjak@gmail.com --- arch/x86/kernel/smp.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 6eb06d001bcc6..65dd44e3fc1cf 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -148,14 +148,15 @@ static int register_stop_handler(void) static void native_stop_other_cpus(int wait) { - unsigned int cpu = smp_processor_id(); + unsigned int this_cpu; unsigned long flags, timeout; if (reboot_force) return; /* Only proceed if this is the first CPU to reach this code */ - if (atomic_cmpxchg(&stopping_cpu, -1, cpu) != -1) + this_cpu = smp_processor_id(); + if (atomic_cmpxchg(&stopping_cpu, -1, this_cpu) != -1) return; /* For kexec, ensure that offline CPUs are out of MWAIT and in HLT */ @@ -190,7 +191,7 @@ static void native_stop_other_cpus(int wait) * NMIs. */ cpumask_copy(&cpus_stop_mask, cpu_online_mask); - cpumask_clear_cpu(cpu, &cpus_stop_mask); + cpumask_clear_cpu(this_cpu, &cpus_stop_mask); if (!cpumask_empty(&cpus_stop_mask)) { apic_send_IPI_allbutself(REBOOT_VECTOR); @@ -234,6 +235,8 @@ static void native_stop_other_cpus(int wait) * CPUs to stop. */ if (!smp_no_nmi_ipi && !register_stop_handler()) { + unsigned int cpu; + pr_emerg("Shutting down cpus with NMI\n"); for_each_cpu(cpu, &cpus_stop_mask) -- GitLab From 9e9d673b2c84719937db5d6ab1d8cbcd7d45e974 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Thu, 23 Nov 2023 21:34:23 +0100 Subject: [PATCH 0022/1519] x86/smp: Use atomic_try_cmpxchg in native_stop_other_cpus() Use atomic_try_cmpxchg() instead of atomic_cmpxchg(*ptr, old, new) == old. X86 CMPXCHG instruction returns success in ZF flag, so this change saves a compare after the CMPXCHG. 
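Schematically, the conversion is (a minimal sketch of the change made by
the diff below):

	/* before: compare the returned old value */
	if (atomic_cmpxchg(&stopping_cpu, -1, this_cpu) != -1)
		return;

	/* after: failure comes straight from CMPXCHG's ZF */
	old_cpu = -1;
	if (!atomic_try_cmpxchg(&stopping_cpu, &old_cpu, this_cpu))
		return;

atomic_try_cmpxchg() also writes the observed value back through its
second argument on failure, which is unused here but is what makes the
helper convenient in retry loops.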
Tested by building a native Fedora-38 kernel and rebooting a 12-way SMP system using "shutdown -r" command some 100 times. No functional change intended. Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20231123203605.3474745-2-ubizjak@gmail.com --- arch/x86/kernel/smp.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 65dd44e3fc1cf..1bb79526c2179 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -148,15 +148,16 @@ static int register_stop_handler(void) static void native_stop_other_cpus(int wait) { - unsigned int this_cpu; + unsigned int old_cpu, this_cpu; unsigned long flags, timeout; if (reboot_force) return; /* Only proceed if this is the first CPU to reach this code */ + old_cpu = -1; this_cpu = smp_processor_id(); - if (atomic_cmpxchg(&stopping_cpu, -1, this_cpu) != -1) + if (!atomic_try_cmpxchg(&stopping_cpu, &old_cpu, this_cpu)) return; /* For kexec, ensure that offline CPUs are out of MWAIT and in HLT */ -- GitLab From fc50065325f8b88d6986f089ae103b5db858ab96 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Fri, 1 Dec 2023 09:57:27 +0100 Subject: [PATCH 0023/1519] x86/callthunks: Correct calculation of dest address in is_callthunk() GCC didn't warn on the invalid use of relocation destination pointer, so the calculated destination value was applied to the uninitialized pointer location in error. Fixes: 17bce3b2ae2d ("x86/callthunks: Handle %rip-relative relocations in call thunk template") Reported-by: Nathan Chancellor Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Closes: https://lore.kernel.org/lkml/20231201035457.GA321497@dev-arch.thelio-3990X/ Link: https://lore.kernel.org/r/20231201085727.3647051-1-ubizjak@gmail.com --- arch/x86/kernel/callthunks.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c index f56fa303d6431..2324c7f9a8413 100644 --- a/arch/x86/kernel/callthunks.c +++ b/arch/x86/kernel/callthunks.c @@ -312,7 +312,7 @@ static bool is_callthunk(void *addr) if (!thunks_initialized || skip_addr((void *)dest)) return false; - *pad = dest - tmpl_size; + pad = (void *)(dest - tmpl_size); memcpy(insn_buff, skl_call_thunk_template, tmpl_size); apply_relocation(insn_buff, tmpl_size, pad, -- GitLab From 0e3703630bd3fead041a6bfb3a3f2a9891af6c34 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 4 Dec 2023 22:02:29 +0100 Subject: [PATCH 0024/1519] x86/percpu: Fix "const_pcpu_hot" version generation failure Version generation for "const_pcpu_hot" symbol failed because genksyms doesn't know the __seg_gs keyword. Effectively revert commit 4604c052b84d ("x86/percpu: Declare const_pcpu_hot as extern const variable") and use this_cpu_read_const() instead to avoid "sparse: dereference of noderef expression" warning when reading const_pcpu_hot. 
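The resulting accessor usage, matching the get_current() hunk below:

	if (IS_ENABLED(CONFIG_USE_X86_SEG_SUPPORT))
		return this_cpu_read_const(const_pcpu_hot.current_task);

	return this_cpu_read_stable(pcpu_hot.current_task);

On configurations without named-address-space support,
this_cpu_read_const() deliberately expands to a BUILD_BUG(), since the
generic per-cpu accessors cannot read const-qualified variables.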
Signed-off-by: Uros Bizjak
Signed-off-by: Ingo Molnar
Cc: Linus Torvalds
Link: https://lore.kernel.org/r/20231204210320.114429-1-ubizjak@gmail.com
---
 arch/x86/include/asm/current.h   | 5 +++--
 arch/x86/include/asm/percpu.h    | 7 +++++++
 arch/x86/include/asm/processor.h | 2 +-
 3 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/current.h b/arch/x86/include/asm/current.h
index 9fbd7cb2dc866..c8c5674d69f64 100644
--- a/arch/x86/include/asm/current.h
+++ b/arch/x86/include/asm/current.h
@@ -37,12 +37,13 @@ static_assert(sizeof(struct pcpu_hot) == 64);
 DECLARE_PER_CPU_ALIGNED(struct pcpu_hot, pcpu_hot);
 
 /* const-qualified alias to pcpu_hot, aliased by linker. */
-extern const struct pcpu_hot __percpu_seg_override const_pcpu_hot;
+DECLARE_PER_CPU_ALIGNED(const struct pcpu_hot __percpu_seg_override,
+			const_pcpu_hot);
 
 static __always_inline struct task_struct *get_current(void)
 {
 	if (IS_ENABLED(CONFIG_USE_X86_SEG_SUPPORT))
-		return const_pcpu_hot.current_task;
+		return this_cpu_read_const(const_pcpu_hot.current_task);
 
 	return this_cpu_read_stable(pcpu_hot.current_task);
 }
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 0f12b2004b942..3859abad19ec9 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -28,6 +28,7 @@
 
 #else /* ...!ASSEMBLY */
 
+#include
 #include
 #include
 
@@ -462,6 +463,7 @@ do { \
 #define this_cpu_write_8(pcp, val)	__raw_cpu_write(volatile, pcp, val)
 #endif
 
+#define this_cpu_read_const(pcp)	__raw_cpu_read(, pcp)
 #else /* CONFIG_USE_X86_SEG_SUPPORT */
 
 #define raw_cpu_read_1(pcp)		percpu_from_op(1, , "mov", pcp)
@@ -486,6 +488,11 @@ do { \
 #define this_cpu_write_8(pcp, val)	percpu_to_op(8, volatile, "mov", (pcp), val)
 #endif
 
+/*
+ * The generic per-cpu infrastructure is not suitable for
+ * reading const-qualified variables.
+ */
+#define this_cpu_read_const(pcp)	({ BUILD_BUG(); (typeof(pcp))0; })
 #endif /* CONFIG_USE_X86_SEG_SUPPORT */
 
 #define raw_cpu_add_1(pcp, val)		percpu_add_op(1, , (pcp), val)
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index f20e876326690..a94a857152c44 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -519,7 +519,7 @@ static __always_inline unsigned long current_top_of_stack(void)
 	 * entry trampoline.
 	 */
 	if (IS_ENABLED(CONFIG_USE_X86_SEG_SUPPORT))
-		return const_pcpu_hot.top_of_stack;
+		return this_cpu_read_const(const_pcpu_hot.top_of_stack);
 
 	return this_cpu_read_stable(pcpu_hot.top_of_stack);
 }
-- 
GitLab


From 13408c6ae684181d53c870cceddbd3a62ae34c3e Mon Sep 17 00:00:00 2001
From: Uros Bizjak
Date: Mon, 4 Dec 2023 22:02:30 +0100
Subject: [PATCH 0025/1519] x86/traps: Use current_top_of_stack() helper in traps.c

Use the current_top_of_stack() helper in sync_regs() and
vc_switch_off_ist() instead of open-coding the read of the top_of_stack
percpu variable.
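Schematically (matching the traps.c hunks below):

	/* before: open-coded per-CPU read */
	sp = this_cpu_read(pcpu_hot.top_of_stack);

	/* after: the helper picks the best access for the build */
	sp = current_top_of_stack();

Besides being shorter, this lets callers benefit from the
this_cpu_read_const() path introduced in the previous patch on
named-address-space builds.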
Signed-off-by: Uros Bizjak
Signed-off-by: Ingo Molnar
Cc: Linus Torvalds
Link: https://lore.kernel.org/r/20231204210320.114429-2-ubizjak@gmail.com
---
 arch/x86/kernel/traps.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index c876f1d36a81a..78b1d1a6ed2cc 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -772,7 +772,7 @@ DEFINE_IDTENTRY_RAW(exc_int3)
  */
 asmlinkage __visible noinstr struct pt_regs *sync_regs(struct pt_regs *eregs)
 {
-	struct pt_regs *regs = (struct pt_regs *)this_cpu_read(pcpu_hot.top_of_stack) - 1;
+	struct pt_regs *regs = (struct pt_regs *)current_top_of_stack() - 1;
 	if (regs != eregs)
 		*regs = *eregs;
 	return regs;
@@ -790,7 +790,7 @@ asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *r
 	 * trust it and switch to the current kernel stack
 	 */
 	if (ip_within_syscall_gap(regs)) {
-		sp = this_cpu_read(pcpu_hot.top_of_stack);
+		sp = current_top_of_stack();
 		goto sync;
 	}
-- 
GitLab


From 3a1d3829e193c091475ceab481c5f8deab385023 Mon Sep 17 00:00:00 2001
From: Uros Bizjak
Date: Mon, 4 Dec 2023 22:02:31 +0100
Subject: [PATCH 0026/1519] x86/percpu: Avoid sparse warning with cast to named address space

Teach sparse about the __seg_fs and __seg_gs named address space
qualifiers to avoid warnings about an unexpected keyword at the end of a
cast operator.

Reported-by: kernel test robot
Acked-by: Luc Van Oostenryck
Signed-off-by: Uros Bizjak
Signed-off-by: Ingo Molnar
Link: https://lore.kernel.org/r/20231204210320.114429-3-ubizjak@gmail.com
Closes: https://lore.kernel.org/oe-kbuild-all/202310080853.UhMe5iWa-lkp@intel.com/
---
 arch/x86/include/asm/percpu.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 3859abad19ec9..e56a37886143a 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -36,6 +36,11 @@
 
 #ifdef CONFIG_CC_HAS_NAMED_AS
 
+#ifdef __CHECKER__
+#define __seg_gs		__attribute__((address_space(__seg_gs)))
+#define __seg_fs		__attribute__((address_space(__seg_fs)))
+#endif
+
 #ifdef CONFIG_X86_64
 #define __percpu_seg_override	__seg_gs
 #else
-- 
GitLab


From 86ed430cf5296ca97a66f1f37e30b7dfe47cd36f Mon Sep 17 00:00:00 2001
From: Arnd Bergmann
Date: Mon, 4 Dec 2023 08:28:41 +0100
Subject: [PATCH 0027/1519] x86/alternatives: Move apply_relocation() out of init section

This function is now called from a few places that are not
__init_or_module, resulting in a link-time warning:

WARNING: modpost: vmlinux: section mismatch in reference: patch_dest+0x8a (section: .text) -> apply_relocation (section: .init.text)

Remove the annotation here.

[ mingo: Also sync up add_nop() with these changes.
] Fixes: 17bce3b2ae2d ("x86/callthunks: Handle %rip-relative relocations in call thunk template") Signed-off-by: Arnd Bergmann Signed-off-by: Ingo Molnar Cc: Uros Bizjak Link: https://lore.kernel.org/r/20231204072856.1033621-1-arnd@kernel.org --- arch/x86/kernel/alternative.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 50523713989b2..1781e020f393f 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -44,7 +44,7 @@ EXPORT_SYMBOL_GPL(alternatives_patched); #define DA_ENDBR 0x08 #define DA_SMP 0x10 -static unsigned int __initdata_or_module debug_alternative; +static unsigned int debug_alternative; static int __init debug_alt(char *str) { @@ -132,7 +132,7 @@ const unsigned char * const x86_nops[ASM_NOP_MAX+1] = * each single-byte NOPs). If @len to fill out is > ASM_NOP_MAX, pad with INT3 and * *jump* over instead of executing long and daft NOPs. */ -static void __init_or_module add_nop(u8 *instr, unsigned int len) +static void add_nop(u8 *instr, unsigned int len) { u8 *target = instr + len; @@ -206,7 +206,7 @@ static int skip_nops(u8 *instr, int offset, int len) * Optimize a sequence of NOPs, possibly preceded by an unconditional jump * to the end of the NOP sequence into a single NOP. */ -static bool __init_or_module +static bool __optimize_nops(u8 *instr, size_t len, struct insn *insn, int *next, int *prev, int *target) { int i = *next - insn->length; @@ -325,7 +325,7 @@ bool need_reloc(unsigned long offset, u8 *src, size_t src_len) return (target < src || target > src + src_len); } -void __init_or_module apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len) +void apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len) { int prev, target = 0; -- GitLab From de8c6a352131f642b82474abe0cbb5dd26a7e081 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Wed, 13 Dec 2023 16:03:15 +0100 Subject: [PATCH 0028/1519] x86/percpu: Use %RIP-relative address in untagged_addr() %RIP-relative addresses are nowadays correctly handled in alternative instructions, so remove misleading comment and improve assembly to use %RIP-relative address. Also, explicitly using %gs: prefix will segfault for non-SMP builds. Use macros from percpu.h which will DTRT with segment prefix register as far as SMP/non-SMP builds are concerned. Signed-off-by: Uros Bizjak Signed-off-by: Dave Hansen Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra (Intel) Acked-by: Kirill A. Shutemov Cc: Linus Torvalds Link: https://lore.kernel.org/all/20231213150357.5942-1-ubizjak%40gmail.com --- arch/x86/include/asm/uaccess_64.h | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index f2c02e4469ccc..01455c0b070cc 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -11,6 +11,7 @@ #include #include #include +#include #ifdef CONFIG_ADDRESS_MASKING /* @@ -18,14 +19,10 @@ */ static inline unsigned long __untagged_addr(unsigned long addr) { - /* - * Refer tlbstate_untag_mask directly to avoid RIP-relative relocation - * in alternative instructions. The relocation gets wrong when gets - * copied to the target place. 
- */ asm (ALTERNATIVE("", - "and %%gs:tlbstate_untag_mask, %[addr]\n\t", X86_FEATURE_LAM) - : [addr] "+r" (addr) : "m" (tlbstate_untag_mask)); + "and " __percpu_arg([mask]) ", %[addr]", X86_FEATURE_LAM) + : [addr] "+r" (addr) + : [mask] "m" (__my_cpu_var(tlbstate_untag_mask))); return addr; } -- GitLab From be83e809ca67bca98fde97ad6b9344237963220b Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 21 Nov 2023 08:07:28 -0800 Subject: [PATCH 0029/1519] x86/bugs: Rename CONFIG_GDS_FORCE_MITIGATION => CONFIG_MITIGATION_GDS_FORCE So the CPU mitigations Kconfig entries - there's 10 meanwhile - are named in a historically idiosyncratic and hence rather inconsistent fashion and have become hard to relate with each other over the years: https://lore.kernel.org/lkml/20231011044252.42bplzjsam3qsasz@treble/ When they were introduced we never expected that we'd eventually have about a dozen of them, and that more organization would be useful, especially for Linux distributions that want to enable them in an informed fashion, and want to make sure all mitigations are configured as expected. For example, the current CONFIG_SPECULATION_MITIGATIONS namespace is only halfway populated, where some mitigations have entries in Kconfig, and they could be modified, while others mitigations do not have Kconfig entries, and can not be controlled at build time. Fine-grained control over these Kconfig entries can help in a number of ways: 1) Users can choose and pick only mitigations that are important for their workloads. 2) Users and developers can choose to disable mitigations that mangle the assembly code generation, making it hard to read. 3) Separate Kconfigs for just source code readability, so that we see *which* butt-ugly piece of crap code is for what reason... In most cases, if a mitigation is disabled at compilation time, it can still be enabled at runtime using kernel command line arguments. This is the first patch of an initial series that renames various mitigation related Kconfig options, unifying them under a single CONFIG_MITIGATION_* namespace: CONFIG_GDS_FORCE_MITIGATION => CONFIG_MITIGATION_GDS_FORCE CONFIG_CPU_IBPB_ENTRY => CONFIG_MITIGATION_IBPB_ENTRY CONFIG_CALL_DEPTH_TRACKING => CONFIG_MITIGATION_CALL_DEPTH_TRACKING CONFIG_PAGE_TABLE_ISOLATION => CONFIG_MITIGATION_PAGE_TABLE_ISOLATION CONFIG_RETPOLINE => CONFIG_MITIGATION_RETPOLINE CONFIG_SLS => CONFIG_MITIGATION_SLS CONFIG_CPU_UNRET_ENTRY => CONFIG_MITIGATION_UNRET_ENTRY CONFIG_CPU_IBRS_ENTRY => CONFIG_MITIGATION_IBRS_ENTRY CONFIG_CPU_SRSO => CONFIG_MITIGATION_SRSO CONFIG_RETHUNK => CONFIG_MITIGATION_RETHUNK Implement step 1/10 of the namespace unification of CPU mitigations related Kconfig options and rename CONFIG_GDS_FORCE_MITIGATION to CONFIG_MITIGATION_GDS_FORCE. [ mingo: Rewrote changelog for clarity. ] Suggested-by: Josh Poimboeuf Signed-off-by: Breno Leitao Signed-off-by: Ingo Molnar Acked-by: Josh Poimboeuf Cc: Linus Torvalds Link: https://lore.kernel.org/r/20231121160740.1249350-2-leitao@debian.org --- arch/x86/Kconfig | 2 +- arch/x86/kernel/cpu/bugs.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 53f2e7797b1df..1dba33ad06b30 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2587,7 +2587,7 @@ config SLS against straight line speculation. The kernel image might be slightly larger. 
-config GDS_FORCE_MITIGATION +config MITIGATION_GDS_FORCE bool "Force GDS Mitigation" depends on CPU_SUP_INTEL default n diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index bb0ab8466b919..3c7e27b58f0ed 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -671,7 +671,7 @@ enum gds_mitigations { GDS_MITIGATION_HYPERVISOR, }; -#if IS_ENABLED(CONFIG_GDS_FORCE_MITIGATION) +#if IS_ENABLED(CONFIG_MITIGATION_GDS_FORCE) static enum gds_mitigations gds_mitigation __ro_after_init = GDS_MITIGATION_FORCE; #else static enum gds_mitigations gds_mitigation __ro_after_init = GDS_MITIGATION_FULL; -- GitLab From e0b8fcfa3cfac171d589ad6085a00c584d571f08 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 21 Nov 2023 08:07:29 -0800 Subject: [PATCH 0030/1519] x86/bugs: Rename CONFIG_CPU_IBPB_ENTRY => CONFIG_MITIGATION_IBPB_ENTRY Step 2/10 of the namespace unification of CPU mitigations related Kconfig options. Suggested-by: Josh Poimboeuf Signed-off-by: Breno Leitao Signed-off-by: Ingo Molnar Acked-by: Josh Poimboeuf Cc: Linus Torvalds Link: https://lore.kernel.org/r/20231121160740.1249350-3-leitao@debian.org --- arch/x86/Kconfig | 2 +- arch/x86/include/asm/nospec-branch.h | 2 +- arch/x86/kernel/cpu/bugs.c | 11 ++++++----- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 1dba33ad06b30..c4e04b6037594 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2554,7 +2554,7 @@ config CALL_THUNKS_DEBUG Only enable this when you are debugging call thunks as this creates a noticeable runtime overhead. If unsure say N. -config CPU_IBPB_ENTRY +config MITIGATION_IBPB_ENTRY bool "Enable IBPB on kernel entry" depends on CPU_SUP_AMD && X86_64 default y diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 262e65539f83c..7341fd945698a 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -289,7 +289,7 @@ * where we have a stack but before any RET instruction. 
*/ .macro __UNTRAIN_RET ibpb_feature, call_depth_insns -#if defined(CONFIG_RETHUNK) || defined(CONFIG_CPU_IBPB_ENTRY) +#if defined(CONFIG_RETHUNK) || defined(CONFIG_MITIGATION_IBPB_ENTRY) VALIDATE_UNRET_END ALTERNATIVE_3 "", \ CALL_UNTRAIN_RET, X86_FEATURE_UNRET, \ diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 3c7e27b58f0ed..1de4791091ca8 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -994,10 +994,10 @@ static void __init retbleed_select_mitigation(void) if (!boot_cpu_has(X86_FEATURE_IBPB)) { pr_err("WARNING: CPU does not support IBPB.\n"); goto do_cmd_auto; - } else if (IS_ENABLED(CONFIG_CPU_IBPB_ENTRY)) { + } else if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) { retbleed_mitigation = RETBLEED_MITIGATION_IBPB; } else { - pr_err("WARNING: kernel not compiled with CPU_IBPB_ENTRY.\n"); + pr_err("WARNING: kernel not compiled with MITIGATION_IBPB_ENTRY.\n"); goto do_cmd_auto; } break; @@ -1023,7 +1023,8 @@ static void __init retbleed_select_mitigation(void) boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) { if (IS_ENABLED(CONFIG_CPU_UNRET_ENTRY)) retbleed_mitigation = RETBLEED_MITIGATION_UNRET; - else if (IS_ENABLED(CONFIG_CPU_IBPB_ENTRY) && boot_cpu_has(X86_FEATURE_IBPB)) + else if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY) && + boot_cpu_has(X86_FEATURE_IBPB)) retbleed_mitigation = RETBLEED_MITIGATION_IBPB; } @@ -2482,13 +2483,13 @@ static void __init srso_select_mitigation(void) break; case SRSO_CMD_IBPB: - if (IS_ENABLED(CONFIG_CPU_IBPB_ENTRY)) { + if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) { if (has_microcode) { setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB); srso_mitigation = SRSO_MITIGATION_IBPB; } } else { - pr_err("WARNING: kernel not compiled with CPU_IBPB_ENTRY.\n"); + pr_err("WARNING: kernel not compiled with MITIGATION_IBPB_ENTRY.\n"); } break; -- GitLab From 5fa31af31e726c7f5a8f84800153054ca499338a Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 21 Nov 2023 08:07:30 -0800 Subject: [PATCH 0031/1519] x86/bugs: Rename CONFIG_CALL_DEPTH_TRACKING => CONFIG_MITIGATION_CALL_DEPTH_TRACKING Step 3/10 of the namespace unification of CPU mitigations related Kconfig options. Suggested-by: Josh Poimboeuf Signed-off-by: Breno Leitao Signed-off-by: Ingo Molnar Acked-by: Josh Poimboeuf Cc: Linus Torvalds Link: https://lore.kernel.org/r/20231121160740.1249350-4-leitao@debian.org --- arch/x86/Kconfig | 4 ++-- arch/x86/include/asm/current.h | 2 +- arch/x86/include/asm/disabled-features.h | 2 +- arch/x86/include/asm/nospec-branch.h | 10 +++++----- arch/x86/kernel/asm-offsets.c | 2 +- arch/x86/kernel/cpu/bugs.c | 6 +++--- arch/x86/lib/retpoline.S | 6 +++--- scripts/Makefile.lib | 2 +- tools/arch/x86/include/asm/disabled-features.h | 2 +- 9 files changed, 18 insertions(+), 18 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index c4e04b6037594..b4a703d45700b 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2523,7 +2523,7 @@ config CPU_UNRET_ENTRY help Compile the kernel with support for the retbleed=unret mitigation. 
-config CALL_DEPTH_TRACKING +config MITIGATION_CALL_DEPTH_TRACKING bool "Mitigate RSB underflow with call depth tracking" depends on CPU_SUP_INTEL && HAVE_CALL_THUNKS select HAVE_DYNAMIC_FTRACE_NO_PATCHABLE @@ -2543,7 +2543,7 @@ config CALL_DEPTH_TRACKING config CALL_THUNKS_DEBUG bool "Enable call thunks and call depth tracking debugging" - depends on CALL_DEPTH_TRACKING + depends on MITIGATION_CALL_DEPTH_TRACKING select FUNCTION_ALIGNMENT_32B default n help diff --git a/arch/x86/include/asm/current.h b/arch/x86/include/asm/current.h index a1168e7b69e5b..d4ff517cfbd1f 100644 --- a/arch/x86/include/asm/current.h +++ b/arch/x86/include/asm/current.h @@ -17,7 +17,7 @@ struct pcpu_hot { struct task_struct *current_task; int preempt_count; int cpu_number; -#ifdef CONFIG_CALL_DEPTH_TRACKING +#ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING u64 call_depth; #endif unsigned long top_of_stack; diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index 702d93fdd10e8..c1d3a57956180 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -69,7 +69,7 @@ # define DISABLE_UNRET (1 << (X86_FEATURE_UNRET & 31)) #endif -#ifdef CONFIG_CALL_DEPTH_TRACKING +#ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING # define DISABLE_CALL_DEPTH_TRACKING 0 #else # define DISABLE_CALL_DEPTH_TRACKING (1 << (X86_FEATURE_CALL_DEPTH & 31)) diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 7341fd945698a..59810237eb421 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -73,7 +73,7 @@ # define CALL_THUNKS_DEBUG_INC_CTXSW #endif -#if defined(CONFIG_CALL_DEPTH_TRACKING) && !defined(COMPILE_OFFSETS) +#if defined(CONFIG_MITIGATION_CALL_DEPTH_TRACKING) && !defined(COMPILE_OFFSETS) #include @@ -309,7 +309,7 @@ .macro CALL_DEPTH_ACCOUNT -#ifdef CONFIG_CALL_DEPTH_TRACKING +#ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING ALTERNATIVE "", \ __stringify(ASM_INCREMENT_CALL_DEPTH), X86_FEATURE_CALL_DEPTH #endif @@ -357,7 +357,7 @@ extern void entry_ibpb(void); extern void (*x86_return_thunk)(void); -#ifdef CONFIG_CALL_DEPTH_TRACKING +#ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING extern void call_depth_return_thunk(void); #define CALL_DEPTH_ACCOUNT \ @@ -371,12 +371,12 @@ DECLARE_PER_CPU(u64, __x86_ret_count); DECLARE_PER_CPU(u64, __x86_stuffs_count); DECLARE_PER_CPU(u64, __x86_ctxsw_count); #endif -#else /* !CONFIG_CALL_DEPTH_TRACKING */ +#else /* !CONFIG_MITIGATION_CALL_DEPTH_TRACKING */ static inline void call_depth_return_thunk(void) {} #define CALL_DEPTH_ACCOUNT "" -#endif /* CONFIG_CALL_DEPTH_TRACKING */ +#endif /* CONFIG_MITIGATION_CALL_DEPTH_TRACKING */ #ifdef CONFIG_RETPOLINE diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 6913b372ccf76..a98020bf31bb4 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -109,7 +109,7 @@ static void __used common(void) OFFSET(TSS_sp2, tss_struct, x86_tss.sp2); OFFSET(X86_top_of_stack, pcpu_hot, top_of_stack); OFFSET(X86_current_task, pcpu_hot, current_task); -#ifdef CONFIG_CALL_DEPTH_TRACKING +#ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING OFFSET(X86_call_depth, pcpu_hot, call_depth); #endif #if IS_ENABLED(CONFIG_CRYPTO_ARIA_AESNI_AVX_X86_64) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 1de4791091ca8..b906ed4f30916 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1003,15 +1003,15 @@ static void __init 
retbleed_select_mitigation(void) break; case RETBLEED_CMD_STUFF: - if (IS_ENABLED(CONFIG_CALL_DEPTH_TRACKING) && + if (IS_ENABLED(CONFIG_MITIGATION_CALL_DEPTH_TRACKING) && spectre_v2_enabled == SPECTRE_V2_RETPOLINE) { retbleed_mitigation = RETBLEED_MITIGATION_STUFF; } else { - if (IS_ENABLED(CONFIG_CALL_DEPTH_TRACKING)) + if (IS_ENABLED(CONFIG_MITIGATION_CALL_DEPTH_TRACKING)) pr_err("WARNING: retbleed=stuff depends on spectre_v2=retpoline\n"); else - pr_err("WARNING: kernel not compiled with CALL_DEPTH_TRACKING.\n"); + pr_err("WARNING: kernel not compiled with MITIGATION_CALL_DEPTH_TRACKING.\n"); goto do_cmd_auto; } diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index 7b2589877d065..ff46f48a0cc4d 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -71,7 +71,7 @@ SYM_CODE_END(__x86_indirect_thunk_array) #include #undef GEN -#ifdef CONFIG_CALL_DEPTH_TRACKING +#ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING .macro CALL_THUNK reg .align RETPOLINE_THUNK_SIZE @@ -327,7 +327,7 @@ __EXPORT_THUNK(entry_untrain_ret) #endif /* CONFIG_CPU_UNRET_ENTRY || CONFIG_CPU_SRSO */ -#ifdef CONFIG_CALL_DEPTH_TRACKING +#ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING .align 64 SYM_FUNC_START(call_depth_return_thunk) @@ -359,7 +359,7 @@ SYM_FUNC_START(call_depth_return_thunk) int3 SYM_FUNC_END(call_depth_return_thunk) -#endif /* CONFIG_CALL_DEPTH_TRACKING */ +#endif /* CONFIG_MITIGATION_CALL_DEPTH_TRACKING */ /* * This function name is magical and is used by -mfunction-return=thunk-extern diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index 1a965fe68e011..ee233ef916968 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -254,7 +254,7 @@ objtool := $(objtree)/tools/objtool/objtool objtool-args-$(CONFIG_HAVE_JUMP_LABEL_HACK) += --hacks=jump_label objtool-args-$(CONFIG_HAVE_NOINSTR_HACK) += --hacks=noinstr -objtool-args-$(CONFIG_CALL_DEPTH_TRACKING) += --hacks=skylake +objtool-args-$(CONFIG_MITIGATION_CALL_DEPTH_TRACKING) += --hacks=skylake objtool-args-$(CONFIG_X86_KERNEL_IBT) += --ibt objtool-args-$(CONFIG_FINEIBT) += --cfi objtool-args-$(CONFIG_FTRACE_MCOUNT_USE_OBJTOOL) += --mcount diff --git a/tools/arch/x86/include/asm/disabled-features.h b/tools/arch/x86/include/asm/disabled-features.h index 702d93fdd10e8..c1d3a57956180 100644 --- a/tools/arch/x86/include/asm/disabled-features.h +++ b/tools/arch/x86/include/asm/disabled-features.h @@ -69,7 +69,7 @@ # define DISABLE_UNRET (1 << (X86_FEATURE_UNRET & 31)) #endif -#ifdef CONFIG_CALL_DEPTH_TRACKING +#ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING # define DISABLE_CALL_DEPTH_TRACKING 0 #else # define DISABLE_CALL_DEPTH_TRACKING (1 << (X86_FEATURE_CALL_DEPTH & 31)) -- GitLab From ea4654e0885348f0faa47f6d7b44a08d75ad16e9 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 21 Nov 2023 08:07:31 -0800 Subject: [PATCH 0032/1519] x86/bugs: Rename CONFIG_PAGE_TABLE_ISOLATION => CONFIG_MITIGATION_PAGE_TABLE_ISOLATION Step 4/10 of the namespace unification of CPU mitigations related Kconfig options. [ mingo: Converted new uses that got added since the series was posted. 
] Suggested-by: Josh Poimboeuf Signed-off-by: Breno Leitao Signed-off-by: Ingo Molnar Acked-by: Josh Poimboeuf Cc: Linus Torvalds Link: https://lore.kernel.org/r/20231121160740.1249350-5-leitao@debian.org --- Documentation/arch/x86/pti.rst | 6 +++--- arch/x86/Kconfig | 2 +- arch/x86/boot/compressed/ident_map_64.c | 4 ++-- arch/x86/entry/calling.h | 8 ++++---- arch/x86/entry/entry_64.S | 6 +++--- arch/x86/include/asm/disabled-features.h | 2 +- arch/x86/include/asm/pgalloc.h | 2 +- arch/x86/include/asm/pgtable-3level.h | 2 +- arch/x86/include/asm/pgtable.h | 18 +++++++++--------- arch/x86/include/asm/pgtable_64.h | 3 ++- arch/x86/include/asm/processor-flags.h | 2 +- arch/x86/include/asm/pti.h | 2 +- arch/x86/kernel/dumpstack.c | 2 +- arch/x86/kernel/head_32.S | 4 ++-- arch/x86/kernel/head_64.S | 2 +- arch/x86/kernel/ldt.c | 8 ++++---- arch/x86/mm/Makefile | 2 +- arch/x86/mm/debug_pagetables.c | 4 ++-- arch/x86/mm/dump_pagetables.c | 4 ++-- arch/x86/mm/pgtable.c | 4 ++-- arch/x86/mm/tlb.c | 10 +++++----- include/linux/pti.h | 2 +- tools/arch/x86/include/asm/disabled-features.h | 2 +- 23 files changed, 51 insertions(+), 50 deletions(-) diff --git a/Documentation/arch/x86/pti.rst b/Documentation/arch/x86/pti.rst index e08d35177bc02..57e8392f61d35 100644 --- a/Documentation/arch/x86/pti.rst +++ b/Documentation/arch/x86/pti.rst @@ -26,9 +26,9 @@ comments in pti.c). This approach helps to ensure that side-channel attacks leveraging the paging structures do not function when PTI is enabled. It can be -enabled by setting CONFIG_PAGE_TABLE_ISOLATION=y at compile time. -Once enabled at compile-time, it can be disabled at boot with the -'nopti' or 'pti=' kernel parameters (see kernel-parameters.txt). +enabled by setting CONFIG_MITIGATION_PAGE_TABLE_ISOLATION=y at compile +time. Once enabled at compile-time, it can be disabled at boot with +the 'nopti' or 'pti=' kernel parameters (see kernel-parameters.txt). Page Table Management ===================== diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b4a703d45700b..a1c047004390f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2484,7 +2484,7 @@ menuconfig SPECULATION_MITIGATIONS if SPECULATION_MITIGATIONS -config PAGE_TABLE_ISOLATION +config MITIGATION_PAGE_TABLE_ISOLATION bool "Remove the kernel mapping in user mode" default y depends on (X86_64 || X86_PAE) diff --git a/arch/x86/boot/compressed/ident_map_64.c b/arch/x86/boot/compressed/ident_map_64.c index d040080d7edbd..ff09ca6dbb87e 100644 --- a/arch/x86/boot/compressed/ident_map_64.c +++ b/arch/x86/boot/compressed/ident_map_64.c @@ -8,8 +8,8 @@ * Copyright (C) 2016 Kees Cook */ -/* No PAGE_TABLE_ISOLATION support needed either: */ -#undef CONFIG_PAGE_TABLE_ISOLATION +/* No MITIGATION_PAGE_TABLE_ISOLATION support needed either: */ +#undef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION #include "error.h" #include "misc.h" diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index 9f1d94790a549..7ac2d6f946ed0 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -142,10 +142,10 @@ For 32-bit we have the following conventions - kernel is built with .endif .endm -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION /* - * PAGE_TABLE_ISOLATION PGDs are 8k. Flip bit 12 to switch between the two + * MITIGATION_PAGE_TABLE_ISOLATION PGDs are 8k. 
Flip bit 12 to switch between the two * halves: */ #define PTI_USER_PGTABLE_BIT PAGE_SHIFT @@ -160,7 +160,7 @@ For 32-bit we have the following conventions - kernel is built with .macro ADJUST_KERNEL_CR3 reg:req ALTERNATIVE "", "SET_NOFLUSH_BIT \reg", X86_FEATURE_PCID - /* Clear PCID and "PAGE_TABLE_ISOLATION bit", point CR3 at kernel pagetables: */ + /* Clear PCID and "MITIGATION_PAGE_TABLE_ISOLATION bit", point CR3 at kernel pagetables: */ andq $(~PTI_USER_PGTABLE_AND_PCID_MASK), \reg .endm @@ -275,7 +275,7 @@ For 32-bit we have the following conventions - kernel is built with .Lend_\@: .endm -#else /* CONFIG_PAGE_TABLE_ISOLATION=n: */ +#else /* CONFIG_MITIGATION_PAGE_TABLE_ISOLATION=n: */ .macro SWITCH_TO_KERNEL_CR3 scratch_reg:req .endm diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index c40f89ab1b4c7..d08cb3865c8ad 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -562,7 +562,7 @@ SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL) #ifdef CONFIG_XEN_PV ALTERNATIVE "", "jmp xenpv_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV #endif -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION ALTERNATIVE "", "jmp .Lpti_restore_regs_and_return_to_usermode", X86_FEATURE_PTI #endif @@ -578,7 +578,7 @@ SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL) jnz .Lnative_iret ud2 -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION .Lpti_restore_regs_and_return_to_usermode: POP_REGS pop_rdi=0 @@ -1096,7 +1096,7 @@ SYM_CODE_END(error_return) * * Registers: * %r14: Used to save/restore the CR3 of the interrupted context - * when PAGE_TABLE_ISOLATION is in use. Do not clobber. + * when MITIGATION_PAGE_TABLE_ISOLATION is in use. Do not clobber. */ SYM_CODE_START(asm_exc_nmi) UNWIND_HINT_IRET_ENTRY diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index c1d3a57956180..fb604ec95a5fa 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -44,7 +44,7 @@ # define DISABLE_LA57 (1<<(X86_FEATURE_LA57 & 31)) #endif -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION # define DISABLE_PTI 0 #else # define DISABLE_PTI (1 << (X86_FEATURE_PTI & 31)) diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h index c7ec5bb88334e..dcd836b59bebd 100644 --- a/arch/x86/include/asm/pgalloc.h +++ b/arch/x86/include/asm/pgalloc.h @@ -34,7 +34,7 @@ static inline void paravirt_release_p4d(unsigned long pfn) {} */ extern gfp_t __userpte_alloc_gfp; -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION /* * Instead of one PGD, we acquire two PGDs. Being order-1, it is * both 8k in size and 8k-aligned. 
That lets us just flip bit 12 diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index 9e7c0b719c3c1..dabafba957ea6 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -52,7 +52,7 @@ static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd) static inline void native_set_pud(pud_t *pudp, pud_t pud) { -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION pud.p4d.pgd = pti_set_user_pgtbl(&pudp->p4d.pgd, pud.p4d.pgd); #endif pxx_xchg64(pud, pudp, native_pud_val(pud)); diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 9d077bca6a103..df0f7d4a96f32 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -909,7 +909,7 @@ static inline int is_new_memtype_allowed(u64 paddr, unsigned long size, pmd_t *populate_extra_pmd(unsigned long vaddr); pte_t *populate_extra_pte(unsigned long vaddr); -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION pgd_t __pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd); /* @@ -923,12 +923,12 @@ static inline pgd_t pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd) return pgd; return __pti_set_user_pgtbl(pgdp, pgd); } -#else /* CONFIG_PAGE_TABLE_ISOLATION */ +#else /* CONFIG_MITIGATION_PAGE_TABLE_ISOLATION */ static inline pgd_t pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd) { return pgd; } -#endif /* CONFIG_PAGE_TABLE_ISOLATION */ +#endif /* CONFIG_MITIGATION_PAGE_TABLE_ISOLATION */ #endif /* __ASSEMBLY__ */ @@ -1131,7 +1131,7 @@ static inline int p4d_bad(p4d_t p4d) { unsigned long ignore_flags = _KERNPG_TABLE | _PAGE_USER; - if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) + if (IS_ENABLED(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION)) ignore_flags |= _PAGE_NX; return (p4d_flags(p4d) & ~ignore_flags) != 0; @@ -1177,7 +1177,7 @@ static inline int pgd_bad(pgd_t pgd) if (!pgtable_l5_enabled()) return 0; - if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) + if (IS_ENABLED(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION)) ignore_flags |= _PAGE_NX; return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE; @@ -1422,9 +1422,9 @@ static inline bool pgdp_maps_userspace(void *__ptr) #define pgd_leaf pgd_large static inline int pgd_large(pgd_t pgd) { return 0; } -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION /* - * All top-level PAGE_TABLE_ISOLATION page tables are order-1 pages + * All top-level MITIGATION_PAGE_TABLE_ISOLATION page tables are order-1 pages * (8k-aligned and 8k in size). The kernel one is at the beginning 4k and * the user one is in the last 4k. To switch between them, you * just need to flip the 12th bit in their addresses. 
@@ -1469,7 +1469,7 @@ static inline p4d_t *user_to_kernel_p4dp(p4d_t *p4dp) { return ptr_clear_bit(p4dp, PTI_PGTABLE_SWITCH_BIT); } -#endif /* CONFIG_PAGE_TABLE_ISOLATION */ +#endif /* CONFIG_MITIGATION_PAGE_TABLE_ISOLATION */ /* * clone_pgd_range(pgd_t *dst, pgd_t *src, int count); @@ -1484,7 +1484,7 @@ static inline p4d_t *user_to_kernel_p4dp(p4d_t *p4dp) static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) { memcpy(dst, src, count * sizeof(pgd_t)); -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION if (!static_cpu_has(X86_FEATURE_PTI)) return; /* Clone the user space pgd as well */ diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 24af25b1551a5..7e9db77231ac7 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -143,7 +143,8 @@ static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d) { pgd_t pgd; - if (pgtable_l5_enabled() || !IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) { + if (pgtable_l5_enabled() || + !IS_ENABLED(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION)) { WRITE_ONCE(*p4dp, p4d); return; } diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h index d8cccadc83a6f..e5f204b9b33df 100644 --- a/arch/x86/include/asm/processor-flags.h +++ b/arch/x86/include/asm/processor-flags.h @@ -51,7 +51,7 @@ #define CR3_NOFLUSH 0 #endif -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION # define X86_CR3_PTI_PCID_USER_BIT 11 #endif diff --git a/arch/x86/include/asm/pti.h b/arch/x86/include/asm/pti.h index 07375b476c4fd..ab167c96b9ab4 100644 --- a/arch/x86/include/asm/pti.h +++ b/arch/x86/include/asm/pti.h @@ -3,7 +3,7 @@ #define _ASM_X86_PTI_H #ifndef __ASSEMBLY__ -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION extern void pti_init(void); extern void pti_check_boottime_disable(void); extern void pti_finalize(void); diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index f18ca44c904b7..44a91ef5a23b8 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -410,7 +410,7 @@ static void __die_header(const char *str, struct pt_regs *regs, long err) IS_ENABLED(CONFIG_SMP) ? " SMP" : "", debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "", IS_ENABLED(CONFIG_KASAN) ? " KASAN" : "", - IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION) ? + IS_ENABLED(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION) ? (boot_cpu_has(X86_FEATURE_PTI) ? 
" PTI" : " NOPTI") : ""); } NOKPROBE_SYMBOL(__die_header); diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 487ac57e2c81f..b50f3641c4d6d 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -414,7 +414,7 @@ __REFDATA .align 4 SYM_DATA(initial_code, .long i386_start_kernel) -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION #define PGD_ALIGN (2 * PAGE_SIZE) #define PTI_USER_PGD_FILL 1024 #else @@ -474,7 +474,7 @@ SYM_DATA_START(initial_page_table) # endif .align PAGE_SIZE /* needs to be page-sized too */ -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION /* * PTI needs another page so sync_initial_pagetable() works correctly * and does not scribble over the data which is placed behind the diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index d4918d03efb4b..cd54313706a28 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -622,7 +622,7 @@ SYM_CODE_END(vc_no_ghcb) #define SYM_DATA_START_PAGE_ALIGNED(name) \ SYM_START(name, SYM_L_GLOBAL, .balign PAGE_SIZE) -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION /* * Each PGD needs to be 8k long and 8k aligned. We do not * ever go out to userspace with these, so we do not diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index 7a814b41402de..0f19ef355f5f1 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -184,7 +184,7 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries) return new_ldt; } -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION static void do_sanity_check(struct mm_struct *mm, bool had_kernel_mapping, @@ -377,7 +377,7 @@ static void unmap_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt) flush_tlb_mm_range(mm, va, va + nr_pages * PAGE_SIZE, PAGE_SHIFT, false); } -#else /* !CONFIG_PAGE_TABLE_ISOLATION */ +#else /* !CONFIG_MITIGATION_PAGE_TABLE_ISOLATION */ static int map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot) @@ -388,11 +388,11 @@ map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot) static void unmap_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt) { } -#endif /* CONFIG_PAGE_TABLE_ISOLATION */ +#endif /* CONFIG_MITIGATION_PAGE_TABLE_ISOLATION */ static void free_ldt_pgtables(struct mm_struct *mm) { -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION struct mmu_gather tlb; unsigned long start = LDT_BASE_ADDR; unsigned long end = LDT_END_ADDR; diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index c80febc44cd2f..031cd10ed17fc 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -60,7 +60,7 @@ obj-$(CONFIG_NUMA_EMU) += numa_emulation.o obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o -obj-$(CONFIG_PAGE_TABLE_ISOLATION) += pti.o +obj-$(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION) += pti.o obj-$(CONFIG_X86_MEM_ENCRYPT) += mem_encrypt.o obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_amd.o diff --git a/arch/x86/mm/debug_pagetables.c b/arch/x86/mm/debug_pagetables.c index b43301cb2a80c..ae5c213a1cb00 100644 --- a/arch/x86/mm/debug_pagetables.c +++ b/arch/x86/mm/debug_pagetables.c @@ -22,7 +22,7 @@ static int ptdump_curknl_show(struct seq_file *m, void *v) DEFINE_SHOW_ATTRIBUTE(ptdump_curknl); -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION static int ptdump_curusr_show(struct seq_file *m, void *v) { if 
(current->mm->pgd) @@ -54,7 +54,7 @@ static int __init pt_dump_debug_init(void) debugfs_create_file("current_kernel", 0400, dir, NULL, &ptdump_curknl_fops); -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION debugfs_create_file("current_user", 0400, dir, NULL, &ptdump_curusr_fops); #endif diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index e1b599ecbbc26..b7b88c1d91ec4 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -408,7 +408,7 @@ void ptdump_walk_pgd_level_debugfs(struct seq_file *m, struct mm_struct *mm, bool user) { pgd_t *pgd = mm->pgd; -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION if (user && boot_cpu_has(X86_FEATURE_PTI)) pgd = kernel_to_user_pgdp(pgd); #endif @@ -418,7 +418,7 @@ EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs); void ptdump_walk_user_pgd_level_checkwx(void) { -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION pgd_t *pgd = INIT_PGD; if (!(__supported_pte_mask & _PAGE_NX) || diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 0cbc1b8e8e3d1..cceb779d882d8 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -293,7 +293,7 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) for (i = 0; i < PREALLOCATED_PMDS; i++) mop_up_one_pmd(mm, &pgdp[i]); -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION if (!boot_cpu_has(X86_FEATURE_PTI)) return; @@ -325,7 +325,7 @@ static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[]) } } -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION static void pgd_prepopulate_user_pmd(struct mm_struct *mm, pgd_t *k_pgd, pmd_t *pmds[]) { diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 5768d386efab6..4af930947380c 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -89,10 +89,10 @@ #define CR3_HW_ASID_BITS 12 /* - * When enabled, PAGE_TABLE_ISOLATION consumes a single bit for + * When enabled, MITIGATION_PAGE_TABLE_ISOLATION consumes a single bit for * user/kernel switches */ -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION # define PTI_CONSUMED_PCID_BITS 1 #else # define PTI_CONSUMED_PCID_BITS 0 @@ -114,7 +114,7 @@ static inline u16 kern_pcid(u16 asid) { VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE); -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION /* * Make sure that the dynamic ASID space does not conflict with the * bit we are using to switch between user and kernel ASIDs. 
@@ -149,7 +149,7 @@ static inline u16 kern_pcid(u16 asid) static inline u16 user_pcid(u16 asid) { u16 ret = kern_pcid(asid); -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION ret |= 1 << X86_CR3_PTI_PCID_USER_BIT; #endif return ret; @@ -262,7 +262,7 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen, static inline void invalidate_user_asid(u16 asid) { /* There is no user ASID if address space separation is off */ - if (!IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) + if (!IS_ENABLED(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION)) return; /* diff --git a/include/linux/pti.h b/include/linux/pti.h index 1a941efcaa622..1fbf9d6c20efb 100644 --- a/include/linux/pti.h +++ b/include/linux/pti.h @@ -2,7 +2,7 @@ #ifndef _INCLUDE_PTI_H #define _INCLUDE_PTI_H -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION #include #else static inline void pti_init(void) { } diff --git a/tools/arch/x86/include/asm/disabled-features.h b/tools/arch/x86/include/asm/disabled-features.h index c1d3a57956180..fb604ec95a5fa 100644 --- a/tools/arch/x86/include/asm/disabled-features.h +++ b/tools/arch/x86/include/asm/disabled-features.h @@ -44,7 +44,7 @@ # define DISABLE_LA57 (1<<(X86_FEATURE_LA57 & 31)) #endif -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION # define DISABLE_PTI 0 #else # define DISABLE_PTI (1 << (X86_FEATURE_PTI & 31)) -- GitLab From aefb2f2e619b6c334bcb31de830aa00ba0b11129 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 21 Nov 2023 08:07:32 -0800 Subject: [PATCH 0033/1519] x86/bugs: Rename CONFIG_RETPOLINE => CONFIG_MITIGATION_RETPOLINE Step 5/10 of the namespace unification of CPU mitigations related Kconfig options. [ mingo: Converted a few more uses in comments/messages as well. 
] Suggested-by: Josh Poimboeuf Signed-off-by: Breno Leitao Signed-off-by: Ingo Molnar Reviewed-by: Ariel Miculas Acked-by: Josh Poimboeuf Cc: Linus Torvalds Link: https://lore.kernel.org/r/20231121160740.1249350-6-leitao@debian.org --- Documentation/admin-guide/hw-vuln/spectre.rst | 8 ++++---- Documentation/admin-guide/kernel-parameters.txt | 4 ++-- arch/x86/Kconfig | 6 +++--- arch/x86/Makefile | 4 ++-- arch/x86/entry/vdso/Makefile | 4 ++-- arch/x86/include/asm/disabled-features.h | 2 +- arch/x86/include/asm/linkage.h | 8 ++++---- arch/x86/include/asm/nospec-branch.h | 8 ++++---- arch/x86/kernel/alternative.c | 6 +++--- arch/x86/kernel/cpu/bugs.c | 6 +++--- arch/x86/kernel/ftrace.c | 2 +- arch/x86/kernel/kprobes/opt.c | 2 +- arch/x86/kernel/vmlinux.lds.S | 4 ++-- arch/x86/kvm/mmu/mmu.c | 2 +- arch/x86/kvm/mmu/mmu_internal.h | 2 +- arch/x86/kvm/svm/svm.c | 2 +- arch/x86/kvm/svm/vmenter.S | 4 ++-- arch/x86/kvm/vmx/vmx.c | 2 +- arch/x86/lib/Makefile | 2 +- arch/x86/net/bpf_jit_comp.c | 2 +- arch/x86/net/bpf_jit_comp32.c | 2 +- arch/x86/purgatory/Makefile | 2 +- include/linux/compiler-gcc.h | 2 +- include/linux/indirect_call_wrapper.h | 2 +- include/linux/module.h | 2 +- include/net/netfilter/nf_tables_core.h | 2 +- include/net/tc_wrapper.h | 2 +- kernel/trace/ring_buffer.c | 2 +- net/netfilter/Makefile | 2 +- net/netfilter/nf_tables_core.c | 6 +++--- net/netfilter/nft_ct.c | 4 ++-- net/netfilter/nft_lookup.c | 2 +- net/sched/sch_api.c | 2 +- scripts/Makefile.lib | 2 +- scripts/generate_rust_target.rs | 2 +- scripts/mod/modpost.c | 2 +- tools/arch/x86/include/asm/disabled-features.h | 2 +- tools/objtool/arch/x86/special.c | 2 +- tools/objtool/check.c | 2 +- 39 files changed, 62 insertions(+), 62 deletions(-) diff --git a/Documentation/admin-guide/hw-vuln/spectre.rst b/Documentation/admin-guide/hw-vuln/spectre.rst index 32a8893e56177..cce768afec6be 100644 --- a/Documentation/admin-guide/hw-vuln/spectre.rst +++ b/Documentation/admin-guide/hw-vuln/spectre.rst @@ -473,8 +473,8 @@ Spectre variant 2 -mindirect-branch=thunk-extern -mindirect-branch-register options. If the kernel is compiled with a Clang compiler, the compiler needs to support -mretpoline-external-thunk option. The kernel config - CONFIG_RETPOLINE needs to be turned on, and the CPU needs to run with - the latest updated microcode. + CONFIG_MITIGATION_RETPOLINE needs to be turned on, and the CPU needs + to run with the latest updated microcode. On Intel Skylake-era systems the mitigation covers most, but not all, cases. See :ref:`[3] ` for more details. @@ -609,8 +609,8 @@ kernel command line. Selecting 'on' will, and 'auto' may, choose a mitigation method at run time according to the CPU, the available microcode, the setting of the - CONFIG_RETPOLINE configuration option, and the - compiler with which the kernel was built. + CONFIG_MITIGATION_RETPOLINE configuration option, + and the compiler with which the kernel was built. Selecting 'on' will also enable the mitigation against user space to user space task attacks. diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index e0891ac76ab32..d93f403777f23 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -6007,8 +6007,8 @@ Selecting 'on' will, and 'auto' may, choose a mitigation method at run time according to the CPU, the available microcode, the setting of the - CONFIG_RETPOLINE configuration option, and the - compiler with which the kernel was built. 
+ CONFIG_MITIGATION_RETPOLINE configuration option, + and the compiler with which the kernel was built. Selecting 'on' will also enable the mitigation against user space to user space task attacks. diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index a1c047004390f..2a3ebd679b0e1 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2457,7 +2457,7 @@ config CALL_PADDING config FINEIBT def_bool y - depends on X86_KERNEL_IBT && CFI_CLANG && RETPOLINE + depends on X86_KERNEL_IBT && CFI_CLANG && MITIGATION_RETPOLINE select CALL_PADDING config HAVE_CALL_THUNKS @@ -2495,7 +2495,7 @@ config MITIGATION_PAGE_TABLE_ISOLATION See Documentation/arch/x86/pti.rst for more details. -config RETPOLINE +config MITIGATION_RETPOLINE bool "Avoid speculative indirect branches in kernel" select OBJTOOL if HAVE_OBJTOOL default y @@ -2507,7 +2507,7 @@ config RETPOLINE config RETHUNK bool "Enable return-thunks" - depends on RETPOLINE && CC_HAS_RETURN_THUNK + depends on MITIGATION_RETPOLINE && CC_HAS_RETURN_THUNK select OBJTOOL if HAVE_OBJTOOL default y if X86_64 help diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 1a068de12a564..b8d23ed059fb4 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -192,7 +192,7 @@ KBUILD_CFLAGS += -Wno-sign-compare KBUILD_CFLAGS += -fno-asynchronous-unwind-tables # Avoid indirect branches in kernel to deal with Spectre -ifdef CONFIG_RETPOLINE +ifdef CONFIG_MITIGATION_RETPOLINE KBUILD_CFLAGS += $(RETPOLINE_CFLAGS) # Additionally, avoid generating expensive indirect jumps which # are subject to retpolines for small number of switch cases. @@ -301,7 +301,7 @@ vdso-install-$(CONFIG_IA32_EMULATION) += arch/x86/entry/vdso/vdso32.so.dbg archprepare: checkbin checkbin: -ifdef CONFIG_RETPOLINE +ifdef CONFIG_MITIGATION_RETPOLINE ifeq ($(RETPOLINE_CFLAGS),) @echo "You are building kernel with non-retpoline compiler." >&2 @echo "Please update your compiler." 
>&2 diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index b1b8dd1608f7e..c4df99aa16158 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -87,7 +87,7 @@ CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \ -fno-omit-frame-pointer -foptimize-sibling-calls \ -DDISABLE_BRANCH_PROFILING -DBUILD_VDSO -ifdef CONFIG_RETPOLINE +ifdef CONFIG_MITIGATION_RETPOLINE ifneq ($(RETPOLINE_VDSO_CFLAGS),) CFL += $(RETPOLINE_VDSO_CFLAGS) endif @@ -164,7 +164,7 @@ KBUILD_CFLAGS_32 += $(call cc-option, -foptimize-sibling-calls) KBUILD_CFLAGS_32 += -fno-omit-frame-pointer KBUILD_CFLAGS_32 += -DDISABLE_BRANCH_PROFILING -ifdef CONFIG_RETPOLINE +ifdef CONFIG_MITIGATION_RETPOLINE ifneq ($(RETPOLINE_VDSO_CFLAGS),) KBUILD_CFLAGS_32 += $(RETPOLINE_VDSO_CFLAGS) endif diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index fb604ec95a5fa..24e4010c33b6a 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -50,7 +50,7 @@ # define DISABLE_PTI (1 << (X86_FEATURE_PTI & 31)) #endif -#ifdef CONFIG_RETPOLINE +#ifdef CONFIG_MITIGATION_RETPOLINE # define DISABLE_RETPOLINE 0 #else # define DISABLE_RETPOLINE ((1 << (X86_FEATURE_RETPOLINE & 31)) | \ diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h index 571fe4d2d2328..c5165204c66f0 100644 --- a/arch/x86/include/asm/linkage.h +++ b/arch/x86/include/asm/linkage.h @@ -42,25 +42,25 @@ #if defined(CONFIG_RETHUNK) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO) #define RET jmp __x86_return_thunk -#else /* CONFIG_RETPOLINE */ +#else /* CONFIG_MITIGATION_RETPOLINE */ #ifdef CONFIG_SLS #define RET ret; int3 #else #define RET ret #endif -#endif /* CONFIG_RETPOLINE */ +#endif /* CONFIG_MITIGATION_RETPOLINE */ #else /* __ASSEMBLY__ */ #if defined(CONFIG_RETHUNK) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO) #define ASM_RET "jmp __x86_return_thunk\n\t" -#else /* CONFIG_RETPOLINE */ +#else /* CONFIG_MITIGATION_RETPOLINE */ #ifdef CONFIG_SLS #define ASM_RET "ret; int3\n\t" #else #define ASM_RET "ret\n\t" #endif -#endif /* CONFIG_RETPOLINE */ +#endif /* CONFIG_MITIGATION_RETPOLINE */ #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 59810237eb421..32680cb720038 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -241,7 +241,7 @@ * instruction irrespective of kCFI. */ .macro JMP_NOSPEC reg:req -#ifdef CONFIG_RETPOLINE +#ifdef CONFIG_MITIGATION_RETPOLINE __CS_PREFIX \reg jmp __x86_indirect_thunk_\reg #else @@ -251,7 +251,7 @@ .endm .macro CALL_NOSPEC reg:req -#ifdef CONFIG_RETPOLINE +#ifdef CONFIG_MITIGATION_RETPOLINE __CS_PREFIX \reg call __x86_indirect_thunk_\reg #else @@ -378,7 +378,7 @@ static inline void call_depth_return_thunk(void) {} #endif /* CONFIG_MITIGATION_CALL_DEPTH_TRACKING */ -#ifdef CONFIG_RETPOLINE +#ifdef CONFIG_MITIGATION_RETPOLINE #define GEN(reg) \ extern retpoline_thunk_t __x86_indirect_thunk_ ## reg; @@ -399,7 +399,7 @@ static inline void call_depth_return_thunk(void) {} /* * Inline asm uses the %V modifier which is only in newer GCC - * which is ensured when CONFIG_RETPOLINE is defined. + * which is ensured when CONFIG_MITIGATION_RETPOLINE is defined. 
*/ # define CALL_NOSPEC \ ALTERNATIVE_2( \ diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 55e205b212710..08c182f5a0f4d 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -544,7 +544,7 @@ static inline bool is_jcc32(struct insn *insn) return insn->opcode.bytes[0] == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80; } -#if defined(CONFIG_RETPOLINE) && defined(CONFIG_OBJTOOL) +#if defined(CONFIG_MITIGATION_RETPOLINE) && defined(CONFIG_OBJTOOL) /* * CALL/JMP *%\reg @@ -844,12 +844,12 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end) void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } #endif /* CONFIG_RETHUNK */ -#else /* !CONFIG_RETPOLINE || !CONFIG_OBJTOOL */ +#else /* !CONFIG_MITIGATION_RETPOLINE || !CONFIG_OBJTOOL */ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { } void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } -#endif /* CONFIG_RETPOLINE && CONFIG_OBJTOOL */ +#endif /* CONFIG_MITIGATION_RETPOLINE && CONFIG_OBJTOOL */ #ifdef CONFIG_X86_KERNEL_IBT diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index b906ed4f30916..fc46fd6447f9f 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1103,7 +1103,7 @@ static enum spectre_v2_user_mitigation spectre_v2_user_stibp __ro_after_init = static enum spectre_v2_user_mitigation spectre_v2_user_ibpb __ro_after_init = SPECTRE_V2_USER_NONE; -#ifdef CONFIG_RETPOLINE +#ifdef CONFIG_MITIGATION_RETPOLINE static bool spectre_v2_bad_module; bool retpoline_module_ok(bool has_retpoline) @@ -1416,7 +1416,7 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) cmd == SPECTRE_V2_CMD_RETPOLINE_GENERIC || cmd == SPECTRE_V2_CMD_EIBRS_LFENCE || cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) && - !IS_ENABLED(CONFIG_RETPOLINE)) { + !IS_ENABLED(CONFIG_MITIGATION_RETPOLINE)) { pr_err("%s selected but not compiled in. Switching to AUTO select\n", mitigation_options[i].option); return SPECTRE_V2_CMD_AUTO; @@ -1470,7 +1470,7 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) static enum spectre_v2_mitigation __init spectre_v2_select_retpoline(void) { - if (!IS_ENABLED(CONFIG_RETPOLINE)) { + if (!IS_ENABLED(CONFIG_MITIGATION_RETPOLINE)) { pr_err("Kernel not compiled with retpoline; no mitigation available!"); return SPECTRE_V2_NONE; } diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 12df54ff0e817..93bc52d4a472d 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -307,7 +307,7 @@ union ftrace_op_code_union { } __attribute__((packed)); }; -#define RET_SIZE (IS_ENABLED(CONFIG_RETPOLINE) ? 5 : 1 + IS_ENABLED(CONFIG_SLS)) +#define RET_SIZE (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) ? 5 : 1 + IS_ENABLED(CONFIG_SLS)) static unsigned long create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 517821b48391a..36d6809c6c9e1 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -324,7 +324,7 @@ static int can_optimize(unsigned long paddr) * However, the kernel built with retpolines or IBT has jump * tables disabled so the check can be skipped altogether. 
*/ - if (!IS_ENABLED(CONFIG_RETPOLINE) && + if (!IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) && !IS_ENABLED(CONFIG_X86_KERNEL_IBT) && insn_is_indirect_jump(&insn)) return 0; diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index a349dbfc6d5ab..bb2ec03f046d1 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -132,7 +132,7 @@ SECTIONS LOCK_TEXT KPROBES_TEXT SOFTIRQENTRY_TEXT -#ifdef CONFIG_RETPOLINE +#ifdef CONFIG_MITIGATION_RETPOLINE *(.text..__x86.indirect_thunk) *(.text..__x86.return_thunk) #endif @@ -267,7 +267,7 @@ SECTIONS } #endif -#ifdef CONFIG_RETPOLINE +#ifdef CONFIG_MITIGATION_RETPOLINE /* * List of instructions that call/jmp/jcc to retpoline thunks * __x86_indirect_thunk_*(). These instructions can be patched along diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 0b1f991b9a312..6fdc1cf5c33d4 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -263,7 +263,7 @@ static unsigned long get_guest_cr3(struct kvm_vcpu *vcpu) static inline unsigned long kvm_mmu_get_guest_pgd(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) { - if (IS_ENABLED(CONFIG_RETPOLINE) && mmu->get_guest_pgd == get_guest_cr3) + if (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) && mmu->get_guest_pgd == get_guest_cr3) return kvm_read_cr3(vcpu); return mmu->get_guest_pgd(vcpu); diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index decc1f1536694..bf73a121c5ef1 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -312,7 +312,7 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, if (!prefetch) vcpu->stat.pf_taken++; - if (IS_ENABLED(CONFIG_RETPOLINE) && fault.is_tdp) + if (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) && fault.is_tdp) r = kvm_tdp_page_fault(vcpu, &fault); else r = vcpu->arch.mmu->page_fault(vcpu, &fault); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 7fb51424fc745..b2751b9acf03d 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3455,7 +3455,7 @@ int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code) if (!svm_check_exit_valid(exit_code)) return svm_handle_invalid_exit(vcpu, exit_code); -#ifdef CONFIG_RETPOLINE +#ifdef CONFIG_MITIGATION_RETPOLINE if (exit_code == SVM_EXIT_MSR) return msr_interception(vcpu); else if (exit_code == SVM_EXIT_VINTR) diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S index ef2ebabb059c8..b9e08837ab960 100644 --- a/arch/x86/kvm/svm/vmenter.S +++ b/arch/x86/kvm/svm/vmenter.S @@ -207,7 +207,7 @@ SYM_FUNC_START(__svm_vcpu_run) 7: vmload %_ASM_AX 8: -#ifdef CONFIG_RETPOLINE +#ifdef CONFIG_MITIGATION_RETPOLINE /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */ FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE #endif @@ -344,7 +344,7 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run) /* Pop @svm to RDI, guest registers have been saved already. */ pop %_ASM_DI -#ifdef CONFIG_RETPOLINE +#ifdef CONFIG_MITIGATION_RETPOLINE /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! 
*/ FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE #endif diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index e0f86f11c3454..4e1003ba380ae 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6544,7 +6544,7 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) if (exit_reason.basic >= kvm_vmx_max_exit_handlers) goto unexpected_vmexit; -#ifdef CONFIG_RETPOLINE +#ifdef CONFIG_MITIGATION_RETPOLINE if (exit_reason.basic == EXIT_REASON_MSR_WRITE) return kvm_emulate_wrmsr(vcpu); else if (exit_reason.basic == EXIT_REASON_PREEMPTION_TIMER) diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index ea3a28e7b613c..72cc9c90e9f30 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -49,7 +49,7 @@ lib-$(CONFIG_ARCH_HAS_COPY_MC) += copy_mc.o copy_mc_64.o lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o insn-eval.o lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o lib-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o -lib-$(CONFIG_RETPOLINE) += retpoline.o +lib-$(CONFIG_MITIGATION_RETPOLINE) += retpoline.o obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o obj-y += iomem.o diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 144a5839acfe5..ad1396b1df253 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -469,7 +469,7 @@ static void emit_indirect_jump(u8 **pprog, int reg, u8 *ip) emit_jump(&prog, &__x86_indirect_thunk_array[reg], ip); } else { EMIT2(0xFF, 0xE0 + reg); /* jmp *%\reg */ - if (IS_ENABLED(CONFIG_RETPOLINE) || IS_ENABLED(CONFIG_SLS)) + if (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) || IS_ENABLED(CONFIG_SLS)) EMIT1(0xCC); /* int3 */ } diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c index b18ce19981ece..c10083a8e68e6 100644 --- a/arch/x86/net/bpf_jit_comp32.c +++ b/arch/x86/net/bpf_jit_comp32.c @@ -1273,7 +1273,7 @@ static int emit_jmp_edx(u8 **pprog, u8 *ip) u8 *prog = *pprog; int cnt = 0; -#ifdef CONFIG_RETPOLINE +#ifdef CONFIG_MITIGATION_RETPOLINE EMIT1_off32(0xE9, (u8 *)__x86_indirect_thunk_edx - (ip + 5)); #else EMIT2(0xFF, 0xE2); diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile index 08aa0f25f12a0..bc31863c5ee63 100644 --- a/arch/x86/purgatory/Makefile +++ b/arch/x86/purgatory/Makefile @@ -61,7 +61,7 @@ ifdef CONFIG_STACKPROTECTOR_STRONG PURGATORY_CFLAGS_REMOVE += -fstack-protector-strong endif -ifdef CONFIG_RETPOLINE +ifdef CONFIG_MITIGATION_RETPOLINE PURGATORY_CFLAGS_REMOVE += $(RETPOLINE_CFLAGS) endif diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 2ceba3fe4ec16..d24f29091f4b0 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -35,7 +35,7 @@ (typeof(ptr)) (__ptr + (off)); \ }) -#ifdef CONFIG_RETPOLINE +#ifdef CONFIG_MITIGATION_RETPOLINE #define __noretpoline __attribute__((__indirect_branch__("keep"))) #endif diff --git a/include/linux/indirect_call_wrapper.h b/include/linux/indirect_call_wrapper.h index c1c76a70a6ce9..fe050dab55a38 100644 --- a/include/linux/indirect_call_wrapper.h +++ b/include/linux/indirect_call_wrapper.h @@ -2,7 +2,7 @@ #ifndef _LINUX_INDIRECT_CALL_WRAPPER_H #define _LINUX_INDIRECT_CALL_WRAPPER_H -#ifdef CONFIG_RETPOLINE +#ifdef CONFIG_MITIGATION_RETPOLINE /* * INDIRECT_CALL_$NR - wrapper for indirect calls with $NR known builtin diff --git a/include/linux/module.h b/include/linux/module.h index 9cd0009bd050c..087b369e8f178 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -885,7 +885,7 @@ static 
inline void module_bug_finalize(const Elf_Ehdr *hdr, static inline void module_bug_cleanup(struct module *mod) {} #endif /* CONFIG_GENERIC_BUG */ -#ifdef CONFIG_RETPOLINE +#ifdef CONFIG_MITIGATION_RETPOLINE extern bool retpoline_module_ok(bool has_retpoline); #else static inline bool retpoline_module_ok(bool has_retpoline) diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h index 780a5f6ad4a67..ff27cb2e16620 100644 --- a/include/net/netfilter/nf_tables_core.h +++ b/include/net/netfilter/nf_tables_core.h @@ -93,7 +93,7 @@ extern const struct nft_set_type nft_set_bitmap_type; extern const struct nft_set_type nft_set_pipapo_type; extern const struct nft_set_type nft_set_pipapo_avx2_type; -#ifdef CONFIG_RETPOLINE +#ifdef CONFIG_MITIGATION_RETPOLINE bool nft_rhash_lookup(const struct net *net, const struct nft_set *set, const u32 *key, const struct nft_set_ext **ext); bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set, diff --git a/include/net/tc_wrapper.h b/include/net/tc_wrapper.h index a6d481b5bcbcf..a13ba0326d5e0 100644 --- a/include/net/tc_wrapper.h +++ b/include/net/tc_wrapper.h @@ -4,7 +4,7 @@ #include -#if IS_ENABLED(CONFIG_RETPOLINE) +#if IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) #include #include diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 9286f88fcd32a..9cb69332921d9 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1156,7 +1156,7 @@ static inline u64 rb_time_stamp(struct trace_buffer *buffer) u64 ts; /* Skip retpolines :-( */ - if (IS_ENABLED(CONFIG_RETPOLINE) && likely(buffer->clock == trace_clock_local)) + if (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) && likely(buffer->clock == trace_clock_local)) ts = trace_clock_local(); else ts = buffer->clock(); diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index d4958e7e76310..614815a3ed738 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -101,7 +101,7 @@ endif endif ifdef CONFIG_NFT_CT -ifdef CONFIG_RETPOLINE +ifdef CONFIG_MITIGATION_RETPOLINE nf_tables-objs += nft_ct_fast.o endif endif diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index c3e635364701c..a48d5f0e2f3e1 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -21,7 +21,7 @@ #include #include -#if defined(CONFIG_RETPOLINE) && defined(CONFIG_X86) +#if defined(CONFIG_MITIGATION_RETPOLINE) && defined(CONFIG_X86) static struct static_key_false nf_tables_skip_direct_calls; @@ -207,7 +207,7 @@ static void expr_call_ops_eval(const struct nft_expr *expr, struct nft_regs *regs, struct nft_pktinfo *pkt) { -#ifdef CONFIG_RETPOLINE +#ifdef CONFIG_MITIGATION_RETPOLINE unsigned long e; if (nf_skip_indirect_calls()) @@ -236,7 +236,7 @@ static void expr_call_ops_eval(const struct nft_expr *expr, X(e, nft_objref_map_eval); #undef X indirect_call: -#endif /* CONFIG_RETPOLINE */ +#endif /* CONFIG_MITIGATION_RETPOLINE */ expr->ops->eval(expr, regs, pkt); } diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 86bb9d7797d9e..d3e66bcb2a910 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -751,7 +751,7 @@ static bool nft_ct_set_reduce(struct nft_regs_track *track, return false; } -#ifdef CONFIG_RETPOLINE +#ifdef CONFIG_MITIGATION_RETPOLINE static const struct nft_expr_ops nft_ct_get_fast_ops = { .type = &nft_ct_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_ct)), @@ -796,7 +796,7 @@ nft_ct_select_ops(const struct nft_ctx *ctx, return ERR_PTR(-EINVAL); if 
(tb[NFTA_CT_DREG]) { -#ifdef CONFIG_RETPOLINE +#ifdef CONFIG_MITIGATION_RETPOLINE u32 k = ntohl(nla_get_be32(tb[NFTA_CT_KEY])); switch (k) { diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index 870e5b113d13e..a0055f510e31e 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -24,7 +24,7 @@ struct nft_lookup { struct nft_set_binding binding; }; -#ifdef CONFIG_RETPOLINE +#ifdef CONFIG_MITIGATION_RETPOLINE bool nft_set_do_lookup(const struct net *net, const struct nft_set *set, const u32 *key, const struct nft_set_ext **ext) { diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index e9eaf637220e9..d577c9e1cb420 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -2353,7 +2353,7 @@ static struct pernet_operations psched_net_ops = { .exit = psched_net_exit, }; -#if IS_ENABLED(CONFIG_RETPOLINE) +#if IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) DEFINE_STATIC_KEY_FALSE(tc_skip_wrapper); #endif diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index ee233ef916968..615f2612a7efb 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -262,7 +262,7 @@ ifdef CONFIG_FTRACE_MCOUNT_USE_OBJTOOL objtool-args-$(CONFIG_HAVE_OBJTOOL_NOP_MCOUNT) += --mnop endif objtool-args-$(CONFIG_UNWINDER_ORC) += --orc -objtool-args-$(CONFIG_RETPOLINE) += --retpoline +objtool-args-$(CONFIG_MITIGATION_RETPOLINE) += --retpoline objtool-args-$(CONFIG_RETHUNK) += --rethunk objtool-args-$(CONFIG_SLS) += --sls objtool-args-$(CONFIG_STACK_VALIDATION) += --stackval diff --git a/scripts/generate_rust_target.rs b/scripts/generate_rust_target.rs index 3c6cbe2b278d3..eaf5246037964 100644 --- a/scripts/generate_rust_target.rs +++ b/scripts/generate_rust_target.rs @@ -155,7 +155,7 @@ fn main() { "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", ); let mut features = "-3dnow,-3dnowa,-mmx,+soft-float".to_string(); - if cfg.has("RETPOLINE") { + if cfg.has("MITIGATION_RETPOLINE") { features += ",+retpoline-external-thunk"; } ts.push("features", features); diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index cb6406f485a96..72fead5f973b7 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -1843,7 +1843,7 @@ static void add_header(struct buffer *b, struct module *mod) buf_printf(b, "\n" - "#ifdef CONFIG_RETPOLINE\n" + "#ifdef CONFIG_MITIGATION_RETPOLINE\n" "MODULE_INFO(retpoline, \"Y\");\n" "#endif\n"); diff --git a/tools/arch/x86/include/asm/disabled-features.h b/tools/arch/x86/include/asm/disabled-features.h index fb604ec95a5fa..24e4010c33b6a 100644 --- a/tools/arch/x86/include/asm/disabled-features.h +++ b/tools/arch/x86/include/asm/disabled-features.h @@ -50,7 +50,7 @@ # define DISABLE_PTI (1 << (X86_FEATURE_PTI & 31)) #endif -#ifdef CONFIG_RETPOLINE +#ifdef CONFIG_MITIGATION_RETPOLINE # define DISABLE_RETPOLINE 0 #else # define DISABLE_RETPOLINE ((1 << (X86_FEATURE_RETPOLINE & 31)) | \ diff --git a/tools/objtool/arch/x86/special.c b/tools/objtool/arch/x86/special.c index 29e949579ede6..4134d27c696bd 100644 --- a/tools/objtool/arch/x86/special.c +++ b/tools/objtool/arch/x86/special.c @@ -83,7 +83,7 @@ bool arch_support_alt_relocation(struct special_alt *special_alt, * TODO: Once we have DWARF CFI and smarter instruction decoding logic, * ensure the same register is used in the mov and jump instructions. * - * NOTE: RETPOLINE made it harder still to decode dynamic jumps. + * NOTE: MITIGATION_RETPOLINE made it harder still to decode dynamic jumps. 
*/ struct reloc *arch_find_switch_table(struct objtool_file *file, struct instruction *insn) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 548ec3cd7c00c..84067f018f7ec 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -3984,7 +3984,7 @@ static int validate_retpoline(struct objtool_file *file) } else continue; } else { - WARN_INSN(insn, "indirect %s found in RETPOLINE build", + WARN_INSN(insn, "indirect %s found in MITIGATION_RETPOLINE build", insn->type == INSN_JUMP_DYNAMIC ? "jump" : "call"); } -- GitLab From 7b75782ffd8243288d0661750b2dcc2596d676cb Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 21 Nov 2023 08:07:33 -0800 Subject: [PATCH 0034/1519] x86/bugs: Rename CONFIG_SLS => CONFIG_MITIGATION_SLS Step 6/10 of the namespace unification of CPU mitigations related Kconfig options. Suggested-by: Josh Poimboeuf Signed-off-by: Breno Leitao Signed-off-by: Ingo Molnar Acked-by: Josh Poimboeuf Cc: Linus Torvalds Link: https://lore.kernel.org/r/20231121160740.1249350-7-leitao@debian.org --- arch/x86/Kconfig | 2 +- arch/x86/Makefile | 2 +- arch/x86/include/asm/linkage.h | 4 ++-- arch/x86/kernel/alternative.c | 4 ++-- arch/x86/kernel/ftrace.c | 3 ++- arch/x86/net/bpf_jit_comp.c | 4 ++-- scripts/Makefile.lib | 2 +- 7 files changed, 11 insertions(+), 10 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 2a3ebd679b0e1..ba4546556fc55 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2577,7 +2577,7 @@ config CPU_SRSO help Enable the SRSO mitigation needed on AMD Zen1-4 machines. -config SLS +config MITIGATION_SLS bool "Mitigate Straight-Line-Speculation" depends on CC_HAS_SLS && X86_64 select OBJTOOL if HAVE_OBJTOOL diff --git a/arch/x86/Makefile b/arch/x86/Makefile index b8d23ed059fb4..5ce8c30e77010 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -205,7 +205,7 @@ ifdef CONFIG_MITIGATION_RETPOLINE endif endif -ifdef CONFIG_SLS +ifdef CONFIG_MITIGATION_SLS KBUILD_CFLAGS += -mharden-sls=all endif diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h index c5165204c66f0..09e2d026df333 100644 --- a/arch/x86/include/asm/linkage.h +++ b/arch/x86/include/asm/linkage.h @@ -43,7 +43,7 @@ #if defined(CONFIG_RETHUNK) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO) #define RET jmp __x86_return_thunk #else /* CONFIG_MITIGATION_RETPOLINE */ -#ifdef CONFIG_SLS +#ifdef CONFIG_MITIGATION_SLS #define RET ret; int3 #else #define RET ret @@ -55,7 +55,7 @@ #if defined(CONFIG_RETHUNK) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO) #define ASM_RET "jmp __x86_return_thunk\n\t" #else /* CONFIG_MITIGATION_RETPOLINE */ -#ifdef CONFIG_SLS +#ifdef CONFIG_MITIGATION_SLS #define ASM_RET "ret; int3\n\t" #else #define ASM_RET "ret\n\t" diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 08c182f5a0f4d..f5442d0d2ee9b 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -708,8 +708,8 @@ static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes) /* * The compiler is supposed to EMIT an INT3 after every unconditional * JMP instruction due to AMD BTC. However, if the compiler is too old - * or SLS isn't enabled, we still need an INT3 after indirect JMPs - * even on Intel. + * or MITIGATION_SLS isn't enabled, we still need an INT3 after + * indirect JMPs even on Intel. 
*/ if (op == JMP32_INSN_OPCODE && i < insn->length) bytes[i++] = INT3_INSN_OPCODE; diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 93bc52d4a472d..70139d9d2e017 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -307,7 +307,8 @@ union ftrace_op_code_union { } __attribute__((packed)); }; -#define RET_SIZE (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) ? 5 : 1 + IS_ENABLED(CONFIG_SLS)) +#define RET_SIZE \ + (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) ? 5 : 1 + IS_ENABLED(CONFIG_MITIGATION_SLS)) static unsigned long create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index ad1396b1df253..63b7aa48793e3 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -469,7 +469,7 @@ static void emit_indirect_jump(u8 **pprog, int reg, u8 *ip) emit_jump(&prog, &__x86_indirect_thunk_array[reg], ip); } else { EMIT2(0xFF, 0xE0 + reg); /* jmp *%\reg */ - if (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) || IS_ENABLED(CONFIG_SLS)) + if (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) || IS_ENABLED(CONFIG_MITIGATION_SLS)) EMIT1(0xCC); /* int3 */ } @@ -484,7 +484,7 @@ static void emit_return(u8 **pprog, u8 *ip) emit_jump(&prog, x86_return_thunk, ip); } else { EMIT1(0xC3); /* ret */ - if (IS_ENABLED(CONFIG_SLS)) + if (IS_ENABLED(CONFIG_MITIGATION_SLS)) EMIT1(0xCC); /* int3 */ } diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index 615f2612a7efb..b272ca64cd758 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -264,7 +264,7 @@ endif objtool-args-$(CONFIG_UNWINDER_ORC) += --orc objtool-args-$(CONFIG_MITIGATION_RETPOLINE) += --retpoline objtool-args-$(CONFIG_RETHUNK) += --rethunk -objtool-args-$(CONFIG_SLS) += --sls +objtool-args-$(CONFIG_MITIGATION_SLS) += --sls objtool-args-$(CONFIG_STACK_VALIDATION) += --stackval objtool-args-$(CONFIG_HAVE_STATIC_CALL_INLINE) += --static-call objtool-args-$(CONFIG_HAVE_UACCESS_VALIDATION) += --uaccess -- GitLab From ac61d43983a4fe8e3ee600eee44c40868c14340a Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 21 Nov 2023 08:07:34 -0800 Subject: [PATCH 0035/1519] x86/bugs: Rename CONFIG_CPU_UNRET_ENTRY => CONFIG_MITIGATION_UNRET_ENTRY Step 7/10 of the namespace unification of CPU mitigations related Kconfig options. Suggested-by: Josh Poimboeuf Signed-off-by: Breno Leitao Signed-off-by: Ingo Molnar Acked-by: Josh Poimboeuf Cc: Linus Torvalds Link: https://lore.kernel.org/r/20231121160740.1249350-8-leitao@debian.org --- arch/x86/Kconfig | 2 +- arch/x86/include/asm/disabled-features.h | 2 +- arch/x86/include/asm/nospec-branch.h | 6 +++--- arch/x86/kernel/cpu/amd.c | 2 +- arch/x86/kernel/cpu/bugs.c | 6 +++--- arch/x86/kernel/vmlinux.lds.S | 2 +- arch/x86/lib/retpoline.S | 10 +++++----- include/linux/objtool.h | 2 +- scripts/Makefile.vmlinux_o | 2 +- tools/arch/x86/include/asm/disabled-features.h | 2 +- 10 files changed, 18 insertions(+), 18 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ba4546556fc55..77b8769f30261 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2516,7 +2516,7 @@ config RETHUNK Requires a compiler with -mfunction-return=thunk-extern support for full protection. The kernel may run slower. 
-config CPU_UNRET_ENTRY +config MITIGATION_UNRET_ENTRY bool "Enable UNRET on kernel entry" depends on CPU_SUP_AMD && RETHUNK && X86_64 default y diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index 24e4010c33b6a..151f0d50e7e03 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -63,7 +63,7 @@ # define DISABLE_RETHUNK (1 << (X86_FEATURE_RETHUNK & 31)) #endif -#ifdef CONFIG_CPU_UNRET_ENTRY +#ifdef CONFIG_MITIGATION_UNRET_ENTRY # define DISABLE_UNRET 0 #else # define DISABLE_UNRET (1 << (X86_FEATURE_UNRET & 31)) diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 32680cb720038..97478690a5794 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -212,7 +212,7 @@ */ .macro VALIDATE_UNRET_END #if defined(CONFIG_NOINSTR_VALIDATION) && \ - (defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_SRSO)) + (defined(CONFIG_MITIGATION_UNRET_ENTRY) || defined(CONFIG_CPU_SRSO)) ANNOTATE_RETPOLINE_SAFE nop #endif @@ -271,7 +271,7 @@ .Lskip_rsb_\@: .endm -#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_SRSO) +#if defined(CONFIG_MITIGATION_UNRET_ENTRY) || defined(CONFIG_CPU_SRSO) #define CALL_UNTRAIN_RET "call entry_untrain_ret" #else #define CALL_UNTRAIN_RET "" @@ -334,7 +334,7 @@ extern void __x86_return_thunk(void); static inline void __x86_return_thunk(void) {} #endif -#ifdef CONFIG_CPU_UNRET_ENTRY +#ifdef CONFIG_MITIGATION_UNRET_ENTRY extern void retbleed_return_thunk(void); #else static inline void retbleed_return_thunk(void) {} diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 9f42d1c59e095..6c7db2ec2c5e5 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -928,7 +928,7 @@ static void fix_erratum_1386(struct cpuinfo_x86 *c) void init_spectral_chicken(struct cpuinfo_x86 *c) { -#ifdef CONFIG_CPU_UNRET_ENTRY +#ifdef CONFIG_MITIGATION_UNRET_ENTRY u64 value; /* diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index fc46fd6447f9f..2580368c32d1f 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -982,10 +982,10 @@ static void __init retbleed_select_mitigation(void) return; case RETBLEED_CMD_UNRET: - if (IS_ENABLED(CONFIG_CPU_UNRET_ENTRY)) { + if (IS_ENABLED(CONFIG_MITIGATION_UNRET_ENTRY)) { retbleed_mitigation = RETBLEED_MITIGATION_UNRET; } else { - pr_err("WARNING: kernel not compiled with CPU_UNRET_ENTRY.\n"); + pr_err("WARNING: kernel not compiled with MITIGATION_UNRET_ENTRY.\n"); goto do_cmd_auto; } break; @@ -1021,7 +1021,7 @@ static void __init retbleed_select_mitigation(void) case RETBLEED_CMD_AUTO: if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) { - if (IS_ENABLED(CONFIG_CPU_UNRET_ENTRY)) + if (IS_ENABLED(CONFIG_MITIGATION_UNRET_ENTRY)) retbleed_mitigation = RETBLEED_MITIGATION_UNRET; else if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY) && boot_cpu_has(X86_FEATURE_IBPB)) diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index bb2ec03f046d1..9c5cca50a36fa 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -504,7 +504,7 @@ INIT_PER_CPU(irq_stack_backing_store); "fixed_percpu_data is not at start of per-cpu area"); #endif -#ifdef CONFIG_CPU_UNRET_ENTRY +#ifdef CONFIG_MITIGATION_UNRET_ENTRY . 
= ASSERT((retbleed_return_thunk & 0x3f) == 0, "retbleed_return_thunk not cacheline-aligned"); #endif diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index ff46f48a0cc4d..0ad67ccadd4c7 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -230,7 +230,7 @@ SYM_CODE_END(srso_return_thunk) #define JMP_SRSO_ALIAS_UNTRAIN_RET "ud2" #endif /* CONFIG_CPU_SRSO */ -#ifdef CONFIG_CPU_UNRET_ENTRY +#ifdef CONFIG_MITIGATION_UNRET_ENTRY /* * Some generic notes on the untraining sequences: @@ -312,11 +312,11 @@ SYM_CODE_END(retbleed_return_thunk) SYM_FUNC_END(retbleed_untrain_ret) #define JMP_RETBLEED_UNTRAIN_RET "jmp retbleed_untrain_ret" -#else /* !CONFIG_CPU_UNRET_ENTRY */ +#else /* !CONFIG_MITIGATION_UNRET_ENTRY */ #define JMP_RETBLEED_UNTRAIN_RET "ud2" -#endif /* CONFIG_CPU_UNRET_ENTRY */ +#endif /* CONFIG_MITIGATION_UNRET_ENTRY */ -#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_SRSO) +#if defined(CONFIG_MITIGATION_UNRET_ENTRY) || defined(CONFIG_CPU_SRSO) SYM_FUNC_START(entry_untrain_ret) ALTERNATIVE_2 JMP_RETBLEED_UNTRAIN_RET, \ @@ -325,7 +325,7 @@ SYM_FUNC_START(entry_untrain_ret) SYM_FUNC_END(entry_untrain_ret) __EXPORT_THUNK(entry_untrain_ret) -#endif /* CONFIG_CPU_UNRET_ENTRY || CONFIG_CPU_SRSO */ +#endif /* CONFIG_MITIGATION_UNRET_ENTRY || CONFIG_CPU_SRSO */ #ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING diff --git a/include/linux/objtool.h b/include/linux/objtool.h index 33212e93f4a63..d030671a4c49d 100644 --- a/include/linux/objtool.h +++ b/include/linux/objtool.h @@ -131,7 +131,7 @@ */ .macro VALIDATE_UNRET_BEGIN #if defined(CONFIG_NOINSTR_VALIDATION) && \ - (defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_SRSO)) + (defined(CONFIG_MITIGATION_UNRET_ENTRY) || defined(CONFIG_CPU_SRSO)) .Lhere_\@: .pushsection .discard.validate_unret .long .Lhere_\@ - . diff --git a/scripts/Makefile.vmlinux_o b/scripts/Makefile.vmlinux_o index 25b3b587d37c0..6277dbd730bbb 100644 --- a/scripts/Makefile.vmlinux_o +++ b/scripts/Makefile.vmlinux_o @@ -38,7 +38,7 @@ objtool-enabled := $(or $(delay-objtool),$(CONFIG_NOINSTR_VALIDATION)) vmlinux-objtool-args-$(delay-objtool) += $(objtool-args-y) vmlinux-objtool-args-$(CONFIG_GCOV_KERNEL) += --no-unreachable vmlinux-objtool-args-$(CONFIG_NOINSTR_VALIDATION) += --noinstr \ - $(if $(or $(CONFIG_CPU_UNRET_ENTRY),$(CONFIG_CPU_SRSO)), --unret) + $(if $(or $(CONFIG_MITIGATION_UNRET_ENTRY),$(CONFIG_CPU_SRSO)), --unret) objtool-args = $(vmlinux-objtool-args-y) --link diff --git a/tools/arch/x86/include/asm/disabled-features.h b/tools/arch/x86/include/asm/disabled-features.h index 24e4010c33b6a..151f0d50e7e03 100644 --- a/tools/arch/x86/include/asm/disabled-features.h +++ b/tools/arch/x86/include/asm/disabled-features.h @@ -63,7 +63,7 @@ # define DISABLE_RETHUNK (1 << (X86_FEATURE_RETHUNK & 31)) #endif -#ifdef CONFIG_CPU_UNRET_ENTRY +#ifdef CONFIG_MITIGATION_UNRET_ENTRY # define DISABLE_UNRET 0 #else # define DISABLE_UNRET (1 << (X86_FEATURE_UNRET & 31)) -- GitLab From 1da8d2172ce5175118929363fe568e41f24ad3d6 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 21 Nov 2023 08:07:35 -0800 Subject: [PATCH 0036/1519] x86/bugs: Rename CONFIG_CPU_IBRS_ENTRY => CONFIG_MITIGATION_IBRS_ENTRY Step 8/10 of the namespace unification of CPU mitigations related Kconfig options. 
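
As with the earlier steps, the C-side checks for the renamed option go through IS_ENABLED() rather than #ifdef where possible (see the bugs.c hunks below). A minimal sketch of that idiom, using a hypothetical CONFIG_MITIGATION_FOO and apply_foo_mitigation() for illustration:

    /*
     * IS_ENABLED(CONFIG_MITIGATION_FOO) expands to 1 when the option is
     * set (=y or =m) and to 0 otherwise, so the guarded code is always
     * parsed and type-checked, then eliminated as dead code when the
     * option is off -- unlike #ifdef blocks, which the compiler never
     * sees in a disabled configuration.
     */
    static void foo_select_mitigation(void)
    {
            if (!IS_ENABLED(CONFIG_MITIGATION_FOO)) {
                    pr_err("mitigation selected but not compiled in\n");
                    return;
            }
            apply_foo_mitigation();
    }
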
Suggested-by: Josh Poimboeuf Signed-off-by: Breno Leitao Signed-off-by: Ingo Molnar Acked-by: Josh Poimboeuf Cc: Linus Torvalds Link: https://lore.kernel.org/r/20231121160740.1249350-9-leitao@debian.org --- arch/x86/Kconfig | 2 +- arch/x86/entry/calling.h | 4 ++-- arch/x86/kernel/cpu/bugs.c | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 77b8769f30261..60d38df26f007 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2561,7 +2561,7 @@ config MITIGATION_IBPB_ENTRY help Compile the kernel with support for the retbleed=ibpb mitigation. -config CPU_IBRS_ENTRY +config MITIGATION_IBRS_ENTRY bool "Enable IBRS on kernel entry" depends on CPU_SUP_INTEL && X86_64 default y diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index 7ac2d6f946ed0..39e069b68c6ee 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -303,7 +303,7 @@ For 32-bit we have the following conventions - kernel is built with * Assumes x86_spec_ctrl_{base,current} to have SPEC_CTRL_IBRS set. */ .macro IBRS_ENTER save_reg -#ifdef CONFIG_CPU_IBRS_ENTRY +#ifdef CONFIG_MITIGATION_IBRS_ENTRY ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_KERNEL_IBRS movl $MSR_IA32_SPEC_CTRL, %ecx @@ -332,7 +332,7 @@ For 32-bit we have the following conventions - kernel is built with * regs. Must be called after the last RET. */ .macro IBRS_EXIT save_reg -#ifdef CONFIG_CPU_IBRS_ENTRY +#ifdef CONFIG_MITIGATION_IBRS_ENTRY ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_KERNEL_IBRS movl $MSR_IA32_SPEC_CTRL, %ecx diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 2580368c32d1f..e11bacbd8f393 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1439,7 +1439,7 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) return SPECTRE_V2_CMD_AUTO; } - if (cmd == SPECTRE_V2_CMD_IBRS && !IS_ENABLED(CONFIG_CPU_IBRS_ENTRY)) { + if (cmd == SPECTRE_V2_CMD_IBRS && !IS_ENABLED(CONFIG_MITIGATION_IBRS_ENTRY)) { pr_err("%s selected but not compiled in. Switching to AUTO select\n", mitigation_options[i].option); return SPECTRE_V2_CMD_AUTO; @@ -1565,7 +1565,7 @@ static void __init spectre_v2_select_mitigation(void) break; } - if (IS_ENABLED(CONFIG_CPU_IBRS_ENTRY) && + if (IS_ENABLED(CONFIG_MITIGATION_IBRS_ENTRY) && boot_cpu_has_bug(X86_BUG_RETBLEED) && retbleed_cmd != RETBLEED_CMD_OFF && retbleed_cmd != RETBLEED_CMD_STUFF && -- GitLab From a033eec9a06ce25388e71fa1e888792a718b9c17 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 21 Nov 2023 08:07:36 -0800 Subject: [PATCH 0037/1519] x86/bugs: Rename CONFIG_CPU_SRSO => CONFIG_MITIGATION_SRSO Step 9/10 of the namespace unification of CPU mitigations related Kconfig options. Suggested-by: Josh Poimboeuf Signed-off-by: Breno Leitao Signed-off-by: Ingo Molnar Acked-by: Josh Poimboeuf Cc: Linus Torvalds Link: https://lore.kernel.org/r/20231121160740.1249350-10-leitao@debian.org --- arch/x86/Kconfig | 2 +- arch/x86/include/asm/nospec-branch.h | 6 +++--- arch/x86/kernel/cpu/bugs.c | 8 ++++---- arch/x86/kernel/vmlinux.lds.S | 4 ++-- arch/x86/lib/retpoline.S | 10 +++++----- include/linux/objtool.h | 2 +- scripts/Makefile.vmlinux_o | 2 +- 7 files changed, 17 insertions(+), 17 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 60d38df26f007..a2743b7f2a18f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2570,7 +2570,7 @@ config MITIGATION_IBRS_ENTRY This mitigates both spectre_v2 and retbleed at great cost to performance. 
-config CPU_SRSO +config MITIGATION_SRSO bool "Mitigate speculative RAS overflow on AMD" depends on CPU_SUP_AMD && X86_64 && RETHUNK default y diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 97478690a5794..94c70832994f3 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -212,7 +212,7 @@ */ .macro VALIDATE_UNRET_END #if defined(CONFIG_NOINSTR_VALIDATION) && \ - (defined(CONFIG_MITIGATION_UNRET_ENTRY) || defined(CONFIG_CPU_SRSO)) + (defined(CONFIG_MITIGATION_UNRET_ENTRY) || defined(CONFIG_MITIGATION_SRSO)) ANNOTATE_RETPOLINE_SAFE nop #endif @@ -271,7 +271,7 @@ .Lskip_rsb_\@: .endm -#if defined(CONFIG_MITIGATION_UNRET_ENTRY) || defined(CONFIG_CPU_SRSO) +#if defined(CONFIG_MITIGATION_UNRET_ENTRY) || defined(CONFIG_MITIGATION_SRSO) #define CALL_UNTRAIN_RET "call entry_untrain_ret" #else #define CALL_UNTRAIN_RET "" @@ -340,7 +340,7 @@ extern void retbleed_return_thunk(void); static inline void retbleed_return_thunk(void) {} #endif -#ifdef CONFIG_CPU_SRSO +#ifdef CONFIG_MITIGATION_SRSO extern void srso_return_thunk(void); extern void srso_alias_return_thunk(void); #else diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index e11bacbd8f393..f2775417bda26 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -2458,7 +2458,7 @@ static void __init srso_select_mitigation(void) break; case SRSO_CMD_SAFE_RET: - if (IS_ENABLED(CONFIG_CPU_SRSO)) { + if (IS_ENABLED(CONFIG_MITIGATION_SRSO)) { /* * Enable the return thunk for generated code * like ftrace, static_call, etc. @@ -2478,7 +2478,7 @@ static void __init srso_select_mitigation(void) else srso_mitigation = SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED; } else { - pr_err("WARNING: kernel not compiled with CPU_SRSO.\n"); + pr_err("WARNING: kernel not compiled with MITIGATION_SRSO.\n"); } break; @@ -2494,13 +2494,13 @@ static void __init srso_select_mitigation(void) break; case SRSO_CMD_IBPB_ON_VMEXIT: - if (IS_ENABLED(CONFIG_CPU_SRSO)) { + if (IS_ENABLED(CONFIG_MITIGATION_SRSO)) { if (!boot_cpu_has(X86_FEATURE_ENTRY_IBPB) && has_microcode) { setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT); srso_mitigation = SRSO_MITIGATION_IBPB_ON_VMEXIT; } } else { - pr_err("WARNING: kernel not compiled with CPU_SRSO.\n"); + pr_err("WARNING: kernel not compiled with MITIGATION_SRSO.\n"); } break; } diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 9c5cca50a36fa..6716fccd59ce0 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -142,7 +142,7 @@ SECTIONS *(.text..__x86.rethunk_untrain) ENTRY_TEXT -#ifdef CONFIG_CPU_SRSO +#ifdef CONFIG_MITIGATION_SRSO /* * See the comment above srso_alias_untrain_ret()'s * definition. @@ -508,7 +508,7 @@ INIT_PER_CPU(irq_stack_backing_store); . = ASSERT((retbleed_return_thunk & 0x3f) == 0, "retbleed_return_thunk not cacheline-aligned"); #endif -#ifdef CONFIG_CPU_SRSO +#ifdef CONFIG_MITIGATION_SRSO . = ASSERT((srso_safe_ret & 0x3f) == 0, "srso_safe_ret not cacheline-aligned"); /* * GNU ld cannot do XOR until 2.41. 
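
The nospec-branch.h hunk above keeps the usual shape for optional mitigation hooks: a real extern declaration when the option is enabled and an empty inline stub when it is not, so call sites build unconditionally. A minimal sketch of the pattern, with CONFIG_MITIGATION_FOO and foo_return_thunk() as hypothetical stand-ins:

    /*
     * Sketch only: when the option is off, the stub is trivially
     * inlined away and callers need no #ifdefs of their own; when it
     * is on, the real thunk is provided elsewhere (in assembly, for
     * the SRSO case).
     */
    #ifdef CONFIG_MITIGATION_FOO
    extern void foo_return_thunk(void);
    #else
    static inline void foo_return_thunk(void) {}
    #endif
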
diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index 0ad67ccadd4c7..67b52cbec6488 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -138,7 +138,7 @@ SYM_CODE_END(__x86_indirect_jump_thunk_array) */ .section .text..__x86.return_thunk -#ifdef CONFIG_CPU_SRSO +#ifdef CONFIG_MITIGATION_SRSO /* * srso_alias_untrain_ret() and srso_alias_safe_ret() are placed at @@ -225,10 +225,10 @@ SYM_CODE_END(srso_return_thunk) #define JMP_SRSO_UNTRAIN_RET "jmp srso_untrain_ret" #define JMP_SRSO_ALIAS_UNTRAIN_RET "jmp srso_alias_untrain_ret" -#else /* !CONFIG_CPU_SRSO */ +#else /* !CONFIG_MITIGATION_SRSO */ #define JMP_SRSO_UNTRAIN_RET "ud2" #define JMP_SRSO_ALIAS_UNTRAIN_RET "ud2" -#endif /* CONFIG_CPU_SRSO */ +#endif /* CONFIG_MITIGATION_SRSO */ #ifdef CONFIG_MITIGATION_UNRET_ENTRY @@ -316,7 +316,7 @@ SYM_FUNC_END(retbleed_untrain_ret) #define JMP_RETBLEED_UNTRAIN_RET "ud2" #endif /* CONFIG_MITIGATION_UNRET_ENTRY */ -#if defined(CONFIG_MITIGATION_UNRET_ENTRY) || defined(CONFIG_CPU_SRSO) +#if defined(CONFIG_MITIGATION_UNRET_ENTRY) || defined(CONFIG_MITIGATION_SRSO) SYM_FUNC_START(entry_untrain_ret) ALTERNATIVE_2 JMP_RETBLEED_UNTRAIN_RET, \ @@ -325,7 +325,7 @@ SYM_FUNC_START(entry_untrain_ret) SYM_FUNC_END(entry_untrain_ret) __EXPORT_THUNK(entry_untrain_ret) -#endif /* CONFIG_MITIGATION_UNRET_ENTRY || CONFIG_CPU_SRSO */ +#endif /* CONFIG_MITIGATION_UNRET_ENTRY || CONFIG_MITIGATION_SRSO */ #ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING diff --git a/include/linux/objtool.h b/include/linux/objtool.h index d030671a4c49d..b3b8d3dab52d5 100644 --- a/include/linux/objtool.h +++ b/include/linux/objtool.h @@ -131,7 +131,7 @@ */ .macro VALIDATE_UNRET_BEGIN #if defined(CONFIG_NOINSTR_VALIDATION) && \ - (defined(CONFIG_MITIGATION_UNRET_ENTRY) || defined(CONFIG_CPU_SRSO)) + (defined(CONFIG_MITIGATION_UNRET_ENTRY) || defined(CONFIG_MITIGATION_SRSO)) .Lhere_\@: .pushsection .discard.validate_unret .long .Lhere_\@ - . diff --git a/scripts/Makefile.vmlinux_o b/scripts/Makefile.vmlinux_o index 6277dbd730bbb..6de297916ce68 100644 --- a/scripts/Makefile.vmlinux_o +++ b/scripts/Makefile.vmlinux_o @@ -38,7 +38,7 @@ objtool-enabled := $(or $(delay-objtool),$(CONFIG_NOINSTR_VALIDATION)) vmlinux-objtool-args-$(delay-objtool) += $(objtool-args-y) vmlinux-objtool-args-$(CONFIG_GCOV_KERNEL) += --no-unreachable vmlinux-objtool-args-$(CONFIG_NOINSTR_VALIDATION) += --noinstr \ - $(if $(or $(CONFIG_MITIGATION_UNRET_ENTRY),$(CONFIG_CPU_SRSO)), --unret) + $(if $(or $(CONFIG_MITIGATION_UNRET_ENTRY),$(CONFIG_MITIGATION_SRSO)), --unret) objtool-args = $(vmlinux-objtool-args-y) --link -- GitLab From 0911b8c52c4d68c57d02f172daa55a42bce703f0 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 21 Nov 2023 08:07:37 -0800 Subject: [PATCH 0038/1519] x86/bugs: Rename CONFIG_RETHUNK => CONFIG_MITIGATION_RETHUNK Step 10/10 of the namespace unification of CPU mitigations related Kconfig options. [ mingo: Added one more case. 
] Suggested-by: Josh Poimboeuf Signed-off-by: Breno Leitao Signed-off-by: Ingo Molnar Acked-by: Josh Poimboeuf Cc: Linus Torvalds Link: https://lore.kernel.org/r/20231121160740.1249350-11-leitao@debian.org --- arch/x86/Kconfig | 8 ++++---- arch/x86/Makefile | 2 +- arch/x86/configs/i386_defconfig | 2 +- arch/x86/include/asm/disabled-features.h | 2 +- arch/x86/include/asm/linkage.h | 4 ++-- arch/x86/include/asm/nospec-branch.h | 4 ++-- arch/x86/include/asm/static_call.h | 2 +- arch/x86/kernel/alternative.c | 4 ++-- arch/x86/kernel/static_call.c | 2 +- arch/x86/lib/retpoline.S | 4 ++-- scripts/Makefile.lib | 2 +- tools/arch/x86/include/asm/disabled-features.h | 2 +- tools/objtool/check.c | 2 +- 13 files changed, 20 insertions(+), 20 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index a2743b7f2a18f..0a9fea390ef30 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2462,7 +2462,7 @@ config FINEIBT config HAVE_CALL_THUNKS def_bool y - depends on CC_HAS_ENTRY_PADDING && RETHUNK && OBJTOOL + depends on CC_HAS_ENTRY_PADDING && MITIGATION_RETHUNK && OBJTOOL config CALL_THUNKS def_bool n @@ -2505,7 +2505,7 @@ config MITIGATION_RETPOLINE branches. Requires a compiler with -mindirect-branch=thunk-extern support for full protection. The kernel may run slower. -config RETHUNK +config MITIGATION_RETHUNK bool "Enable return-thunks" depends on MITIGATION_RETPOLINE && CC_HAS_RETURN_THUNK select OBJTOOL if HAVE_OBJTOOL @@ -2518,7 +2518,7 @@ config RETHUNK config MITIGATION_UNRET_ENTRY bool "Enable UNRET on kernel entry" - depends on CPU_SUP_AMD && RETHUNK && X86_64 + depends on CPU_SUP_AMD && MITIGATION_RETHUNK && X86_64 default y help Compile the kernel with support for the retbleed=unret mitigation. @@ -2572,7 +2572,7 @@ config MITIGATION_IBRS_ENTRY config MITIGATION_SRSO bool "Mitigate speculative RAS overflow on AMD" - depends on CPU_SUP_AMD && X86_64 && RETHUNK + depends on CPU_SUP_AMD && X86_64 && MITIGATION_RETHUNK default y help Enable the SRSO mitigation needed on AMD Zen1-4 machines. 
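
The Makefile hunk below is where the renamed option takes effect: CONFIG_MITIGATION_RETHUNK adds -mfunction-return=thunk-extern, under which the compiler stops emitting plain ret instructions and instead jumps to an externally supplied thunk. Roughly, for an ordinary function (illustration only; exact code generation varies by compiler):

    /*
     * With -mfunction-return=thunk-extern, the function epilogue is
     * emitted as "jmp __x86_return_thunk" instead of "ret"; the kernel
     * supplies the thunk in arch/x86/lib/retpoline.S and can patch it
     * back to a bare ret at boot when no return mitigation is needed.
     */
    int add_one(int x)
    {
            return x + 1;   /* epilogue: jmp __x86_return_thunk */
    }
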
diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 5ce8c30e77010..ba046afb850e5 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -22,7 +22,7 @@ RETPOLINE_VDSO_CFLAGS := -mretpoline endif RETPOLINE_CFLAGS += $(call cc-option,-mindirect-branch-cs-prefix) -ifdef CONFIG_RETHUNK +ifdef CONFIG_MITIGATION_RETHUNK RETHUNK_CFLAGS := -mfunction-return=thunk-extern RETPOLINE_CFLAGS += $(RETHUNK_CFLAGS) endif diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 73abbbdd26f87..91801138b10bb 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -42,7 +42,7 @@ CONFIG_EFI_STUB=y CONFIG_HZ_1000=y CONFIG_KEXEC=y CONFIG_CRASH_DUMP=y -# CONFIG_RETHUNK is not set +# CONFIG_MITIGATION_RETHUNK is not set CONFIG_HIBERNATION=y CONFIG_PM_DEBUG=y CONFIG_PM_TRACE_RTC=y diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index 151f0d50e7e03..36d0c1e05e608 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -57,7 +57,7 @@ (1 << (X86_FEATURE_RETPOLINE_LFENCE & 31))) #endif -#ifdef CONFIG_RETHUNK +#ifdef CONFIG_MITIGATION_RETHUNK # define DISABLE_RETHUNK 0 #else # define DISABLE_RETHUNK (1 << (X86_FEATURE_RETHUNK & 31)) diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h index 09e2d026df333..dc31b13b87a0d 100644 --- a/arch/x86/include/asm/linkage.h +++ b/arch/x86/include/asm/linkage.h @@ -40,7 +40,7 @@ #ifdef __ASSEMBLY__ -#if defined(CONFIG_RETHUNK) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO) +#if defined(CONFIG_MITIGATION_RETHUNK) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO) #define RET jmp __x86_return_thunk #else /* CONFIG_MITIGATION_RETPOLINE */ #ifdef CONFIG_MITIGATION_SLS @@ -52,7 +52,7 @@ #else /* __ASSEMBLY__ */ -#if defined(CONFIG_RETHUNK) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO) +#if defined(CONFIG_MITIGATION_RETHUNK) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO) #define ASM_RET "jmp __x86_return_thunk\n\t" #else /* CONFIG_MITIGATION_RETPOLINE */ #ifdef CONFIG_MITIGATION_SLS diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 94c70832994f3..2c0679ebe9146 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -289,7 +289,7 @@ * where we have a stack but before any RET instruction. */ .macro __UNTRAIN_RET ibpb_feature, call_depth_insns -#if defined(CONFIG_RETHUNK) || defined(CONFIG_MITIGATION_IBPB_ENTRY) +#if defined(CONFIG_MITIGATION_RETHUNK) || defined(CONFIG_MITIGATION_IBPB_ENTRY) VALIDATE_UNRET_END ALTERNATIVE_3 "", \ CALL_UNTRAIN_RET, X86_FEATURE_UNRET, \ @@ -328,7 +328,7 @@ extern retpoline_thunk_t __x86_indirect_thunk_array[]; extern retpoline_thunk_t __x86_indirect_call_thunk_array[]; extern retpoline_thunk_t __x86_indirect_jump_thunk_array[]; -#ifdef CONFIG_RETHUNK +#ifdef CONFIG_MITIGATION_RETHUNK extern void __x86_return_thunk(void); #else static inline void __x86_return_thunk(void) {} diff --git a/arch/x86/include/asm/static_call.h b/arch/x86/include/asm/static_call.h index 343b722ccaf21..125c407e2abe6 100644 --- a/arch/x86/include/asm/static_call.h +++ b/arch/x86/include/asm/static_call.h @@ -46,7 +46,7 @@ #define ARCH_DEFINE_STATIC_CALL_TRAMP(name, func) \ __ARCH_DEFINE_STATIC_CALL_TRAMP(name, ".byte 0xe9; .long " #func " - (. 
+ 4)") -#ifdef CONFIG_RETHUNK +#ifdef CONFIG_MITIGATION_RETHUNK #define ARCH_DEFINE_STATIC_CALL_NULL_TRAMP(name) \ __ARCH_DEFINE_STATIC_CALL_TRAMP(name, "jmp __x86_return_thunk") #else diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index f5442d0d2ee9b..df91abea3420f 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -769,7 +769,7 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) } } -#ifdef CONFIG_RETHUNK +#ifdef CONFIG_MITIGATION_RETHUNK /* * Rewrite the compiler generated return thunk tail-calls. @@ -842,7 +842,7 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end) } #else void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } -#endif /* CONFIG_RETHUNK */ +#endif /* CONFIG_MITIGATION_RETHUNK */ #else /* !CONFIG_MITIGATION_RETPOLINE || !CONFIG_OBJTOOL */ diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c index 77a9316da4357..4eefaac64c6cb 100644 --- a/arch/x86/kernel/static_call.c +++ b/arch/x86/kernel/static_call.c @@ -172,7 +172,7 @@ void arch_static_call_transform(void *site, void *tramp, void *func, bool tail) } EXPORT_SYMBOL_GPL(arch_static_call_transform); -#ifdef CONFIG_RETHUNK +#ifdef CONFIG_MITIGATION_RETHUNK /* * This is called by apply_returns() to fix up static call trampolines, * specifically ARCH_DEFINE_STATIC_CALL_NULL_TRAMP which is recorded as diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index 67b52cbec6488..0045153ba2224 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -127,7 +127,7 @@ SYM_CODE_END(__x86_indirect_jump_thunk_array) #undef GEN #endif -#ifdef CONFIG_RETHUNK +#ifdef CONFIG_MITIGATION_RETHUNK /* * Be careful here: that label cannot really be removed because in @@ -386,4 +386,4 @@ SYM_CODE_START(__x86_return_thunk) SYM_CODE_END(__x86_return_thunk) EXPORT_SYMBOL(__x86_return_thunk) -#endif /* CONFIG_RETHUNK */ +#endif /* CONFIG_MITIGATION_RETHUNK */ diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index b272ca64cd758..c3f4cacad7b09 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -263,7 +263,7 @@ objtool-args-$(CONFIG_HAVE_OBJTOOL_NOP_MCOUNT) += --mnop endif objtool-args-$(CONFIG_UNWINDER_ORC) += --orc objtool-args-$(CONFIG_MITIGATION_RETPOLINE) += --retpoline -objtool-args-$(CONFIG_RETHUNK) += --rethunk +objtool-args-$(CONFIG_MITIGATION_RETHUNK) += --rethunk objtool-args-$(CONFIG_MITIGATION_SLS) += --sls objtool-args-$(CONFIG_STACK_VALIDATION) += --stackval objtool-args-$(CONFIG_HAVE_STATIC_CALL_INLINE) += --static-call diff --git a/tools/arch/x86/include/asm/disabled-features.h b/tools/arch/x86/include/asm/disabled-features.h index 151f0d50e7e03..36d0c1e05e608 100644 --- a/tools/arch/x86/include/asm/disabled-features.h +++ b/tools/arch/x86/include/asm/disabled-features.h @@ -57,7 +57,7 @@ (1 << (X86_FEATURE_RETPOLINE_LFENCE & 31))) #endif -#ifdef CONFIG_RETHUNK +#ifdef CONFIG_MITIGATION_RETHUNK # define DISABLE_RETHUNK 0 #else # define DISABLE_RETHUNK (1 << (X86_FEATURE_RETHUNK & 31)) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 84067f018f7ec..8440b7bb343cc 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -3980,7 +3980,7 @@ static int validate_retpoline(struct objtool_file *file) if (insn->type == INSN_RETURN) { if (opts.rethunk) { - WARN_INSN(insn, "'naked' return found in RETHUNK build"); + WARN_INSN(insn, "'naked' return found in MITIGATION_RETHUNK build"); } else continue; } else { -- GitLab From 
31c89007285d365aa36f71d8fb0701581c770a27 Mon Sep 17 00:00:00 2001 From: Audra Mitchell Date: Mon, 15 Jan 2024 12:08:22 -0500 Subject: [PATCH 0039/1519] workqueue.c: Increase workqueue name length Currently we limit the size of the workqueue name to 24 characters due to commit ecf6881ff349 ("workqueue: make workqueue->name[] fixed len") Increase the size to 32 characters and print a warning in the event the requested name is larger than the limit of 32 characters. Signed-off-by: Audra Mitchell Signed-off-by: Tejun Heo --- kernel/workqueue.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 76e60faed8923..8d9dec14b9bb8 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -108,7 +108,7 @@ enum { RESCUER_NICE_LEVEL = MIN_NICE, HIGHPRI_NICE_LEVEL = MIN_NICE, - WQ_NAME_LEN = 24, + WQ_NAME_LEN = 32, }; /* @@ -4666,6 +4666,7 @@ struct workqueue_struct *alloc_workqueue(const char *fmt, va_list args; struct workqueue_struct *wq; struct pool_workqueue *pwq; + int len; /* * Unbound && max_active == 1 used to imply ordered, which is no longer @@ -4692,9 +4693,12 @@ struct workqueue_struct *alloc_workqueue(const char *fmt, } va_start(args, max_active); - vsnprintf(wq->name, sizeof(wq->name), fmt, args); + len = vsnprintf(wq->name, sizeof(wq->name), fmt, args); va_end(args); + if (len >= WQ_NAME_LEN) + pr_warn_once("workqueue: name exceeds WQ_NAME_LEN. Truncating to: %s\n", wq->name); + max_active = max_active ?: WQ_DFL_ACTIVE; max_active = wq_clamp_max_active(max_active, flags, wq->name); -- GitLab From ab5e5b99a949b9f282c605d00557b2c727856485 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Tue, 16 Jan 2024 17:19:26 +0100 Subject: [PATCH 0040/1519] tools/workqueue: Add rescuers printing to wq_dump.py Retrieving rescuers information (e.g., affinity and name) is quite useful when debugging workqueues configurations. Add printing of such information to the existing wq_dump.py script. 
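
The truncation warning added to alloc_workqueue() in the previous patch relies on vsnprintf() returning the length the fully formatted string would have had, not the number of bytes actually stored. A minimal sketch of the same detection, with base and id as hypothetical inputs:

    /*
     * Sketch only: a return value >= the buffer size means the output
     * was truncated to fit, which is exactly the condition the patch
     * warns on with len >= WQ_NAME_LEN.
     */
    char name[WQ_NAME_LEN];
    int len = snprintf(name, sizeof(name), "%s/%d", base, id);

    if (len >= WQ_NAME_LEN)
            pr_warn_once("workqueue: name exceeds WQ_NAME_LEN. Truncating to: %s\n",
                         name);
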
Signed-off-by: Juri Lelli Signed-off-by: Tejun Heo --- tools/workqueue/wq_dump.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/tools/workqueue/wq_dump.py b/tools/workqueue/wq_dump.py index d0df5833f2c18..6da621989e210 100644 --- a/tools/workqueue/wq_dump.py +++ b/tools/workqueue/wq_dump.py @@ -175,3 +175,32 @@ for wq in list_for_each_entry('struct workqueue_struct', workqueues.address_of_( if wq.flags & WQ_UNBOUND: print(f' {wq.dfl_pwq.pool.id.value_():{max_pool_id_len}}', end='') print('') + +print('') +print('Workqueue -> rescuer') +print('=====================') +print(f'wq_unbound_cpumask={cpumask_str(wq_unbound_cpumask)}') +print('') +print('[ workqueue \ type unbound_cpumask rescuer pid cpumask]') + +for wq in list_for_each_entry('struct workqueue_struct', workqueues.address_of_(), 'list'): + print(f'{wq.name.string_().decode()[-24:]:24}', end='') + if wq.flags & WQ_UNBOUND: + if wq.flags & WQ_ORDERED: + print(' ordered ', end='') + else: + print(' unbound', end='') + if wq.unbound_attrs.affn_strict: + print(',S ', end='') + else: + print(' ', end='') + print(f' {cpumask_str(wq.unbound_attrs.cpumask):24}', end='') + else: + print(' percpu ', end='') + print(' ', end='') + + if wq.flags & WQ_MEM_RECLAIM: + print(f' {wq.rescuer.task.comm.string_().decode()[-24:]:24}', end='') + print(f' {wq.rescuer.task.pid.value_():5}', end='') + print(f' {cpumask_str(wq.rescuer.task.cpus_ptr)}', end='') + print('') -- GitLab From 85f0ab43f9de62a4b9c1b503b07f1c33e5a6d2ab Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Tue, 16 Jan 2024 17:19:27 +0100 Subject: [PATCH 0041/1519] kernel/workqueue: Bind rescuer to unbound cpumask for WQ_UNBOUND At the time they are created unbound workqueues rescuers currently use cpu_possible_mask as their affinity, but this can be too wide in case a workqueue unbound mask has been set as a subset of cpu_possible_mask. Make new rescuers use their associated workqueue unbound cpumask from the start. Signed-off-by: Juri Lelli Signed-off-by: Tejun Heo --- kernel/workqueue.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 8d9dec14b9bb8..ed442cefea7c4 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4652,7 +4652,10 @@ static int init_rescuer(struct workqueue_struct *wq) } wq->rescuer = rescuer; - kthread_bind_mask(rescuer->task, cpu_possible_mask); + if (wq->flags & WQ_UNBOUND) + kthread_bind_mask(rescuer->task, wq->unbound_attrs->cpumask); + else + kthread_bind_mask(rescuer->task, cpu_possible_mask); wake_up_process(rescuer->task); return 0; -- GitLab From 1a65a6d17cbc58e1aeffb2be962acce49efbef9c Mon Sep 17 00:00:00 2001 From: Xuewen Yan Date: Wed, 10 Jan 2024 11:27:24 +0800 Subject: [PATCH 0042/1519] workqueue: Add rcu lock check at the end of work item execution Currently the workqueue just checks the atomic and locking states after work execution ends. However, sometimes, a work item may not unlock rcu after acquiring rcu_read_lock(). And as a result, it would cause rcu stall, but the rcu stall warning can not dump the work func, because the work has finished. In order to quickly discover those works that do not call rcu_read_unlock() after rcu_read_lock(), add the rcu lock check. Use rcu_preempt_depth() to check the work's rcu status. Normally, this value is 0. If this value is bigger than 0, it means the work are still holding rcu lock. If so, print err info and the work func. tj: Reworded the description for clarity. Minor formatting tweak. 
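
The class of bug the new check catches looks like the sketch below, where some_error_path() is a hypothetical early exit: the handler returns with an RCU read-side critical section still open. On PREEMPT_RCU kernels rcu_preempt_depth() is then non-zero when process_one_work() regains control, and the added message names the offending function while that information is still at hand, instead of the bug surfacing later as an RCU stall with no culprit in the report.

    /* Hedged illustration of a work item that leaks rcu_read_lock(). */
    static void buggy_work_fn(struct work_struct *work)
    {
            rcu_read_lock();
            if (some_error_path())
                    return;         /* bug: missing rcu_read_unlock() */
            rcu_read_unlock();
    }
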
Signed-off-by: Xuewen Yan Reviewed-by: Lai Jiangshan Reviewed-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/workqueue.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ed442cefea7c4..aec3efbaaf934 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2640,11 +2640,12 @@ __acquires(&pool->lock) lock_map_release(&lockdep_map); lock_map_release(&pwq->wq->lockdep_map); - if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { - pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n" + if (unlikely(in_atomic() || lockdep_depth(current) > 0 || + rcu_preempt_depth() > 0)) { + pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d/%d\n" " last function: %ps\n", - current->comm, preempt_count(), task_pid_nr(current), - worker->current_func); + current->comm, preempt_count(), rcu_preempt_depth(), + task_pid_nr(current), worker->current_func); debug_show_held_locks(current); dump_stack(); } -- GitLab From 1982a2a02c9197436d4a8ea12f66bafab53f16a0 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Wed, 3 Jan 2024 16:06:32 +0100 Subject: [PATCH 0043/1519] xfrm: Clear low order bits of ->flowi4_tos in decode_session4(). Commit 23e7b1bfed61 ("xfrm: Don't accidentally set RTO_ONLINK in decode_session4()") fixed a problem where decode_session4() could erroneously set the RTO_ONLINK flag for IPv4 route lookups. This problem was reintroduced when decode_session4() was modified to use the flow dissector. Fix this by clearing again the two low order bits of ->flowi4_tos. Found by code inspection, compile tested only. Fixes: 7a0207094f1b ("xfrm: policy: replace session decode with flow dissector") Signed-off-by: Guillaume Nault Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 1b7e751597277..7351f32052dc0 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -3416,7 +3416,7 @@ decode_session4(const struct xfrm_flow_keys *flkeys, struct flowi *fl, bool reve } fl4->flowi4_proto = flkeys->basic.ip_proto; - fl4->flowi4_tos = flkeys->ip.tos; + fl4->flowi4_tos = flkeys->ip.tos & ~INET_ECN_MASK; } #if IS_ENABLED(CONFIG_IPV6) -- GitLab From 7bd20b6b87183db2ebf789bcf9d0aa6d06a0defb Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Fri, 19 Jan 2024 12:54:39 -0300 Subject: [PATCH 0044/1519] workqueue: mark power efficient workqueue as unbounded if nohz_full enabled A customer using nohz_full has experienced the following interruption: oslat-1004510 [018] timer_cancel: timer=0xffff90a7ca663cf8 oslat-1004510 [018] timer_expire_entry: timer=0xffff90a7ca663cf8 function=delayed_work_timer_fn now=4709188240 baseclk=4709188240 oslat-1004510 [018] workqueue_queue_work: work struct=0xffff90a7ca663cd8 function=fb_flashcursor workqueue=events_power_efficient req_cpu=8192 cpu=18 oslat-1004510 [018] workqueue_activate_work: work struct 0xffff90a7ca663cd8 oslat-1004510 [018] sched_wakeup: kworker/18:1:326 [120] CPU:018 oslat-1004510 [018] timer_expire_exit: timer=0xffff90a7ca663cf8 oslat-1004510 [018] irq_work_entry: vector=246 oslat-1004510 [018] irq_work_exit: vector=246 oslat-1004510 [018] tick_stop: success=0 dependency=SCHED oslat-1004510 [018] hrtimer_start: hrtimer=0xffff90a70009cb00 function=tick_sched_timer/0x0 ... 
oslat-1004510 [018] softirq_exit: vec=1 [action=TIMER] oslat-1004510 [018] softirq_entry: vec=7 [action=SCHED] oslat-1004510 [018] softirq_exit: vec=7 [action=SCHED] oslat-1004510 [018] tick_stop: success=0 dependency=SCHED oslat-1004510 [018] sched_switch: oslat:1004510 [120] R ==> kworker/18:1:326 [120] kworker/18:1-326 [018] workqueue_execute_start: work struct 0xffff90a7ca663cd8: function fb_flashcursor kworker/18:1-326 [018] workqueue_queue_work: work struct=0xffff9078f119eed0 function=drm_fb_helper_damage_work workqueue=events req_cpu=8192 cpu=18 kworker/18:1-326 [018] workqueue_activate_work: work struct 0xffff9078f119eed0 kworker/18:1-326 [018] timer_start: timer=0xffff90a7ca663cf8 function=delayed_work_timer_fn ... Set wq_power_efficient to true, in case nohz_full is enabled. This makes the power efficient workqueue be unbounded, which allows workqueue items there to be moved to HK CPUs. Signed-off-by: Marcelo Tosatti Signed-off-by: Tejun Heo --- kernel/workqueue.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index aec3efbaaf934..8ca65665efe96 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -6638,6 +6638,13 @@ void __init workqueue_init_early(void) wq_update_pod_attrs_buf = alloc_workqueue_attrs(); BUG_ON(!wq_update_pod_attrs_buf); + /* + * If nohz_full is enabled, set power efficient workqueue as unbound. + * This allows workqueue items to be moved to HK CPUs. + */ + if (housekeeping_enabled(HK_TYPE_TICK)) + wq_power_efficient = true; + /* initialize WQ_AFFN_SYSTEM pods */ pt->pod_cpus = kcalloc(1, sizeof(pt->pod_cpus[0]), GFP_KERNEL); pt->pod_node = kcalloc(1, sizeof(pt->pod_node[0]), GFP_KERNEL); -- GitLab From c5fed8ce65493f71611280f225826e7bd5e49791 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Sun, 24 Dec 2023 18:21:28 +0100 Subject: [PATCH 0045/1519] rust: upgrade to Rust 1.75.0 This is the next upgrade to the Rust toolchain, from 1.74.1 to 1.75.0 (i.e. the latest) [1]. See the upgrade policy [2] and the comments on the first upgrade in commit 3ed03f4da06e ("rust: upgrade to Rust 1.68.2"). # Unstable features The `const_maybe_uninit_zeroed` unstable feature [3] was stabilized in Rust 1.75.0, which we were using in the PHYLIB abstractions. The only unstable features allowed to be used outside the `kernel` crate are still `new_uninit,offset_of`, though other code to be upstreamed may increase the list. Please see [4] for details. # Other improvements Rust 1.75.0 stabilized `pointer_byte_offsets` [5] which we could potentially use as an alternative for `ptr_metadata` in the future. # Required changes For this upgrade, no changes were required (i.e. on our side). # `alloc` upgrade and reviewing The vast majority of changes are due to our `alloc` fork being upgraded at once. There are two kinds of changes to be aware of: the ones coming from upstream, which we should follow as closely as possible, and the updates needed in our added fallible APIs to keep them matching the newer infallible APIs coming from upstream. Instead of taking a look at the diff of this patch, an alternative approach is reviewing a diff of the changes between upstream `alloc` and the kernel's. This allows to easily inspect the kernel additions only, especially to check if the fallible methods we already have still match the infallible ones in the new version coming from upstream. Another approach is reviewing the changes introduced in the additions in the kernel fork between the two versions. 
This is useful to spot potentially unintended changes to our additions. To apply these approaches, one may follow steps similar to the following to generate a pair of patches that show the differences between upstream Rust and the kernel (for the subset of `alloc` we use) before and after applying this patch: # Get the difference with respect to the old version. git -C rust checkout $(linux/scripts/min-tool-version.sh rustc) git -C linux ls-tree -r --name-only HEAD -- rust/alloc | cut -d/ -f3- | grep -Fv README.md | xargs -IPATH cp rust/library/alloc/src/PATH linux/rust/alloc/PATH git -C linux diff --patch-with-stat --summary -R > old.patch git -C linux restore rust/alloc # Apply this patch. git -C linux am rust-upgrade.patch # Get the difference with respect to the new version. git -C rust checkout $(linux/scripts/min-tool-version.sh rustc) git -C linux ls-tree -r --name-only HEAD -- rust/alloc | cut -d/ -f3- | grep -Fv README.md | xargs -IPATH cp rust/library/alloc/src/PATH linux/rust/alloc/PATH git -C linux diff --patch-with-stat --summary -R > new.patch git -C linux restore rust/alloc Now one may check the `new.patch` to take a look at the additions (first approach) or at the difference between those two patches (second approach). For the latter, a side-by-side tool is recommended. Link: https://github.com/rust-lang/rust/blob/stable/RELEASES.md#version-1750-2023-12-28 [1] Link: https://rust-for-linux.com/rust-version-policy [2] Link: https://github.com/rust-lang/rust/issues/91850 [3] Link: https://github.com/Rust-for-Linux/linux/issues/2 [4] Link: https://github.com/rust-lang/rust/issues/96283 [5] Reviewed-by: Vincenzo Palazzo Reviewed-by: Martin Rodriguez Reboredo Tested-by: Boqun Feng Link: https://lore.kernel.org/r/20231224172128.271447-1-ojeda@kernel.org Signed-off-by: Miguel Ojeda --- Documentation/process/changes.rst | 2 +- rust/alloc/alloc.rs | 9 ++++++++- rust/alloc/boxed.rs | 20 ++++++++++++-------- rust/alloc/lib.rs | 7 ++++--- rust/alloc/raw_vec.rs | 19 +++++++++++++++---- rust/alloc/vec/mod.rs | 16 ++++++++++------ rust/kernel/lib.rs | 1 - scripts/min-tool-version.sh | 2 +- 8 files changed, 51 insertions(+), 25 deletions(-) diff --git a/Documentation/process/changes.rst b/Documentation/process/changes.rst index 50b3d1cb11159..eab7e2f8c1965 100644 --- a/Documentation/process/changes.rst +++ b/Documentation/process/changes.rst @@ -31,7 +31,7 @@ you probably needn't concern yourself with pcmciautils. ====================== =============== ======================================== GNU C 5.1 gcc --version Clang/LLVM (optional) 11.0.0 clang --version -Rust (optional) 1.74.1 rustc --version +Rust (optional) 1.75.0 rustc --version bindgen (optional) 0.65.1 bindgen --version GNU make 3.82 make --version bash 4.2 bash --version diff --git a/rust/alloc/alloc.rs b/rust/alloc/alloc.rs index 150e13750ff70..8a6be8c981731 100644 --- a/rust/alloc/alloc.rs +++ b/rust/alloc/alloc.rs @@ -379,13 +379,20 @@ const fn ct_error(_: Layout) -> ! { panic!("allocation failed"); } + #[inline] fn rt_error(layout: Layout) -> ! { unsafe { __rust_alloc_error_handler(layout.size(), layout.align()); } } - unsafe { core::intrinsics::const_eval_select((layout,), ct_error, rt_error) } + #[cfg(not(feature = "panic_immediate_abort"))] + unsafe { + core::intrinsics::const_eval_select((layout,), ct_error, rt_error) + } + + #[cfg(feature = "panic_immediate_abort")] + ct_error(layout) } // For alloc test `std::alloc::handle_alloc_error` can be used directly. 
diff --git a/rust/alloc/boxed.rs b/rust/alloc/boxed.rs index 9620eba172687..f5f40778a1937 100644 --- a/rust/alloc/boxed.rs +++ b/rust/alloc/boxed.rs @@ -161,7 +161,7 @@ use core::marker::Unsize; use core::mem::{self, SizedTypeProperties}; use core::ops::{ - CoerceUnsized, Deref, DerefMut, DispatchFromDyn, Generator, GeneratorState, Receiver, + CoerceUnsized, Coroutine, CoroutineState, Deref, DerefMut, DispatchFromDyn, Receiver, }; use core::pin::Pin; use core::ptr::{self, NonNull, Unique}; @@ -211,7 +211,7 @@ impl Box { /// ``` /// let five = Box::new(5); /// ``` - #[cfg(all(not(no_global_oom_handling)))] + #[cfg(not(no_global_oom_handling))] #[inline(always)] #[stable(feature = "rust1", since = "1.0.0")] #[must_use] @@ -2110,28 +2110,28 @@ fn as_mut(&mut self) -> &mut T { #[stable(feature = "pin", since = "1.33.0")] impl Unpin for Box where A: 'static {} -#[unstable(feature = "generator_trait", issue = "43122")] -impl + Unpin, R, A: Allocator> Generator for Box +#[unstable(feature = "coroutine_trait", issue = "43122")] +impl + Unpin, R, A: Allocator> Coroutine for Box where A: 'static, { type Yield = G::Yield; type Return = G::Return; - fn resume(mut self: Pin<&mut Self>, arg: R) -> GeneratorState { + fn resume(mut self: Pin<&mut Self>, arg: R) -> CoroutineState { G::resume(Pin::new(&mut *self), arg) } } -#[unstable(feature = "generator_trait", issue = "43122")] -impl, R, A: Allocator> Generator for Pin> +#[unstable(feature = "coroutine_trait", issue = "43122")] +impl, R, A: Allocator> Coroutine for Pin> where A: 'static, { type Yield = G::Yield; type Return = G::Return; - fn resume(mut self: Pin<&mut Self>, arg: R) -> GeneratorState { + fn resume(mut self: Pin<&mut Self>, arg: R) -> CoroutineState { G::resume((*self).as_mut(), arg) } } @@ -2448,4 +2448,8 @@ fn cause(&self) -> Option<&dyn core::error::Error> { fn source(&self) -> Option<&(dyn core::error::Error + 'static)> { core::error::Error::source(&**self) } + + fn provide<'b>(&'b self, request: &mut core::error::Request<'b>) { + core::error::Error::provide(&**self, request); + } } diff --git a/rust/alloc/lib.rs b/rust/alloc/lib.rs index 9c7ea73da1080..345cf5c9cf92d 100644 --- a/rust/alloc/lib.rs +++ b/rust/alloc/lib.rs @@ -80,6 +80,8 @@ not(no_sync), target_has_atomic = "ptr" ))] +#![cfg_attr(not(bootstrap), doc(rust_logo))] +#![cfg_attr(not(bootstrap), feature(rustdoc_internals))] #![no_std] #![needs_allocator] // Lints: @@ -115,7 +117,6 @@ #![feature(const_eval_select)] #![feature(const_maybe_uninit_as_mut_ptr)] #![feature(const_maybe_uninit_write)] -#![feature(const_maybe_uninit_zeroed)] #![feature(const_pin)] #![feature(const_refs_to_cell)] #![feature(const_size_of_val)] @@ -141,7 +142,7 @@ #![feature(maybe_uninit_uninit_array)] #![feature(maybe_uninit_uninit_array_transpose)] #![feature(pattern)] -#![feature(pointer_byte_offsets)] +#![feature(ptr_addr_eq)] #![feature(ptr_internals)] #![feature(ptr_metadata)] #![feature(ptr_sub_ptr)] @@ -168,7 +169,7 @@ // // Language features: // tidy-alphabetical-start -#![cfg_attr(not(test), feature(generator_trait))] +#![cfg_attr(not(test), feature(coroutine_trait))] #![cfg_attr(test, feature(panic_update_hook))] #![cfg_attr(test, feature(test))] #![feature(allocator_internals)] diff --git a/rust/alloc/raw_vec.rs b/rust/alloc/raw_vec.rs index a7425582a323f..f1b8cec8cc626 100644 --- a/rust/alloc/raw_vec.rs +++ b/rust/alloc/raw_vec.rs @@ -338,10 +338,13 @@ pub fn reserve_for_push(&mut self, len: usize) { /// The same as `reserve`, but returns on errors instead of panicking or aborting. 
pub fn try_reserve(&mut self, len: usize, additional: usize) -> Result<(), TryReserveError> { if self.needs_to_grow(len, additional) { - self.grow_amortized(len, additional) - } else { - Ok(()) + self.grow_amortized(len, additional)?; + } + unsafe { + // Inform the optimizer that the reservation has succeeded or wasn't needed + core::intrinsics::assume(!self.needs_to_grow(len, additional)); } + Ok(()) } /// The same as `reserve_for_push`, but returns on errors instead of panicking or aborting. @@ -378,7 +381,14 @@ pub fn try_reserve_exact( len: usize, additional: usize, ) -> Result<(), TryReserveError> { - if self.needs_to_grow(len, additional) { self.grow_exact(len, additional) } else { Ok(()) } + if self.needs_to_grow(len, additional) { + self.grow_exact(len, additional)?; + } + unsafe { + // Inform the optimizer that the reservation has succeeded or wasn't needed + core::intrinsics::assume(!self.needs_to_grow(len, additional)); + } + Ok(()) } /// Shrinks the buffer down to the specified capacity. If the given amount @@ -569,6 +579,7 @@ fn alloc_guard(alloc_size: usize) -> Result<(), TryReserveError> { // ensure that the code generation related to these panics is minimal as there's // only one location which panics rather than a bunch throughout the module. #[cfg(not(no_global_oom_handling))] +#[cfg_attr(not(feature = "panic_immediate_abort"), inline(never))] fn capacity_overflow() -> ! { panic!("capacity overflow"); } diff --git a/rust/alloc/vec/mod.rs b/rust/alloc/vec/mod.rs index 41ca71805ef09..0d95fd7ef3375 100644 --- a/rust/alloc/vec/mod.rs +++ b/rust/alloc/vec/mod.rs @@ -1376,7 +1376,7 @@ pub fn as_mut_slice(&mut self) -> &mut [T] { /// [`as_mut_ptr`]: Vec::as_mut_ptr /// [`as_ptr`]: Vec::as_ptr #[stable(feature = "vec_as_ptr", since = "1.37.0")] - #[cfg_attr(not(bootstrap), rustc_never_returns_null_ptr)] + #[rustc_never_returns_null_ptr] #[inline] pub fn as_ptr(&self) -> *const T { // We shadow the slice method of the same name to avoid going through @@ -1436,7 +1436,7 @@ pub fn as_ptr(&self) -> *const T { /// [`as_mut_ptr`]: Vec::as_mut_ptr /// [`as_ptr`]: Vec::as_ptr #[stable(feature = "vec_as_ptr", since = "1.37.0")] - #[cfg_attr(not(bootstrap), rustc_never_returns_null_ptr)] + #[rustc_never_returns_null_ptr] #[inline] pub fn as_mut_ptr(&mut self) -> *mut T { // We shadow the slice method of the same name to avoid going through @@ -1565,7 +1565,8 @@ pub unsafe fn set_len(&mut self, new_len: usize) { #[stable(feature = "rust1", since = "1.0.0")] pub fn swap_remove(&mut self, index: usize) -> T { #[cold] - #[inline(never)] + #[cfg_attr(not(feature = "panic_immediate_abort"), inline(never))] + #[track_caller] fn assert_failed(index: usize, len: usize) -> ! { panic!("swap_remove index (is {index}) should be < len (is {len})"); } @@ -1606,7 +1607,8 @@ fn assert_failed(index: usize, len: usize) -> ! { #[stable(feature = "rust1", since = "1.0.0")] pub fn insert(&mut self, index: usize, element: T) { #[cold] - #[inline(never)] + #[cfg_attr(not(feature = "panic_immediate_abort"), inline(never))] + #[track_caller] fn assert_failed(index: usize, len: usize) -> ! { panic!("insertion index (is {index}) should be <= len (is {len})"); } @@ -1667,7 +1669,7 @@ fn assert_failed(index: usize, len: usize) -> ! { #[track_caller] pub fn remove(&mut self, index: usize) -> T { #[cold] - #[inline(never)] + #[cfg_attr(not(feature = "panic_immediate_abort"), inline(never))] #[track_caller] fn assert_failed(index: usize, len: usize) -> ! 
{ panic!("removal index (is {index}) should be < len (is {len})"); @@ -2097,6 +2099,7 @@ pub fn pop(&mut self) -> Option { } else { unsafe { self.len -= 1; + core::intrinsics::assume(self.len < self.capacity()); Some(ptr::read(self.as_ptr().add(self.len()))) } } @@ -2299,7 +2302,8 @@ pub fn split_off(&mut self, at: usize) -> Self A: Clone, { #[cold] - #[inline(never)] + #[cfg_attr(not(feature = "panic_immediate_abort"), inline(never))] + #[track_caller] fn assert_failed(at: usize, len: usize) -> ! { panic!("`at` split index (is {at}) should be <= len (is {len})"); } diff --git a/rust/kernel/lib.rs b/rust/kernel/lib.rs index 7ac39874aeac3..cb2d024db51fe 100644 --- a/rust/kernel/lib.rs +++ b/rust/kernel/lib.rs @@ -14,7 +14,6 @@ #![no_std] #![feature(allocator_api)] #![feature(coerce_unsized)] -#![feature(const_maybe_uninit_zeroed)] #![feature(dispatch_from_dyn)] #![feature(new_uninit)] #![feature(offset_of)] diff --git a/scripts/min-tool-version.sh b/scripts/min-tool-version.sh index 9faa4d3d91e35..ef6f286a4d47f 100755 --- a/scripts/min-tool-version.sh +++ b/scripts/min-tool-version.sh @@ -33,7 +33,7 @@ llvm) fi ;; rustc) - echo 1.74.1 + echo 1.75.0 ;; bindgen) echo 0.65.1 -- GitLab From 6b1b2326b2bc3d6a90ec04815130d59ae57f3bc1 Mon Sep 17 00:00:00 2001 From: Charalampos Mitrodimas Date: Fri, 5 Jan 2024 01:29:30 +0000 Subject: [PATCH 0046/1519] rust: sync: `CondVar` rename "wait_list" to "wait_queue_head" Fields named "wait_list" usually are of type "struct list_head". To avoid confusion and because it is of type "Opaque" we are renaming "wait_list" to "wait_queue_head". Signed-off-by: Charalampos Mitrodimas Reviewed-by: Alice Ryhl Reviewed-by: Martin Rodriguez Reboredo Reviewed-by: Benno Lossin Link: https://lore.kernel.org/r/20240105012930.1426214-1-charmitro@posteo.net Signed-off-by: Miguel Ojeda --- rust/kernel/sync/condvar.rs | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/rust/kernel/sync/condvar.rs b/rust/kernel/sync/condvar.rs index f65e19d5a37c1..90108cc6da0b6 100644 --- a/rust/kernel/sync/condvar.rs +++ b/rust/kernel/sync/condvar.rs @@ -73,7 +73,7 @@ macro_rules! new_condvar { #[pin_data] pub struct CondVar { #[pin] - pub(crate) wait_list: Opaque, + pub(crate) wait_queue_head: Opaque, /// A condvar needs to be pinned because it contains a [`struct list_head`] that is /// self-referential, so it cannot be safely moved once it is initialised. @@ -96,7 +96,7 @@ pub fn new(name: &'static CStr, key: &'static LockClassKey) -> impl PinInit(&self, wait_state: u32, guard: &mut Guar // SAFETY: `wait` points to valid memory. unsafe { bindings::init_wait(wait.get()) }; - // SAFETY: Both `wait` and `wait_list` point to valid memory. + // SAFETY: Both `wait` and `wait_queue_head` point to valid memory. unsafe { - bindings::prepare_to_wait_exclusive(self.wait_list.get(), wait.get(), wait_state as _) + bindings::prepare_to_wait_exclusive( + self.wait_queue_head.get(), + wait.get(), + wait_state as _, + ) }; // SAFETY: No arguments, switches to another thread. guard.do_unlocked(|| unsafe { bindings::schedule() }); - // SAFETY: Both `wait` and `wait_list` point to valid memory. - unsafe { bindings::finish_wait(self.wait_list.get(), wait.get()) }; + // SAFETY: Both `wait` and `wait_queue_head` point to valid memory. + unsafe { bindings::finish_wait(self.wait_queue_head.get(), wait.get()) }; } /// Releases the lock and waits for a notification in uninterruptible mode. 
@@ -144,10 +148,10 @@ pub fn wait_interruptible(&self, guard: &mut Guard<'_, T, /// Calls the kernel function to notify the appropriate number of threads with the given flags. fn notify(&self, count: i32, flags: u32) { - // SAFETY: `wait_list` points to valid memory. + // SAFETY: `wait_queue_head` points to valid memory. unsafe { bindings::__wake_up( - self.wait_list.get(), + self.wait_queue_head.get(), bindings::TASK_NORMAL, count, flags as _, -- GitLab From fe12cfc17429a4ddfdf9e71711bd125b8854ed7d Mon Sep 17 00:00:00 2001 From: Jay Date: Tue, 9 Jan 2024 15:29:27 +0800 Subject: [PATCH 0047/1519] fs: fix a typo in attr.c The word "filesytem" should be "filesystem" Signed-off-by: Jay Link: https://lore.kernel.org/r/20240109072927.29626-1-merqqcury@gmail.com Signed-off-by: Christian Brauner --- fs/attr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/attr.c b/fs/attr.c index 5a13f0c8495fd..49d23b5dbab4b 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -352,7 +352,7 @@ int may_setattr(struct mnt_idmap *idmap, struct inode *inode, EXPORT_SYMBOL(may_setattr); /** - * notify_change - modify attributes of a filesytem object + * notify_change - modify attributes of a filesystem object * @idmap: idmap of the mount the inode was found from * @dentry: object affected * @attr: new attributes -- GitLab From 73f65b8b0325551110dedf8d27ac738bdc331802 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Mon, 8 Jan 2024 18:20:40 +0100 Subject: [PATCH 0048/1519] fs: Wrong function name in comment This comment refers to function mark_buffer_inode_dirty(), but the function is actually called mark_buffer_dirty_inode(), so fix the comment. Signed-off-by: Andreas Gruenbacher Link: https://lore.kernel.org/r/20240108172040.178173-1-agruenba@redhat.com Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- fs/buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/buffer.c b/fs/buffer.c index d3bcf601d3e5a..dcafee512089a 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -464,7 +464,7 @@ EXPORT_SYMBOL(mark_buffer_async_write); * a successful fsync(). For example, ext2 indirect blocks need to be * written back and waited upon before fsync() returns. * - * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(), + * The functions mark_buffer_dirty_inode(), fsync_inode_buffers(), * inode_has_buffers() and invalidate_inode_buffers() are provided for the * management of a list of dependent buffers at ->i_mapping->i_private_list. * -- GitLab From 6c8ac6e24eb04d1eebc442029d7a7528e5461cb8 Mon Sep 17 00:00:00 2001 From: David Disseldorp Date: Thu, 11 Jan 2024 17:22:40 +1100 Subject: [PATCH 0049/1519] initramfs: remove duplicate built-in __initramfs_start unpacking If initrd_start cpio extraction fails, CONFIG_BLK_DEV_RAM triggers fallback to initrd.image handling via populate_initrd_image(). The populate_initrd_image() call follows successful extraction of any built-in cpio archive at __initramfs_start, but currently performs built-in archive extraction a second time. Prior to commit b2a74d5f9d446 ("initramfs: remove clean_rootfs"), the second built-in initramfs unpack call was used to repopulate entries removed by clean_rootfs(), but it's no longer necessary now the contents of the previous extraction are retained. 
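A condensed sketch of the surrounding flow in init/initramfs.c (paraphrased and simplified, not the exact kernel code) shows why the removed call is redundant:

    /* Paraphrased flow, assuming CONFIG_BLK_DEV_RAM. */
    char *err;

    /* the built-in archive is always unpacked first */
    err = unpack_to_rootfs(__initramfs_start, __initramfs_size);
    if (err)
        panic("%s", err);

    if (initrd_start) {
        err = unpack_to_rootfs((char *)initrd_start,
                               initrd_end - initrd_start);
        if (err)
            /* built-in contents are already in place, no re-unpack */
            populate_initrd_image(err);
    }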
Signed-off-by: David Disseldorp Link: https://lore.kernel.org/r/20240111062240.9362-1-ddiss@suse.de Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner --- init/initramfs.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/init/initramfs.c b/init/initramfs.c index 76deb48c38cb1..d3c623dde01a8 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -679,8 +679,6 @@ static void __init populate_initrd_image(char *err) struct file *file; loff_t pos = 0; - unpack_to_rootfs(__initramfs_start, __initramfs_size); - printk(KERN_INFO "rootfs image is not initramfs (%s); looks like an initrd\n", err); file = filp_open("/initrd.image", O_WRONLY | O_CREAT, 0700); -- GitLab From 6b6ec4ca4e33d797f887cb2d7ecf365b652b338c Mon Sep 17 00:00:00 2001 From: Wen Yang Date: Wed, 10 Jan 2024 23:47:40 +0800 Subject: [PATCH 0050/1519] eventfd: add a BUILD_BUG_ON() to ensure consistency between EFD_SEMAPHORE and the uapi introduce a BUILD_BUG_ON to check that the EFD_SEMAPHORE is equal to its definition in the uapi file, just like EFD_CLOEXEC and EFD_NONBLOCK. Signed-off-by: Wen Yang Link: https://lore.kernel.org/r/tencent_0BAA2DEAF9208D49987457E6583F9BE79507@qq.com Cc: Alexander Viro Cc: Christian Brauner Cc: Jens Axboe Cc: Jan Kara Cc: Cc: Signed-off-by: Christian Brauner --- fs/eventfd.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/eventfd.c b/fs/eventfd.c index ad8186d47ba76..0252b71099fbc 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -383,6 +383,7 @@ static int do_eventfd(unsigned int count, int flags) /* Check the EFD_* constants for consistency. */ BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC); BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK); + BUILD_BUG_ON(EFD_SEMAPHORE != (1 << 0)); if (flags & ~EFD_FLAGS_SET) return -EINVAL; -- GitLab From de8a3207aed33283a560193095b156d3b73fc4f0 Mon Sep 17 00:00:00 2001 From: Kunwu Chan Date: Tue, 16 Jan 2024 17:11:37 +0800 Subject: [PATCH 0051/1519] buffer: Use KMEM_CACHE instead of kmem_cache_create() Use the new KMEM_CACHE() macro instead of direct kmem_cache_create to simplify the creation of SLAB caches. Signed-off-by: Kunwu Chan Link: https://lore.kernel.org/r/20240116091137.92375-1-chentao@kylinos.cn Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/buffer.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index dcafee512089a..b55dea034a5d8 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3121,12 +3121,8 @@ void __init buffer_init(void) unsigned long nrpages; int ret; - bh_cachep = kmem_cache_create("buffer_head", - sizeof(struct buffer_head), 0, - (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC| - SLAB_MEM_SPREAD), - NULL); - + bh_cachep = KMEM_CACHE(buffer_head, + SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD); /* * Limit the bh occupancy to 10% of ZONE_NORMAL */ -- GitLab From 8b3d838139bcd1e552f1899191f734264ce2a1a5 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Tue, 16 Jan 2024 15:53:35 +0800 Subject: [PATCH 0052/1519] fs: improve dump_mapping() robustness We met a kernel crash issue when running stress-ng testing, and the system crashes when printing the dentry name in dump_mapping(). Unable to handle kernel NULL pointer dereference at virtual address 0000000000000000 pc : dentry_name+0xd8/0x224 lr : pointer+0x22c/0x370 sp : ffff800025f134c0 ...... 
Call trace: dentry_name+0xd8/0x224 pointer+0x22c/0x370 vsnprintf+0x1ec/0x730 vscnprintf+0x2c/0x60 vprintk_store+0x70/0x234 vprintk_emit+0xe0/0x24c vprintk_default+0x3c/0x44 vprintk_func+0x84/0x2d0 printk+0x64/0x88 __dump_page+0x52c/0x530 dump_page+0x14/0x20 set_migratetype_isolate+0x110/0x224 start_isolate_page_range+0xc4/0x20c offline_pages+0x124/0x474 memory_block_offline+0x44/0xf4 memory_subsys_offline+0x3c/0x70 device_offline+0xf0/0x120 ...... The root cause is that one thread is doing page migration, and we use the target page's ->mapping field to save the 'anon_vma' pointer between page unmap and page move; at this point the target page is locked and its refcount is 1. Meanwhile, another stress-ng thread is performing memory hotplug, attempting to offline the target page that is being migrated. It discovers that the refcount of this target page is 1, preventing the offline operation, and thus proceeds to dump the page. However, page_mapping() of the target page may return an incorrect file mapping and crash the system in dump_mapping(), since the target page->mapping only saves the 'anon_vma' pointer without setting the PAGE_MAPPING_ANON flag. The page migration issue has been fixed by commit d1adb25df711 ("mm: migrate: fix getting incorrect page mapping during page migration"). In addition, Matthew suggested we should also improve dump_mapping()'s robustness to be resilient against this kind of kernel crash [1]. By checking the 'dentry.d_parent' and 'dentry.d_name.name' pointers used by dentry_name(), dump_mapping() now outputs the invalid dentry instead of crashing the system when this issue is reproduced again. [12211.189128] page:fffff7de047741c0 refcount:1 mapcount:0 mapping:ffff989117f55ea0 index:0x1 pfn:0x211dd07 [12211.189144] aops:0x0 ino:1 invalid dentry:74786574206e6870 [12211.189148] flags: 0x57ffffc0000001(locked|node=1|zone=2|lastcpupid=0x1fffff) [12211.189150] page_type: 0xffffffff() [12211.189153] raw: 0057ffffc0000001 0000000000000000 dead000000000122 ffff989117f55ea0 [12211.189154] raw: 0000000000000001 0000000000000001 00000001ffffffff 0000000000000000 [12211.189155] page dumped because: unmovable page [1] https://lore.kernel.org/all/ZXxn%2F0oixJxxAnpF@casper.infradead.org/ Suggested-by: Matthew Wilcox Signed-off-by: Baolin Wang Link: https://lore.kernel.org/r/937ab1f87328516821d39be672b6bc18861d9d3e.1705391420.git.baolin.wang@linux.alibaba.com Signed-off-by: Christian Brauner --- fs/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/inode.c b/fs/inode.c index 91048c4c9c9e7..6d0d542303638 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -588,7 +588,8 @@ void dump_mapping(const struct address_space *mapping) } dentry_ptr = container_of(dentry_first, struct dentry, d_u.d_alias); - if (get_kernel_nofault(dentry, dentry_ptr)) { + if (get_kernel_nofault(dentry, dentry_ptr) || + !dentry.d_parent || !dentry.d_name.name) { pr_warn("aops:%ps ino:%lx invalid dentry:%px\n", a_ops, ino, dentry_ptr); return; -- GitLab From 0f05ee447949cf1b72bd7c415c803ea7f7209403 Mon Sep 17 00:00:00 2001 From: Hu Yadi Date: Fri, 12 Jan 2024 15:40:59 +0800 Subject: [PATCH 0053/1519] selftests/filesystems: fix build error in overlayfs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit One build issue comes up due to both mount.h headers being included by dev_in_maps.c: In file included from dev_in_maps.c:10: /usr/include/sys/mount.h:35:3: error: expected identifier before numeric constant 35 | MS_RDONLY = 1, /* Mount read-only.
*/ | ^~~~~~~~~ In file included from dev_in_maps.c:13: Removing one of them solves the conflict, but another error comes up: dev_in_maps.c:170:6: error: implicit declaration of function ‘mount’ [-Werror=implicit-function-declaration] 170 | if (mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL) == -1) { | ^~~~~ cc1: all warnings being treated as errors Then, add a sys_mount() definition to solve it. After both changes above, dev_in_maps.c can be built correctly on my machine (gcc 10.2, glibc 2.32, kernel 5.10). Signed-off-by: Hu Yadi Link: https://lore.kernel.org/r/20240112074059.29673-1-hu.yadi@h3c.com Acked-by: Andrei Vagin Signed-off-by: Christian Brauner --- .../selftests/filesystems/overlayfs/dev_in_maps.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/filesystems/overlayfs/dev_in_maps.c b/tools/testing/selftests/filesystems/overlayfs/dev_in_maps.c index e19ab0e857091..759f86e7d263e 100644 --- a/tools/testing/selftests/filesystems/overlayfs/dev_in_maps.c +++ b/tools/testing/selftests/filesystems/overlayfs/dev_in_maps.c @@ -10,7 +10,6 @@ #include #include #include -#include #include #include #include @@ -32,7 +31,11 @@ static int sys_fsmount(int fd, unsigned int flags, unsigned int attr_flags) { return syscall(__NR_fsmount, fd, flags, attr_flags); } - +static int sys_mount(const char *src, const char *tgt, const char *fst, + unsigned long flags, const void *data) +{ + return syscall(__NR_mount, src, tgt, fst, flags, data); +} static int sys_move_mount(int from_dfd, const char *from_pathname, int to_dfd, const char *to_pathname, unsigned int flags) @@ -166,8 +169,7 @@ int main(int argc, char **argv) ksft_test_result_skip("unable to create a new mount namespace\n"); return 1; } - - if (mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL) == -1) { + if (sys_mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL) == -1) { pr_perror("mount"); return 1; } -- GitLab From 9e3f1c59367515e7e40675fe83645a131c05039d Mon Sep 17 00:00:00 2001 From: "Hu.Yadi" Date: Thu, 11 Jan 2024 19:32:29 +0800 Subject: [PATCH 0054/1519] selftests/move_mount_set_group: Make tests build with old libc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace SYS_ with __NR_. Using the __NR_ notation, provided by UAPI, is useful to build tests on systems without the SYS_ definitions.
Replace SYS_move_mount with __NR_move_mount Similar changes: commit 87129ef13603 ("selftests/landlock: Make tests build with old libc") Acked-by: Mickaël Salaün Signed-off-by: Hu.Yadi Link: https://lore.kernel.org/r/20240111113229.10820-1-hu.yadi@h3c.com Reviewed-by: Berlin Suggested-by: Jiao Signed-off-by: Christian Brauner --- .../move_mount_set_group/move_mount_set_group_test.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/move_mount_set_group/move_mount_set_group_test.c b/tools/testing/selftests/move_mount_set_group/move_mount_set_group_test.c index 50ed5d475dd13..bcf51d785a371 100644 --- a/tools/testing/selftests/move_mount_set_group/move_mount_set_group_test.c +++ b/tools/testing/selftests/move_mount_set_group/move_mount_set_group_test.c @@ -218,7 +218,7 @@ static bool move_mount_set_group_supported(void) if (mount(NULL, SET_GROUP_FROM, NULL, MS_SHARED, 0)) return -1; - ret = syscall(SYS_move_mount, AT_FDCWD, SET_GROUP_FROM, + ret = syscall(__NR_move_mount, AT_FDCWD, SET_GROUP_FROM, AT_FDCWD, SET_GROUP_TO, MOVE_MOUNT_SET_GROUP); umount2("/tmp", MNT_DETACH); @@ -363,7 +363,7 @@ TEST_F(move_mount_set_group, complex_sharing_copying) CLONE_VM | CLONE_FILES); ASSERT_GT(pid, 0); ASSERT_EQ(wait_for_pid(pid), 0); - ASSERT_EQ(syscall(SYS_move_mount, ca_from.mntfd, "", + ASSERT_EQ(syscall(__NR_move_mount, ca_from.mntfd, "", ca_to.mntfd, "", MOVE_MOUNT_SET_GROUP | MOVE_MOUNT_F_EMPTY_PATH | MOVE_MOUNT_T_EMPTY_PATH), 0); -- GitLab From 73fa7547c70b32cc69685f79be31135797734eb6 Mon Sep 17 00:00:00 2001 From: Rich Felker Date: Mon, 31 Aug 2020 11:32:08 -0400 Subject: [PATCH 0055/1519] vfs: add RWF_NOAPPEND flag for pwritev2 The pwrite function, originally defined by POSIX (thus the "p"), is defined to ignore O_APPEND and write at the offset passed as its argument. However, historically Linux honored O_APPEND if set and ignored the offset. This cannot be changed due to stability policy, but is documented in the man page as a bug. Now that there's a pwritev2 syscall providing a superset of the pwrite functionality that has a flags argument, the conforming behavior can be offered to userspace via a new flag. Since pwritev2 checks flag validity (in kiocb_set_rw_flags) and reports unknown ones with EOPNOTSUPP, callers will not get wrong behavior on old kernels that don't support the new flag; the error is reported and the caller can decide how to handle it. 
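As a userspace usage sketch (assuming libc headers new enough to declare pwritev2(); RWF_NOAPPEND is defined locally from the value in the uapi change below, and write_at() is a hypothetical helper name used only to show the calling convention):

    #define _GNU_SOURCE
    #include <errno.h>
    #include <sys/uio.h>

    #ifndef RWF_NOAPPEND
    #define RWF_NOAPPEND 0x00000020  /* value from the uapi header below */
    #endif

    static ssize_t write_at(int fd, void *buf, size_t len, off_t off)
    {
        struct iovec iov = { .iov_base = buf, .iov_len = len };
        ssize_t n = pwritev2(fd, &iov, 1, off, RWF_NOAPPEND);

        if (n < 0 && errno == EOPNOTSUPP) {
            /* old kernel: flag unknown; caller decides how to handle it */
        }
        return n;
    }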
Signed-off-by: Rich Felker Link: https://lore.kernel.org/r/20200831153207.GO3265@brightrain.aerifal.cx Reviewed-by: Jann Horn Signed-off-by: Christian Brauner --- include/linux/fs.h | 8 ++++++++ include/uapi/linux/fs.h | 5 ++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index ed5966a704951..4f7cfda29143e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3335,6 +3335,8 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags) return 0; if (unlikely(flags & ~RWF_SUPPORTED)) return -EOPNOTSUPP; + if (unlikely((flags & RWF_APPEND) && (flags & RWF_NOAPPEND))) + return -EINVAL; if (flags & RWF_NOWAIT) { if (!(ki->ki_filp->f_mode & FMODE_NOWAIT)) @@ -3345,6 +3347,12 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags) if (flags & RWF_SYNC) kiocb_flags |= IOCB_DSYNC; + if ((flags & RWF_NOAPPEND) && (ki->ki_flags & IOCB_APPEND)) { + if (IS_APPEND(file_inode(ki->ki_filp))) + return -EPERM; + ki->ki_flags &= ~IOCB_APPEND; + } + ki->ki_flags |= kiocb_flags; return 0; } diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 48ad69f7722e1..2203d3194b91a 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -301,9 +301,12 @@ typedef int __bitwise __kernel_rwf_t; /* per-IO O_APPEND */ #define RWF_APPEND ((__force __kernel_rwf_t)0x00000010) +/* per-IO negation of O_APPEND */ +#define RWF_NOAPPEND ((__force __kernel_rwf_t)0x00000020) + /* mask of flags supported by the kernel */ #define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\ - RWF_APPEND) + RWF_APPEND | RWF_NOAPPEND) /* Pagemap ioctl */ #define PAGEMAP_SCAN _IOWR('f', 16, struct pm_scan_arg) -- GitLab From 12f7900c575679af411aaa89340bfe3dc68b46b3 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 19 Jan 2024 04:33:39 +0800 Subject: [PATCH 0056/1519] writeback: move wb_wakeup_delayed definition to fs-writeback.c wb_wakeup_delayed() is only used in fs-writeback.c. Move it to fs-writeback.c after the definition of wb_wakeup() and make it static. Signed-off-by: Kemeng Shi Link: https://lore.kernel.org/r/20240118203339.764093-1-shikemeng@huaweicloud.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/fs-writeback.c | 25 +++++++++++++++++++++++++ include/linux/backing-dev.h | 1 - mm/backing-dev.c | 25 ------------------------- 3 files changed, 25 insertions(+), 26 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 3d84fcc471c60..e4f17c53ddfcf 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -141,6 +141,31 @@ static void wb_wakeup(struct bdi_writeback *wb) spin_unlock_irq(&wb->work_lock); } +/* + * This function is used when the first inode for this wb is marked dirty. It + * wakes-up the corresponding bdi thread which should then take care of the + * periodic background write-out of dirty inodes. Since the write-out would + * starts only 'dirty_writeback_interval' centisecs from now anyway, we just + * set up a timer which wakes the bdi thread up later. + * + * Note, we wouldn't bother setting up the timer, but this function is on the + * fast-path (used by '__mark_inode_dirty()'), so we save few context switches + * by delaying the wake-up. + * + * We have to be careful not to postpone flush work if it is scheduled for + * earlier. Thus we use queue_delayed_work().
+ */ +static void wb_wakeup_delayed(struct bdi_writeback *wb) +{ + unsigned long timeout; + + timeout = msecs_to_jiffies(dirty_writeback_interval * 10); + spin_lock_irq(&wb->work_lock); + if (test_bit(WB_registered, &wb->state)) + queue_delayed_work(bdi_wq, &wb->dwork, timeout); + spin_unlock_irq(&wb->work_lock); +} + static void finish_writeback_work(struct bdi_writeback *wb, struct wb_writeback_work *work) { diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 1a97277f99b1b..8e7af9a03b41d 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -38,7 +38,6 @@ struct backing_dev_info *bdi_alloc(int node_id); void wb_start_background_writeback(struct bdi_writeback *wb); void wb_workfn(struct work_struct *work); -void wb_wakeup_delayed(struct bdi_writeback *wb); void wb_wait_for_completion(struct wb_completion *done); diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 1e3447bccdb14..039dc74b505a8 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -372,31 +372,6 @@ static int __init default_bdi_init(void) } subsys_initcall(default_bdi_init); -/* - * This function is used when the first inode for this wb is marked dirty. It - * wakes-up the corresponding bdi thread which should then take care of the - * periodic background write-out of dirty inodes. Since the write-out would - * starts only 'dirty_writeback_interval' centisecs from now anyway, we just - * set up a timer which wakes the bdi thread up later. - * - * Note, we wouldn't bother setting up the timer, but this function is on the - * fast-path (used by '__mark_inode_dirty()'), so we save few context switches - * by delaying the wake-up. - * - * We have to be careful not to postpone flush work if it is scheduled for - * earlier. Thus we use queue_delayed_work(). - */ -void wb_wakeup_delayed(struct bdi_writeback *wb) -{ - unsigned long timeout; - - timeout = msecs_to_jiffies(dirty_writeback_interval * 10); - spin_lock_irq(&wb->work_lock); - if (test_bit(WB_registered, &wb->state)) - queue_delayed_work(bdi_wq, &wb->dwork, timeout); - spin_unlock_irq(&wb->work_lock); -} - static void wb_update_bandwidth_workfn(struct work_struct *work) { struct bdi_writeback *wb = container_of(to_delayed_work(work), -- GitLab From 3948abaa4e2be938ccdfc289385a27342fb13d43 Mon Sep 17 00:00:00 2001 From: Nikita Zhandarovich Date: Fri, 19 Jan 2024 07:39:06 -0800 Subject: [PATCH 0057/1519] do_sys_name_to_handle(): use kzalloc() to fix kernel-infoleak syzbot identified a kernel information leak vulnerability in do_sys_name_to_handle() and issued the following report [1]. [1] "BUG: KMSAN: kernel-infoleak in instrument_copy_to_user include/linux/instrumented.h:114 [inline] BUG: KMSAN: kernel-infoleak in _copy_to_user+0xbc/0x100 lib/usercopy.c:40 instrument_copy_to_user include/linux/instrumented.h:114 [inline] _copy_to_user+0xbc/0x100 lib/usercopy.c:40 copy_to_user include/linux/uaccess.h:191 [inline] do_sys_name_to_handle fs/fhandle.c:73 [inline] __do_sys_name_to_handle_at fs/fhandle.c:112 [inline] __se_sys_name_to_handle_at+0x949/0xb10 fs/fhandle.c:94 __x64_sys_name_to_handle_at+0xe4/0x140 fs/fhandle.c:94 ... 
Uninit was created at: slab_post_alloc_hook+0x129/0xa70 mm/slab.h:768 slab_alloc_node mm/slub.c:3478 [inline] __kmem_cache_alloc_node+0x5c9/0x970 mm/slub.c:3517 __do_kmalloc_node mm/slab_common.c:1006 [inline] __kmalloc+0x121/0x3c0 mm/slab_common.c:1020 kmalloc include/linux/slab.h:604 [inline] do_sys_name_to_handle fs/fhandle.c:39 [inline] __do_sys_name_to_handle_at fs/fhandle.c:112 [inline] __se_sys_name_to_handle_at+0x441/0xb10 fs/fhandle.c:94 __x64_sys_name_to_handle_at+0xe4/0x140 fs/fhandle.c:94 ... Bytes 18-19 of 20 are uninitialized Memory access of size 20 starts at ffff888128a46380 Data copied to user address 0000000020000240" Per Chuck Lever's suggestion, use kzalloc() instead of kmalloc() to solve the problem. Fixes: 990d6c2d7aee ("vfs: Add name to file handle conversion support") Suggested-by: Chuck Lever III Reported-and-tested-by: Signed-off-by: Nikita Zhandarovich Link: https://lore.kernel.org/r/20240119153906.4367-1-n.zhandarovich@fintech.ru Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/fhandle.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fhandle.c b/fs/fhandle.c index 18b3ba8dc8ead..57a12614addfd 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -36,7 +36,7 @@ static long do_sys_name_to_handle(const struct path *path, if (f_handle.handle_bytes > MAX_HANDLE_SZ) return -EINVAL; - handle = kmalloc(sizeof(struct file_handle) + f_handle.handle_bytes, + handle = kzalloc(sizeof(struct file_handle) + f_handle.handle_bytes, GFP_KERNEL); if (!handle) return -ENOMEM; -- GitLab From 6727980b67852dae6b82a97e24fbadbdd218c033 Mon Sep 17 00:00:00 2001 From: Marcos Paulo de Souza Date: Fri, 12 Jan 2024 14:43:50 -0300 Subject: [PATCH 0058/1519] kselftests: lib.mk: Add TEST_GEN_MODS_DIR variable Add TEST_GEN_MODS_DIR variable for kselftests. It can point to a directory containing kernel modules that will be used by selftest scripts. The modules are built as external modules for the running kernel. As a result they are always binary compatible and the same tests can be used for older or newer kernels. The build requires "kernel-devel" package to be installed. For example, in the upstream sources, the rpm devel package is produced by "make rpm-pkg" The modules can be built independently by make -C tools/testing/selftests/livepatch/ or they will be automatically built before running the tests via make -C tools/testing/selftests/livepatch/ run_tests Note that they are _not_ built when running the standalone tests by calling, for example, ./test-state.sh. Along with TEST_GEN_MODS_DIR, it was necessary to create a new install rule. INSTALL_MODS_RULE is needed because INSTALL_SINGLE_RULE would copy the entire TEST_GEN_MODS_DIR directory to the destination, even the files created by Kbuild to compile the modules. The new install rule copies only the .ko files, as we would expect the gen_tar to work. Reviewed-by: Joe Lawrence Reviewed-by: Petr Mladek Signed-off-by: Marcos Paulo de Souza Signed-off-by: Shuah Khan --- Documentation/dev-tools/kselftest.rst | 4 ++++ tools/testing/selftests/lib.mk | 26 +++++++++++++++++++++----- 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/Documentation/dev-tools/kselftest.rst b/Documentation/dev-tools/kselftest.rst index ab376b316c36d..7f3582a67318b 100644 --- a/Documentation/dev-tools/kselftest.rst +++ b/Documentation/dev-tools/kselftest.rst @@ -245,6 +245,10 @@ Contributing new tests (details) TEST_PROGS, TEST_GEN_PROGS mean it is the executable tested by default. 
+ TEST_GEN_MODS_DIR should be used by tests that require modules to be built + before the test starts. The variable will contain the name of the directory + containing the modules. + TEST_CUSTOM_PROGS should be used by tests that require custom build rules and prevent common build rule use. diff --git a/tools/testing/selftests/lib.mk b/tools/testing/selftests/lib.mk index aa646e0661f36..0d8b7db927151 100644 --- a/tools/testing/selftests/lib.mk +++ b/tools/testing/selftests/lib.mk @@ -54,11 +54,15 @@ endif # TEST_PROGS are for test shell scripts. # TEST_CUSTOM_PROGS and TEST_PROGS will be run by common run_tests # and install targets. Common clean doesn't touch them. +# TEST_GEN_MODS_DIR is used to specify a directory with modules to be built +# before the test executes. These modules are cleaned on the clean target as well. TEST_GEN_PROGS := $(patsubst %,$(OUTPUT)/%,$(TEST_GEN_PROGS)) TEST_GEN_PROGS_EXTENDED := $(patsubst %,$(OUTPUT)/%,$(TEST_GEN_PROGS_EXTENDED)) TEST_GEN_FILES := $(patsubst %,$(OUTPUT)/%,$(TEST_GEN_FILES)) +TEST_GEN_MODS_DIR := $(patsubst %,$(OUTPUT)/%,$(TEST_GEN_MODS_DIR)) -all: $(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED) $(TEST_GEN_FILES) +all: $(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED) $(TEST_GEN_FILES) \ + $(if $(TEST_GEN_MODS_DIR),gen_mods_dir) define RUN_TESTS BASE_DIR="$(selfdir)"; \ @@ -71,8 +75,8 @@ endef run_tests: all ifdef building_out_of_srctree - @if [ "X$(TEST_PROGS)$(TEST_PROGS_EXTENDED)$(TEST_FILES)" != "X" ]; then \ - rsync -aq --copy-unsafe-links $(TEST_PROGS) $(TEST_PROGS_EXTENDED) $(TEST_FILES) $(OUTPUT); \ + @if [ "X$(TEST_PROGS)$(TEST_PROGS_EXTENDED)$(TEST_FILES)$(TEST_GEN_MODS_DIR)" != "X" ]; then \ + rsync -aq --copy-unsafe-links $(TEST_PROGS) $(TEST_PROGS_EXTENDED) $(TEST_FILES) $(TEST_GEN_MODS_DIR) $(OUTPUT); \ fi @if [ "X$(TEST_PROGS)" != "X" ]; then \ $(call RUN_TESTS, $(TEST_GEN_PROGS) $(TEST_CUSTOM_PROGS) \ @@ -84,11 +88,22 @@ else @$(call RUN_TESTS, $(TEST_GEN_PROGS) $(TEST_CUSTOM_PROGS) $(TEST_PROGS)) endif +gen_mods_dir: + $(Q)$(MAKE) -C $(TEST_GEN_MODS_DIR) + +clean_mods_dir: + $(Q)$(MAKE) -C $(TEST_GEN_MODS_DIR) clean + define INSTALL_SINGLE_RULE $(if $(INSTALL_LIST),@mkdir -p $(INSTALL_PATH)) $(if $(INSTALL_LIST),rsync -a --copy-unsafe-links $(INSTALL_LIST) $(INSTALL_PATH)/) endef +define INSTALL_MODS_RULE + $(if $(INSTALL_LIST),@mkdir -p $(INSTALL_PATH)/$(INSTALL_LIST)) + $(if $(INSTALL_LIST),rsync -a --copy-unsafe-links $(INSTALL_LIST)/*.ko $(INSTALL_PATH)/$(INSTALL_LIST)) +endef + define INSTALL_RULE $(eval INSTALL_LIST = $(TEST_PROGS)) $(INSTALL_SINGLE_RULE) $(eval INSTALL_LIST = $(TEST_PROGS_EXTENDED)) $(INSTALL_SINGLE_RULE) @@ -97,6 +112,7 @@ define INSTALL_RULE $(eval INSTALL_LIST = $(TEST_CUSTOM_PROGS)) $(INSTALL_SINGLE_RULE) $(eval INSTALL_LIST = $(TEST_GEN_PROGS_EXTENDED)) $(INSTALL_SINGLE_RULE) $(eval INSTALL_LIST = $(TEST_GEN_FILES)) $(INSTALL_SINGLE_RULE) + $(eval INSTALL_LIST = $(notdir $(TEST_GEN_MODS_DIR))) $(INSTALL_MODS_RULE) $(eval INSTALL_LIST = $(wildcard config settings)) $(INSTALL_SINGLE_RULE) endef @@ -122,7 +138,7 @@ define CLEAN $(RM) -r $(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED) $(TEST_GEN_FILES) $(EXTRA_CLEAN) endef -clean: +clean: $(if $(TEST_GEN_MODS_DIR),clean_mods_dir) $(CLEAN) # Enables to extend CFLAGS and LDFLAGS from command line, e.g. 
@@ -153,4 +169,4 @@ $(OUTPUT)/%:%.S $(LINK.S) $^ $(LDLIBS) -o $@ endif -.PHONY: run_tests all clean install emit_tests +.PHONY: run_tests all clean install emit_tests gen_mods_dir clean_mods_dir -- GitLab From c4bbe83d27c2446a033cc0381c3fb6be5e8c41c7 Mon Sep 17 00:00:00 2001 From: Marcos Paulo de Souza Date: Fri, 12 Jan 2024 14:43:51 -0300 Subject: [PATCH 0059/1519] livepatch: Move tests from lib/livepatch to selftests/livepatch The modules are being moved from lib/livepatch to tools/testing/selftests/livepatch/test_modules. Moving the code allows writing more complex tests, for example userspace C code that calls a livepatched kernel function. The modules are now built as out-of-tree modules, but being part of the kernel source means they will be maintained. Another advantage of moving the code is being able to easily change, debug and rebuild the tests by running make on the selftests/livepatch directory, which is not currently possible since the modules in lib/livepatch are built and installed using the "modules" target. The current approach also keeps the ability to execute the tests manually by executing the scripts inside the selftests/livepatch directory, as is currently supported. If the modules are modified, they need to be rebuilt before running the scripts though. The modules are built before running the selftests when using the kselftest invocations: make kselftest TARGETS=livepatch or make -C tools/testing/selftests/livepatch run_tests Having the modules built as out-of-tree modules requires replacing the currently used 'modprobe' with 'insmod' and adapting the test scripts that check the kernel message buffer. Now it is possible to compile only the modules by running: make -C tools/testing/selftests/livepatch/ This way the test modules and other test programs can be built in order to be packaged if so desired. As there aren't any modules being built in lib/livepatch anymore, remove the TEST_LIVEPATCH Kconfig option and its references. Note: "make gen_tar" packages the pre-built binaries into the tarball. It means that it will store the test modules pre-built for the kernel running on the build host. Note that these modules need not be binary compatible with a kernel built from the same sources. But the same is true for other packaged selftest binaries. The entire kernel sources are needed for rebuilding the selftests on another system.
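For reference, the moved test modules are plain kernel modules; a minimal skeleton of the shape they share (illustrative only, not one of the actual test modules; the names are made up):

    #include <linux/module.h>
    #include <linux/printk.h>

    static int __init test_klp_example_init(void)
    {
        pr_info("test_klp_example: loaded\n");
        return 0;
    }

    static void __exit test_klp_example_exit(void)
    {
        pr_info("test_klp_example: unloaded\n");
    }

    module_init(test_klp_example_init);
    module_exit(test_klp_example_exit);
    MODULE_LICENSE("GPL");
    MODULE_DESCRIPTION("Hypothetical livepatch selftest module skeleton");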
Reviewed-by: Joe Lawrence Reviewed-by: Petr Mladek Signed-off-by: Marcos Paulo de Souza Acked-by: Alexander Gordeev Signed-off-by: Shuah Khan --- MAINTAINERS | 1 - arch/s390/configs/debug_defconfig | 1 - arch/s390/configs/defconfig | 1 - lib/Kconfig.debug | 22 -------- lib/Makefile | 2 - lib/livepatch/Makefile | 14 ------ tools/testing/selftests/livepatch/Makefile | 1 + tools/testing/selftests/livepatch/README | 25 +++++++--- tools/testing/selftests/livepatch/config | 1 - .../testing/selftests/livepatch/functions.sh | 34 +++++-------- .../selftests/livepatch/test-callbacks.sh | 50 +++++++++---------- .../selftests/livepatch/test-ftrace.sh | 6 +-- .../selftests/livepatch/test-livepatch.sh | 10 ++-- .../selftests/livepatch/test-shadow-vars.sh | 2 +- .../testing/selftests/livepatch/test-state.sh | 18 +++---- .../testing/selftests/livepatch/test-sysfs.sh | 6 +-- .../selftests/livepatch/test_modules/Makefile | 19 +++++++ .../test_modules}/test_klp_atomic_replace.c | 0 .../test_modules}/test_klp_callbacks_busy.c | 0 .../test_modules}/test_klp_callbacks_demo.c | 0 .../test_modules}/test_klp_callbacks_demo2.c | 0 .../test_modules}/test_klp_callbacks_mod.c | 0 .../test_modules}/test_klp_livepatch.c | 0 .../test_modules}/test_klp_shadow_vars.c | 0 .../livepatch/test_modules}/test_klp_state.c | 0 .../livepatch/test_modules}/test_klp_state2.c | 0 .../livepatch/test_modules}/test_klp_state3.c | 0 27 files changed, 98 insertions(+), 115 deletions(-) delete mode 100644 lib/livepatch/Makefile create mode 100644 tools/testing/selftests/livepatch/test_modules/Makefile rename {lib/livepatch => tools/testing/selftests/livepatch/test_modules}/test_klp_atomic_replace.c (100%) rename {lib/livepatch => tools/testing/selftests/livepatch/test_modules}/test_klp_callbacks_busy.c (100%) rename {lib/livepatch => tools/testing/selftests/livepatch/test_modules}/test_klp_callbacks_demo.c (100%) rename {lib/livepatch => tools/testing/selftests/livepatch/test_modules}/test_klp_callbacks_demo2.c (100%) rename {lib/livepatch => tools/testing/selftests/livepatch/test_modules}/test_klp_callbacks_mod.c (100%) rename {lib/livepatch => tools/testing/selftests/livepatch/test_modules}/test_klp_livepatch.c (100%) rename {lib/livepatch => tools/testing/selftests/livepatch/test_modules}/test_klp_shadow_vars.c (100%) rename {lib/livepatch => tools/testing/selftests/livepatch/test_modules}/test_klp_state.c (100%) rename {lib/livepatch => tools/testing/selftests/livepatch/test_modules}/test_klp_state2.c (100%) rename {lib/livepatch => tools/testing/selftests/livepatch/test_modules}/test_klp_state3.c (100%) diff --git a/MAINTAINERS b/MAINTAINERS index 8d1052fa6a692..ad5bec15bf0fc 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12510,7 +12510,6 @@ F: arch/powerpc/include/asm/livepatch.h F: include/linux/livepatch.h F: kernel/livepatch/ F: kernel/module/livepatch.c -F: lib/livepatch/ F: samples/livepatch/ F: tools/testing/selftests/livepatch/ diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index cae2dd34fbb49..f9c44d392125b 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -886,4 +886,3 @@ CONFIG_ATOMIC64_SELFTEST=y CONFIG_STRING_SELFTEST=y CONFIG_TEST_BITOPS=m CONFIG_TEST_BPF=m -CONFIG_TEST_LIVEPATCH=m diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index 42b988873e544..ef78734b235c4 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -815,4 +815,3 @@ CONFIG_KPROBES_SANITY_TEST=m CONFIG_PERCPU_TEST=m 
CONFIG_ATOMIC64_SELFTEST=y CONFIG_TEST_BPF=m -CONFIG_TEST_LIVEPATCH=m diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 975a07f9f1cc0..c95d52853bf97 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2857,28 +2857,6 @@ config TEST_MEMCAT_P If unsure, say N. -config TEST_LIVEPATCH - tristate "Test livepatching" - default n - depends on DYNAMIC_DEBUG - depends on LIVEPATCH - depends on m - help - Test kernel livepatching features for correctness. The tests will - load test modules that will be livepatched in various scenarios. - - To run all the livepatching tests: - - make -C tools/testing/selftests TARGETS=livepatch run_tests - - Alternatively, individual tests may be invoked: - - tools/testing/selftests/livepatch/test-callbacks.sh - tools/testing/selftests/livepatch/test-livepatch.sh - tools/testing/selftests/livepatch/test-shadow-vars.sh - - If unsure, say N. - config TEST_OBJAGG tristate "Perform selftest on object aggreration manager" default n diff --git a/lib/Makefile b/lib/Makefile index 6b09731d8e619..95ed57f377fd9 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -134,8 +134,6 @@ endif obj-$(CONFIG_TEST_FPU) += test_fpu.o CFLAGS_test_fpu.o += $(FPU_CFLAGS) -obj-$(CONFIG_TEST_LIVEPATCH) += livepatch/ - # Some KUnit files (hooks.o) need to be built-in even when KUnit is a module, # so we can't just use obj-$(CONFIG_KUNIT). ifdef CONFIG_KUNIT diff --git a/lib/livepatch/Makefile b/lib/livepatch/Makefile deleted file mode 100644 index dcc912b3478fe..0000000000000 --- a/lib/livepatch/Makefile +++ /dev/null @@ -1,14 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -# -# Makefile for livepatch test code. - -obj-$(CONFIG_TEST_LIVEPATCH) += test_klp_atomic_replace.o \ - test_klp_callbacks_demo.o \ - test_klp_callbacks_demo2.o \ - test_klp_callbacks_busy.o \ - test_klp_callbacks_mod.o \ - test_klp_livepatch.o \ - test_klp_shadow_vars.o \ - test_klp_state.o \ - test_klp_state2.o \ - test_klp_state3.o diff --git a/tools/testing/selftests/livepatch/Makefile b/tools/testing/selftests/livepatch/Makefile index 02fadc9d55e0f..119e2bbebe5d0 100644 --- a/tools/testing/selftests/livepatch/Makefile +++ b/tools/testing/selftests/livepatch/Makefile @@ -1,5 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 +TEST_GEN_MODS_DIR := test_modules TEST_PROGS_EXTENDED := functions.sh TEST_PROGS := \ test-livepatch.sh \ diff --git a/tools/testing/selftests/livepatch/README b/tools/testing/selftests/livepatch/README index 0942dd5826f87..d2035dd64a2be 100644 --- a/tools/testing/selftests/livepatch/README +++ b/tools/testing/selftests/livepatch/README @@ -13,23 +13,36 @@ the message buffer for only the duration of each individual test.) Config ------ -Set these config options and their prerequisites: +Set the CONFIG_LIVEPATCH=y option and its prerequisites. -CONFIG_LIVEPATCH=y -CONFIG_TEST_LIVEPATCH=m +Building the tests +------------------ + +To only build the tests without running them, run: + + % make -C tools/testing/selftests/livepatch + +The command above will compile all test modules and test programs, making them +ready to be packaged if so desired. Running the tests ----------------- -Test kernel modules are built as part of lib/ (make modules) and need to -be installed (make modules_install) as the test scripts will modprobe -them. +Test kernel modules are built before running the livepatch selftests. The +modules are located under the test_modules directory, and are built as out-of-tree +modules.
This is especially useful since the same sources can be built and +tested on systems with different kABI, ensuring the tests are backwards +compatible. The modules will be loaded by the test scripts using insmod. To run the livepatch selftests, from the top of the kernel source tree: % make -C tools/testing/selftests TARGETS=livepatch run_tests +or + + % make kselftest TARGETS=livepatch + Adding tests ------------ diff --git a/tools/testing/selftests/livepatch/config b/tools/testing/selftests/livepatch/config index ad23100cb27c8..e88bf518a23ab 100644 --- a/tools/testing/selftests/livepatch/config +++ b/tools/testing/selftests/livepatch/config @@ -1,3 +1,2 @@ CONFIG_LIVEPATCH=y CONFIG_DYNAMIC_DEBUG=y -CONFIG_TEST_LIVEPATCH=m diff --git a/tools/testing/selftests/livepatch/functions.sh b/tools/testing/selftests/livepatch/functions.sh index c8416c54b4637..e60cf09491a66 100644 --- a/tools/testing/selftests/livepatch/functions.sh +++ b/tools/testing/selftests/livepatch/functions.sh @@ -127,16 +127,14 @@ function loop_until() { done } -function assert_mod() { - local mod="$1" - - modprobe --dry-run "$mod" &>/dev/null -} - function is_livepatch_mod() { local mod="$1" - if [[ $(modinfo "$mod" | awk '/^livepatch:/{print $NF}') == "Y" ]]; then + if [[ ! -f "test_modules/$mod.ko" ]]; then + die "Can't find \"test_modules/$mod.ko\", try \"make\"" + fi + + if [[ $(modinfo "test_modules/$mod.ko" | awk '/^livepatch:/{print $NF}') == "Y" ]]; then return 0 fi @@ -146,9 +144,9 @@ function is_livepatch_mod() { function __load_mod() { local mod="$1"; shift - local msg="% modprobe $mod $*" + local msg="% insmod test_modules/$mod.ko $*" log "${msg%% }" - ret=$(modprobe "$mod" "$@" 2>&1) + ret=$(insmod "test_modules/$mod.ko" "$@" 2>&1) if [[ "$ret" != "" ]]; then die "$ret" fi @@ -161,13 +159,10 @@ function __load_mod() { # load_mod(modname, params) - load a kernel module # modname - module name to load -# params - module parameters to pass to modprobe +# params - module parameters to pass to insmod function load_mod() { local mod="$1"; shift - assert_mod "$mod" || - skip "unable to load module ${mod}, verify CONFIG_TEST_LIVEPATCH=m and run self-tests as root" - is_livepatch_mod "$mod" && die "use load_lp() to load the livepatch module $mod" @@ -177,13 +172,10 @@ function load_mod() { # load_lp_nowait(modname, params) - load a kernel module with a livepatch # but do not wait on until the transition finishes # modname - module name to load -# params - module parameters to pass to modprobe +# params - module parameters to pass to insmod function load_lp_nowait() { local mod="$1"; shift - assert_mod "$mod" || - skip "unable to load module ${mod}, verify CONFIG_TEST_LIVEPATCH=m and run self-tests as root" - is_livepatch_mod "$mod" || die "module $mod is not a livepatch" @@ -196,7 +188,7 @@ function load_lp_nowait() { # load_lp(modname, params) - load a kernel module with a livepatch # modname - module name to load -# params - module parameters to pass to modprobe +# params - module parameters to pass to insmod function load_lp() { local mod="$1"; shift @@ -209,13 +201,13 @@ function load_lp() { # load_failing_mod(modname, params) - load a kernel module, expect to fail # modname - module name to load -# params - module parameters to pass to modprobe +# params - module parameters to pass to insmod function load_failing_mod() { local mod="$1"; shift - local msg="% modprobe $mod $*" + local msg="% insmod test_modules/$mod.ko $*" log "${msg%% }" - ret=$(modprobe "$mod" "$@" 2>&1) + ret=$(insmod "test_modules/$mod.ko"
"$@" 2>&1) if [[ "$ret" == "" ]]; then die "$mod unexpectedly loaded" fi diff --git a/tools/testing/selftests/livepatch/test-callbacks.sh b/tools/testing/selftests/livepatch/test-callbacks.sh index 90b26dbb26262..32b150e25b10b 100755 --- a/tools/testing/selftests/livepatch/test-callbacks.sh +++ b/tools/testing/selftests/livepatch/test-callbacks.sh @@ -34,9 +34,9 @@ disable_lp $MOD_LIVEPATCH unload_lp $MOD_LIVEPATCH unload_mod $MOD_TARGET -check_result "% modprobe $MOD_TARGET +check_result "% insmod test_modules/$MOD_TARGET.ko $MOD_TARGET: ${MOD_TARGET}_init -% modprobe $MOD_LIVEPATCH +% insmod test_modules/$MOD_LIVEPATCH.ko livepatch: enabling patch '$MOD_LIVEPATCH' livepatch: '$MOD_LIVEPATCH': initializing patching transition $MOD_LIVEPATCH: pre_patch_callback: vmlinux @@ -81,7 +81,7 @@ disable_lp $MOD_LIVEPATCH unload_lp $MOD_LIVEPATCH unload_mod $MOD_TARGET -check_result "% modprobe $MOD_LIVEPATCH +check_result "% insmod test_modules/$MOD_LIVEPATCH.ko livepatch: enabling patch '$MOD_LIVEPATCH' livepatch: '$MOD_LIVEPATCH': initializing patching transition $MOD_LIVEPATCH: pre_patch_callback: vmlinux @@ -89,7 +89,7 @@ livepatch: '$MOD_LIVEPATCH': starting patching transition livepatch: '$MOD_LIVEPATCH': completing patching transition $MOD_LIVEPATCH: post_patch_callback: vmlinux livepatch: '$MOD_LIVEPATCH': patching complete -% modprobe $MOD_TARGET +% insmod test_modules/$MOD_TARGET.ko livepatch: applying patch '$MOD_LIVEPATCH' to loading module '$MOD_TARGET' $MOD_LIVEPATCH: pre_patch_callback: $MOD_TARGET -> [MODULE_STATE_COMING] Full formed, running module_init $MOD_LIVEPATCH: post_patch_callback: $MOD_TARGET -> [MODULE_STATE_COMING] Full formed, running module_init @@ -129,9 +129,9 @@ unload_mod $MOD_TARGET disable_lp $MOD_LIVEPATCH unload_lp $MOD_LIVEPATCH -check_result "% modprobe $MOD_TARGET +check_result "% insmod test_modules/$MOD_TARGET.ko $MOD_TARGET: ${MOD_TARGET}_init -% modprobe $MOD_LIVEPATCH +% insmod test_modules/$MOD_LIVEPATCH.ko livepatch: enabling patch '$MOD_LIVEPATCH' livepatch: '$MOD_LIVEPATCH': initializing patching transition $MOD_LIVEPATCH: pre_patch_callback: vmlinux @@ -177,7 +177,7 @@ unload_mod $MOD_TARGET disable_lp $MOD_LIVEPATCH unload_lp $MOD_LIVEPATCH -check_result "% modprobe $MOD_LIVEPATCH +check_result "% insmod test_modules/$MOD_LIVEPATCH.ko livepatch: enabling patch '$MOD_LIVEPATCH' livepatch: '$MOD_LIVEPATCH': initializing patching transition $MOD_LIVEPATCH: pre_patch_callback: vmlinux @@ -185,7 +185,7 @@ livepatch: '$MOD_LIVEPATCH': starting patching transition livepatch: '$MOD_LIVEPATCH': completing patching transition $MOD_LIVEPATCH: post_patch_callback: vmlinux livepatch: '$MOD_LIVEPATCH': patching complete -% modprobe $MOD_TARGET +% insmod test_modules/$MOD_TARGET.ko livepatch: applying patch '$MOD_LIVEPATCH' to loading module '$MOD_TARGET' $MOD_LIVEPATCH: pre_patch_callback: $MOD_TARGET -> [MODULE_STATE_COMING] Full formed, running module_init $MOD_LIVEPATCH: post_patch_callback: $MOD_TARGET -> [MODULE_STATE_COMING] Full formed, running module_init @@ -219,7 +219,7 @@ load_lp $MOD_LIVEPATCH disable_lp $MOD_LIVEPATCH unload_lp $MOD_LIVEPATCH -check_result "% modprobe $MOD_LIVEPATCH +check_result "% insmod test_modules/$MOD_LIVEPATCH.ko livepatch: enabling patch '$MOD_LIVEPATCH' livepatch: '$MOD_LIVEPATCH': initializing patching transition $MOD_LIVEPATCH: pre_patch_callback: vmlinux @@ -254,9 +254,9 @@ load_mod $MOD_TARGET load_failing_mod $MOD_LIVEPATCH pre_patch_ret=-19 unload_mod $MOD_TARGET -check_result "% modprobe $MOD_TARGET +check_result 
"% insmod test_modules/$MOD_TARGET.ko $MOD_TARGET: ${MOD_TARGET}_init -% modprobe $MOD_LIVEPATCH pre_patch_ret=-19 +% insmod test_modules/$MOD_LIVEPATCH.ko pre_patch_ret=-19 livepatch: enabling patch '$MOD_LIVEPATCH' livepatch: '$MOD_LIVEPATCH': initializing patching transition test_klp_callbacks_demo: pre_patch_callback: vmlinux @@ -265,7 +265,7 @@ livepatch: failed to enable patch '$MOD_LIVEPATCH' livepatch: '$MOD_LIVEPATCH': canceling patching transition, going to unpatch livepatch: '$MOD_LIVEPATCH': completing unpatching transition livepatch: '$MOD_LIVEPATCH': unpatching complete -modprobe: ERROR: could not insert '$MOD_LIVEPATCH': No such device +insmod: ERROR: could not insert module test_modules/$MOD_LIVEPATCH.ko: No such device % rmmod $MOD_TARGET $MOD_TARGET: ${MOD_TARGET}_exit" @@ -295,7 +295,7 @@ load_failing_mod $MOD_TARGET disable_lp $MOD_LIVEPATCH unload_lp $MOD_LIVEPATCH -check_result "% modprobe $MOD_LIVEPATCH +check_result "% insmod test_modules/$MOD_LIVEPATCH.ko livepatch: enabling patch '$MOD_LIVEPATCH' livepatch: '$MOD_LIVEPATCH': initializing patching transition $MOD_LIVEPATCH: pre_patch_callback: vmlinux @@ -304,12 +304,12 @@ livepatch: '$MOD_LIVEPATCH': completing patching transition $MOD_LIVEPATCH: post_patch_callback: vmlinux livepatch: '$MOD_LIVEPATCH': patching complete % echo -19 > /sys/module/$MOD_LIVEPATCH/parameters/pre_patch_ret -% modprobe $MOD_TARGET +% insmod test_modules/$MOD_TARGET.ko livepatch: applying patch '$MOD_LIVEPATCH' to loading module '$MOD_TARGET' $MOD_LIVEPATCH: pre_patch_callback: $MOD_TARGET -> [MODULE_STATE_COMING] Full formed, running module_init livepatch: pre-patch callback failed for object '$MOD_TARGET' livepatch: patch '$MOD_LIVEPATCH' failed for module '$MOD_TARGET', refusing to load module '$MOD_TARGET' -modprobe: ERROR: could not insert '$MOD_TARGET': No such device +insmod: ERROR: could not insert module test_modules/$MOD_TARGET.ko: No such device % echo 0 > /sys/kernel/livepatch/$MOD_LIVEPATCH/enabled livepatch: '$MOD_LIVEPATCH': initializing unpatching transition $MOD_LIVEPATCH: pre_unpatch_callback: vmlinux @@ -340,11 +340,11 @@ disable_lp $MOD_LIVEPATCH unload_lp $MOD_LIVEPATCH unload_mod $MOD_TARGET_BUSY -check_result "% modprobe $MOD_TARGET_BUSY block_transition=N +check_result "% insmod test_modules/$MOD_TARGET_BUSY.ko block_transition=N $MOD_TARGET_BUSY: ${MOD_TARGET_BUSY}_init $MOD_TARGET_BUSY: busymod_work_func enter $MOD_TARGET_BUSY: busymod_work_func exit -% modprobe $MOD_LIVEPATCH +% insmod test_modules/$MOD_LIVEPATCH.ko livepatch: enabling patch '$MOD_LIVEPATCH' livepatch: '$MOD_LIVEPATCH': initializing patching transition $MOD_LIVEPATCH: pre_patch_callback: vmlinux @@ -354,7 +354,7 @@ livepatch: '$MOD_LIVEPATCH': completing patching transition $MOD_LIVEPATCH: post_patch_callback: vmlinux $MOD_LIVEPATCH: post_patch_callback: $MOD_TARGET_BUSY -> [MODULE_STATE_LIVE] Normal state livepatch: '$MOD_LIVEPATCH': patching complete -% modprobe $MOD_TARGET +% insmod test_modules/$MOD_TARGET.ko livepatch: applying patch '$MOD_LIVEPATCH' to loading module '$MOD_TARGET' $MOD_LIVEPATCH: pre_patch_callback: $MOD_TARGET -> [MODULE_STATE_COMING] Full formed, running module_init $MOD_LIVEPATCH: post_patch_callback: $MOD_TARGET -> [MODULE_STATE_COMING] Full formed, running module_init @@ -421,16 +421,16 @@ disable_lp $MOD_LIVEPATCH unload_lp $MOD_LIVEPATCH unload_mod $MOD_TARGET_BUSY -check_result "% modprobe $MOD_TARGET_BUSY block_transition=Y +check_result "% insmod test_modules/$MOD_TARGET_BUSY.ko block_transition=Y 
$MOD_TARGET_BUSY: ${MOD_TARGET_BUSY}_init $MOD_TARGET_BUSY: busymod_work_func enter -% modprobe $MOD_LIVEPATCH +% insmod test_modules/$MOD_LIVEPATCH.ko livepatch: enabling patch '$MOD_LIVEPATCH' livepatch: '$MOD_LIVEPATCH': initializing patching transition $MOD_LIVEPATCH: pre_patch_callback: vmlinux $MOD_LIVEPATCH: pre_patch_callback: $MOD_TARGET_BUSY -> [MODULE_STATE_LIVE] Normal state livepatch: '$MOD_LIVEPATCH': starting patching transition -% modprobe $MOD_TARGET +% insmod test_modules/$MOD_TARGET.ko livepatch: applying patch '$MOD_LIVEPATCH' to loading module '$MOD_TARGET' $MOD_LIVEPATCH: pre_patch_callback: $MOD_TARGET -> [MODULE_STATE_COMING] Full formed, running module_init $MOD_TARGET: ${MOD_TARGET}_init @@ -467,7 +467,7 @@ disable_lp $MOD_LIVEPATCH unload_lp $MOD_LIVEPATCH2 unload_lp $MOD_LIVEPATCH -check_result "% modprobe $MOD_LIVEPATCH +check_result "% insmod test_modules/$MOD_LIVEPATCH.ko livepatch: enabling patch '$MOD_LIVEPATCH' livepatch: '$MOD_LIVEPATCH': initializing patching transition $MOD_LIVEPATCH: pre_patch_callback: vmlinux @@ -475,7 +475,7 @@ livepatch: '$MOD_LIVEPATCH': starting patching transition livepatch: '$MOD_LIVEPATCH': completing patching transition $MOD_LIVEPATCH: post_patch_callback: vmlinux livepatch: '$MOD_LIVEPATCH': patching complete -% modprobe $MOD_LIVEPATCH2 +% insmod test_modules/$MOD_LIVEPATCH2.ko livepatch: enabling patch '$MOD_LIVEPATCH2' livepatch: '$MOD_LIVEPATCH2': initializing patching transition $MOD_LIVEPATCH2: pre_patch_callback: vmlinux @@ -523,7 +523,7 @@ disable_lp $MOD_LIVEPATCH2 unload_lp $MOD_LIVEPATCH2 unload_lp $MOD_LIVEPATCH -check_result "% modprobe $MOD_LIVEPATCH +check_result "% insmod test_modules/$MOD_LIVEPATCH.ko livepatch: enabling patch '$MOD_LIVEPATCH' livepatch: '$MOD_LIVEPATCH': initializing patching transition $MOD_LIVEPATCH: pre_patch_callback: vmlinux @@ -531,7 +531,7 @@ livepatch: '$MOD_LIVEPATCH': starting patching transition livepatch: '$MOD_LIVEPATCH': completing patching transition $MOD_LIVEPATCH: post_patch_callback: vmlinux livepatch: '$MOD_LIVEPATCH': patching complete -% modprobe $MOD_LIVEPATCH2 replace=1 +% insmod test_modules/$MOD_LIVEPATCH2.ko replace=1 livepatch: enabling patch '$MOD_LIVEPATCH2' livepatch: '$MOD_LIVEPATCH2': initializing patching transition $MOD_LIVEPATCH2: pre_patch_callback: vmlinux diff --git a/tools/testing/selftests/livepatch/test-ftrace.sh b/tools/testing/selftests/livepatch/test-ftrace.sh index 825540a5194d4..730218bce99c5 100755 --- a/tools/testing/selftests/livepatch/test-ftrace.sh +++ b/tools/testing/selftests/livepatch/test-ftrace.sh @@ -35,7 +35,7 @@ disable_lp $MOD_LIVEPATCH unload_lp $MOD_LIVEPATCH check_result "livepatch: kernel.ftrace_enabled = 0 -% modprobe $MOD_LIVEPATCH +% insmod test_modules/$MOD_LIVEPATCH.ko livepatch: enabling patch '$MOD_LIVEPATCH' livepatch: '$MOD_LIVEPATCH': initializing patching transition livepatch: failed to register ftrace handler for function 'cmdline_proc_show' (-16) @@ -44,9 +44,9 @@ livepatch: failed to enable patch '$MOD_LIVEPATCH' livepatch: '$MOD_LIVEPATCH': canceling patching transition, going to unpatch livepatch: '$MOD_LIVEPATCH': completing unpatching transition livepatch: '$MOD_LIVEPATCH': unpatching complete -modprobe: ERROR: could not insert '$MOD_LIVEPATCH': Device or resource busy +insmod: ERROR: could not insert module test_modules/$MOD_LIVEPATCH.ko: Device or resource busy livepatch: kernel.ftrace_enabled = 1 -% modprobe $MOD_LIVEPATCH +% insmod test_modules/$MOD_LIVEPATCH.ko livepatch: enabling patch '$MOD_LIVEPATCH' 
livepatch: '$MOD_LIVEPATCH': initializing patching transition livepatch: '$MOD_LIVEPATCH': starting patching transition diff --git a/tools/testing/selftests/livepatch/test-livepatch.sh b/tools/testing/selftests/livepatch/test-livepatch.sh index 5fe79ac34be12..e3455a6b11589 100755 --- a/tools/testing/selftests/livepatch/test-livepatch.sh +++ b/tools/testing/selftests/livepatch/test-livepatch.sh @@ -31,7 +31,7 @@ if [[ "$(cat /proc/cmdline)" == "$MOD_LIVEPATCH: this has been live patched" ]] die "livepatch kselftest(s) failed" fi -check_result "% modprobe $MOD_LIVEPATCH +check_result "% insmod test_modules/$MOD_LIVEPATCH.ko livepatch: enabling patch '$MOD_LIVEPATCH' livepatch: '$MOD_LIVEPATCH': initializing patching transition livepatch: '$MOD_LIVEPATCH': starting patching transition @@ -75,14 +75,14 @@ unload_lp $MOD_LIVEPATCH grep 'live patched' /proc/cmdline > /dev/kmsg grep 'live patched' /proc/meminfo > /dev/kmsg -check_result "% modprobe $MOD_LIVEPATCH +check_result "% insmod test_modules/$MOD_LIVEPATCH.ko livepatch: enabling patch '$MOD_LIVEPATCH' livepatch: '$MOD_LIVEPATCH': initializing patching transition livepatch: '$MOD_LIVEPATCH': starting patching transition livepatch: '$MOD_LIVEPATCH': completing patching transition livepatch: '$MOD_LIVEPATCH': patching complete $MOD_LIVEPATCH: this has been live patched -% modprobe $MOD_REPLACE replace=0 +% insmod test_modules/$MOD_REPLACE.ko replace=0 livepatch: enabling patch '$MOD_REPLACE' livepatch: '$MOD_REPLACE': initializing patching transition livepatch: '$MOD_REPLACE': starting patching transition @@ -135,14 +135,14 @@ unload_lp $MOD_REPLACE grep 'live patched' /proc/cmdline > /dev/kmsg grep 'live patched' /proc/meminfo > /dev/kmsg -check_result "% modprobe $MOD_LIVEPATCH +check_result "% insmod test_modules/$MOD_LIVEPATCH.ko livepatch: enabling patch '$MOD_LIVEPATCH' livepatch: '$MOD_LIVEPATCH': initializing patching transition livepatch: '$MOD_LIVEPATCH': starting patching transition livepatch: '$MOD_LIVEPATCH': completing patching transition livepatch: '$MOD_LIVEPATCH': patching complete $MOD_LIVEPATCH: this has been live patched -% modprobe $MOD_REPLACE replace=1 +% insmod test_modules/$MOD_REPLACE.ko replace=1 livepatch: enabling patch '$MOD_REPLACE' livepatch: '$MOD_REPLACE': initializing patching transition livepatch: '$MOD_REPLACE': starting patching transition diff --git a/tools/testing/selftests/livepatch/test-shadow-vars.sh b/tools/testing/selftests/livepatch/test-shadow-vars.sh index e04cb354f56b2..1218c155bffea 100755 --- a/tools/testing/selftests/livepatch/test-shadow-vars.sh +++ b/tools/testing/selftests/livepatch/test-shadow-vars.sh @@ -16,7 +16,7 @@ start_test "basic shadow variable API" load_mod $MOD_TEST unload_mod $MOD_TEST -check_result "% modprobe $MOD_TEST +check_result "% insmod test_modules/$MOD_TEST.ko $MOD_TEST: klp_shadow_get(obj=PTR1, id=0x1234) = PTR0 $MOD_TEST: got expected NULL result $MOD_TEST: shadow_ctor: PTR3 -> PTR2 diff --git a/tools/testing/selftests/livepatch/test-state.sh b/tools/testing/selftests/livepatch/test-state.sh index 38656721c958b..10a52ac061850 100755 --- a/tools/testing/selftests/livepatch/test-state.sh +++ b/tools/testing/selftests/livepatch/test-state.sh @@ -19,7 +19,7 @@ load_lp $MOD_LIVEPATCH disable_lp $MOD_LIVEPATCH unload_lp $MOD_LIVEPATCH -check_result "% modprobe $MOD_LIVEPATCH +check_result "% insmod test_modules/$MOD_LIVEPATCH.ko livepatch: enabling patch '$MOD_LIVEPATCH' livepatch: '$MOD_LIVEPATCH': initializing patching transition $MOD_LIVEPATCH: pre_patch_callback: 
vmlinux @@ -51,7 +51,7 @@ unload_lp $MOD_LIVEPATCH disable_lp $MOD_LIVEPATCH2 unload_lp $MOD_LIVEPATCH2 -check_result "% modprobe $MOD_LIVEPATCH +check_result "% insmod test_modules/$MOD_LIVEPATCH.ko livepatch: enabling patch '$MOD_LIVEPATCH' livepatch: '$MOD_LIVEPATCH': initializing patching transition $MOD_LIVEPATCH: pre_patch_callback: vmlinux @@ -61,7 +61,7 @@ livepatch: '$MOD_LIVEPATCH': completing patching transition $MOD_LIVEPATCH: post_patch_callback: vmlinux $MOD_LIVEPATCH: fix_console_loglevel: fixing console_loglevel livepatch: '$MOD_LIVEPATCH': patching complete -% modprobe $MOD_LIVEPATCH2 +% insmod test_modules/$MOD_LIVEPATCH2.ko livepatch: enabling patch '$MOD_LIVEPATCH2' livepatch: '$MOD_LIVEPATCH2': initializing patching transition $MOD_LIVEPATCH2: pre_patch_callback: vmlinux @@ -96,7 +96,7 @@ disable_lp $MOD_LIVEPATCH2 unload_lp $MOD_LIVEPATCH2 unload_lp $MOD_LIVEPATCH3 -check_result "% modprobe $MOD_LIVEPATCH2 +check_result "% insmod test_modules/$MOD_LIVEPATCH2.ko livepatch: enabling patch '$MOD_LIVEPATCH2' livepatch: '$MOD_LIVEPATCH2': initializing patching transition $MOD_LIVEPATCH2: pre_patch_callback: vmlinux @@ -106,7 +106,7 @@ livepatch: '$MOD_LIVEPATCH2': completing patching transition $MOD_LIVEPATCH2: post_patch_callback: vmlinux $MOD_LIVEPATCH2: fix_console_loglevel: fixing console_loglevel livepatch: '$MOD_LIVEPATCH2': patching complete -% modprobe $MOD_LIVEPATCH3 +% insmod test_modules/$MOD_LIVEPATCH3.ko livepatch: enabling patch '$MOD_LIVEPATCH3' livepatch: '$MOD_LIVEPATCH3': initializing patching transition $MOD_LIVEPATCH3: pre_patch_callback: vmlinux @@ -117,7 +117,7 @@ $MOD_LIVEPATCH3: post_patch_callback: vmlinux $MOD_LIVEPATCH3: fix_console_loglevel: taking over the console_loglevel change livepatch: '$MOD_LIVEPATCH3': patching complete % rmmod $MOD_LIVEPATCH2 -% modprobe $MOD_LIVEPATCH2 +% insmod test_modules/$MOD_LIVEPATCH2.ko livepatch: enabling patch '$MOD_LIVEPATCH2' livepatch: '$MOD_LIVEPATCH2': initializing patching transition $MOD_LIVEPATCH2: pre_patch_callback: vmlinux @@ -149,7 +149,7 @@ load_failing_mod $MOD_LIVEPATCH disable_lp $MOD_LIVEPATCH2 unload_lp $MOD_LIVEPATCH2 -check_result "% modprobe $MOD_LIVEPATCH2 +check_result "% insmod test_modules/$MOD_LIVEPATCH2.ko livepatch: enabling patch '$MOD_LIVEPATCH2' livepatch: '$MOD_LIVEPATCH2': initializing patching transition $MOD_LIVEPATCH2: pre_patch_callback: vmlinux @@ -159,9 +159,9 @@ livepatch: '$MOD_LIVEPATCH2': completing patching transition $MOD_LIVEPATCH2: post_patch_callback: vmlinux $MOD_LIVEPATCH2: fix_console_loglevel: fixing console_loglevel livepatch: '$MOD_LIVEPATCH2': patching complete -% modprobe $MOD_LIVEPATCH +% insmod test_modules/$MOD_LIVEPATCH.ko livepatch: Livepatch patch ($MOD_LIVEPATCH) is not compatible with the already installed livepatches. 
-modprobe: ERROR: could not insert '$MOD_LIVEPATCH': Invalid argument +insmod: ERROR: could not insert module test_modules/$MOD_LIVEPATCH.ko: Invalid parameters % echo 0 > /sys/kernel/livepatch/$MOD_LIVEPATCH2/enabled livepatch: '$MOD_LIVEPATCH2': initializing unpatching transition $MOD_LIVEPATCH2: pre_unpatch_callback: vmlinux diff --git a/tools/testing/selftests/livepatch/test-sysfs.sh b/tools/testing/selftests/livepatch/test-sysfs.sh index 7f76f280189a0..6c646afa7395e 100755 --- a/tools/testing/selftests/livepatch/test-sysfs.sh +++ b/tools/testing/selftests/livepatch/test-sysfs.sh @@ -27,7 +27,7 @@ disable_lp $MOD_LIVEPATCH unload_lp $MOD_LIVEPATCH -check_result "% modprobe $MOD_LIVEPATCH +check_result "% insmod test_modules/$MOD_LIVEPATCH.ko livepatch: enabling patch '$MOD_LIVEPATCH' livepatch: '$MOD_LIVEPATCH': initializing patching transition livepatch: '$MOD_LIVEPATCH': starting patching transition @@ -56,7 +56,7 @@ check_sysfs_value "$MOD_LIVEPATCH" "$MOD_TARGET/patched" "0" disable_lp $MOD_LIVEPATCH unload_lp $MOD_LIVEPATCH -check_result "% modprobe test_klp_callbacks_demo +check_result "% insmod test_modules/test_klp_callbacks_demo.ko livepatch: enabling patch 'test_klp_callbacks_demo' livepatch: 'test_klp_callbacks_demo': initializing patching transition test_klp_callbacks_demo: pre_patch_callback: vmlinux @@ -64,7 +64,7 @@ livepatch: 'test_klp_callbacks_demo': starting patching transition livepatch: 'test_klp_callbacks_demo': completing patching transition test_klp_callbacks_demo: post_patch_callback: vmlinux livepatch: 'test_klp_callbacks_demo': patching complete -% modprobe test_klp_callbacks_mod +% insmod test_modules/test_klp_callbacks_mod.ko livepatch: applying patch 'test_klp_callbacks_demo' to loading module 'test_klp_callbacks_mod' test_klp_callbacks_demo: pre_patch_callback: test_klp_callbacks_mod -> [MODULE_STATE_COMING] Full formed, running module_init test_klp_callbacks_demo: post_patch_callback: test_klp_callbacks_mod -> [MODULE_STATE_COMING] Full formed, running module_init diff --git a/tools/testing/selftests/livepatch/test_modules/Makefile b/tools/testing/selftests/livepatch/test_modules/Makefile new file mode 100644 index 0000000000000..6f7c2103d27d9 --- /dev/null +++ b/tools/testing/selftests/livepatch/test_modules/Makefile @@ -0,0 +1,19 @@ +TESTMODS_DIR := $(realpath $(dir $(abspath $(lastword $(MAKEFILE_LIST))))) +KDIR ?= /lib/modules/$(shell uname -r)/build + +obj-m += test_klp_atomic_replace.o \ + test_klp_callbacks_busy.o \ + test_klp_callbacks_demo.o \ + test_klp_callbacks_demo2.o \ + test_klp_callbacks_mod.o \ + test_klp_livepatch.o \ + test_klp_state.o \ + test_klp_state2.o \ + test_klp_state3.o \ + test_klp_shadow_vars.o + +modules: + $(Q)$(MAKE) -C $(KDIR) modules KBUILD_EXTMOD=$(TESTMODS_DIR) + +clean: + $(Q)$(MAKE) -C $(KDIR) clean KBUILD_EXTMOD=$(TESTMODS_DIR) diff --git a/lib/livepatch/test_klp_atomic_replace.c b/tools/testing/selftests/livepatch/test_modules/test_klp_atomic_replace.c similarity index 100% rename from lib/livepatch/test_klp_atomic_replace.c rename to tools/testing/selftests/livepatch/test_modules/test_klp_atomic_replace.c diff --git a/lib/livepatch/test_klp_callbacks_busy.c b/tools/testing/selftests/livepatch/test_modules/test_klp_callbacks_busy.c similarity index 100% rename from lib/livepatch/test_klp_callbacks_busy.c rename to tools/testing/selftests/livepatch/test_modules/test_klp_callbacks_busy.c diff --git a/lib/livepatch/test_klp_callbacks_demo.c b/tools/testing/selftests/livepatch/test_modules/test_klp_callbacks_demo.c 
similarity index 100% rename from lib/livepatch/test_klp_callbacks_demo.c rename to tools/testing/selftests/livepatch/test_modules/test_klp_callbacks_demo.c diff --git a/lib/livepatch/test_klp_callbacks_demo2.c b/tools/testing/selftests/livepatch/test_modules/test_klp_callbacks_demo2.c similarity index 100% rename from lib/livepatch/test_klp_callbacks_demo2.c rename to tools/testing/selftests/livepatch/test_modules/test_klp_callbacks_demo2.c diff --git a/lib/livepatch/test_klp_callbacks_mod.c b/tools/testing/selftests/livepatch/test_modules/test_klp_callbacks_mod.c similarity index 100% rename from lib/livepatch/test_klp_callbacks_mod.c rename to tools/testing/selftests/livepatch/test_modules/test_klp_callbacks_mod.c diff --git a/lib/livepatch/test_klp_livepatch.c b/tools/testing/selftests/livepatch/test_modules/test_klp_livepatch.c similarity index 100% rename from lib/livepatch/test_klp_livepatch.c rename to tools/testing/selftests/livepatch/test_modules/test_klp_livepatch.c diff --git a/lib/livepatch/test_klp_shadow_vars.c b/tools/testing/selftests/livepatch/test_modules/test_klp_shadow_vars.c similarity index 100% rename from lib/livepatch/test_klp_shadow_vars.c rename to tools/testing/selftests/livepatch/test_modules/test_klp_shadow_vars.c diff --git a/lib/livepatch/test_klp_state.c b/tools/testing/selftests/livepatch/test_modules/test_klp_state.c similarity index 100% rename from lib/livepatch/test_klp_state.c rename to tools/testing/selftests/livepatch/test_modules/test_klp_state.c diff --git a/lib/livepatch/test_klp_state2.c b/tools/testing/selftests/livepatch/test_modules/test_klp_state2.c similarity index 100% rename from lib/livepatch/test_klp_state2.c rename to tools/testing/selftests/livepatch/test_modules/test_klp_state2.c diff --git a/lib/livepatch/test_klp_state3.c b/tools/testing/selftests/livepatch/test_modules/test_klp_state3.c similarity index 100% rename from lib/livepatch/test_klp_state3.c rename to tools/testing/selftests/livepatch/test_modules/test_klp_state3.c -- GitLab From 6a71770442b5b7cf8f880ca1c0a72456c918c757 Mon Sep 17 00:00:00 2001 From: Marcos Paulo de Souza Date: Fri, 12 Jan 2024 14:43:52 -0300 Subject: [PATCH 0060/1519] selftests: livepatch: Test livepatching a heavily called syscall The test proves that a syscall can be livepatched. It is interesting because syscalls are called in a tricky way. Also, the process gets livepatched either when sleeping in userspace or when entering or leaving kernel space. The livepatch is a bit tricky: 1. The syscall function name is architecture specific. Also ARCH_HAS_SYSCALL_WRAPPER must be taken into account. 2. The syscall must stay working the same way for other processes on the system. This is solved by decrementing a counter only for the PIDs of the test processes. It means that the test processes have to call the livepatched syscall at least once. The test creates one userspace process per online CPU. The processes call getpid in a busy loop. The intention is to hit random code locations when the livepatch gets enabled. Nothing is guaranteed. The magic is in the randomness.
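To make point 2 concrete, the heart of the patched syscall can be sketched as follows (the names klp_pids, npids and npids_pending follow the test module added below; the real implementation additionally serializes the update with a mutex and exposes the remaining count via sysfs):

    static asmlinkage long lp_sys_getpid(void)
    {
            int i;

            /* Consume each registered test PID exactly once. */
            for (i = 0; i < npids; i++) {
                    if (current->pid == klp_pids[i]) {
                            klp_pids[i] = 0;
                            npids_pending--;
                            break;
                    }
            }

            /* For everyone else, behave exactly like the original syscall. */
            return task_tgid_vnr(current);
    }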
Reviewed-by: Joe Lawrence Reviewed-by: Petr Mladek Signed-off-by: Marcos Paulo de Souza Signed-off-by: Shuah Khan --- tools/testing/selftests/livepatch/Makefile | 4 +- .../selftests/livepatch/test-syscall.sh | 53 ++++++++ .../livepatch/test_klp-call_getpid.c | 44 +++++++ .../selftests/livepatch/test_modules/Makefile | 3 +- .../livepatch/test_modules/test_klp_syscall.c | 116 ++++++++++++++++++ 5 files changed, 218 insertions(+), 2 deletions(-) create mode 100755 tools/testing/selftests/livepatch/test-syscall.sh create mode 100644 tools/testing/selftests/livepatch/test_klp-call_getpid.c create mode 100644 tools/testing/selftests/livepatch/test_modules/test_klp_syscall.c diff --git a/tools/testing/selftests/livepatch/Makefile b/tools/testing/selftests/livepatch/Makefile index 119e2bbebe5d0..35418a4790be4 100644 --- a/tools/testing/selftests/livepatch/Makefile +++ b/tools/testing/selftests/livepatch/Makefile @@ -1,5 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 +TEST_GEN_FILES := test_klp-call_getpid TEST_GEN_MODS_DIR := test_modules TEST_PROGS_EXTENDED := functions.sh TEST_PROGS := \ @@ -8,7 +9,8 @@ TEST_PROGS := \ test-shadow-vars.sh \ test-state.sh \ test-ftrace.sh \ - test-sysfs.sh + test-sysfs.sh \ + test-syscall.sh TEST_FILES := settings diff --git a/tools/testing/selftests/livepatch/test-syscall.sh b/tools/testing/selftests/livepatch/test-syscall.sh new file mode 100755 index 0000000000000..b76a881d4013a --- /dev/null +++ b/tools/testing/selftests/livepatch/test-syscall.sh @@ -0,0 +1,53 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# Copyright (C) 2023 SUSE +# Author: Marcos Paulo de Souza + +. $(dirname $0)/functions.sh + +MOD_SYSCALL=test_klp_syscall + +setup_config + +# - Start _NRPROC processes calling getpid and load a livepatch to patch the +# getpid syscall. Check if all the processes transitioned to the livepatched +# state. + +start_test "patch getpid syscall while being heavily hammered" + +for i in $(seq 1 $(getconf _NPROCESSORS_ONLN)); do + ./test_klp-call_getpid & + pids[$i]="$!" 
+done + +pid_list=$(echo ${pids[@]} | tr ' ' ',') +load_lp $MOD_SYSCALL klp_pids=$pid_list + +# wait for all tasks to transition to patched state +loop_until 'grep -q '^0$' /sys/kernel/test_klp_syscall/npids' + +pending_pids=$(cat /sys/kernel/test_klp_syscall/npids) +log "$MOD_SYSCALL: Remaining not livepatched processes: $pending_pids" + +for pid in ${pids[@]}; do + kill $pid || true +done + +disable_lp $MOD_SYSCALL +unload_lp $MOD_SYSCALL + +check_result "% insmod test_modules/$MOD_SYSCALL.ko klp_pids=$pid_list +livepatch: enabling patch '$MOD_SYSCALL' +livepatch: '$MOD_SYSCALL': initializing patching transition +livepatch: '$MOD_SYSCALL': starting patching transition +livepatch: '$MOD_SYSCALL': completing patching transition +livepatch: '$MOD_SYSCALL': patching complete +$MOD_SYSCALL: Remaining not livepatched processes: 0 +% echo 0 > /sys/kernel/livepatch/$MOD_SYSCALL/enabled +livepatch: '$MOD_SYSCALL': initializing unpatching transition +livepatch: '$MOD_SYSCALL': starting unpatching transition +livepatch: '$MOD_SYSCALL': completing unpatching transition +livepatch: '$MOD_SYSCALL': unpatching complete +% rmmod $MOD_SYSCALL" + +exit 0 diff --git a/tools/testing/selftests/livepatch/test_klp-call_getpid.c b/tools/testing/selftests/livepatch/test_klp-call_getpid.c new file mode 100644 index 0000000000000..ce321a2d7308d --- /dev/null +++ b/tools/testing/selftests/livepatch/test_klp-call_getpid.c @@ -0,0 +1,44 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2023 SUSE + * Authors: Libor Pechacek + * Marcos Paulo de Souza + */ + +#include +#include +#include +#include +#include + +static int stop; +static int sig_int; + +void hup_handler(int signum) +{ + stop = 1; +} + +void int_handler(int signum) +{ + stop = 1; + sig_int = 1; +} + +int main(int argc, char *argv[]) +{ + long count = 0; + + signal(SIGHUP, &hup_handler); + signal(SIGINT, &int_handler); + + while (!stop) { + (void)syscall(SYS_getpid); + count++; + } + + if (sig_int) + printf("%ld iterations done\n", count); + + return 0; +} diff --git a/tools/testing/selftests/livepatch/test_modules/Makefile b/tools/testing/selftests/livepatch/test_modules/Makefile index 6f7c2103d27d9..f5e880269bff6 100644 --- a/tools/testing/selftests/livepatch/test_modules/Makefile +++ b/tools/testing/selftests/livepatch/test_modules/Makefile @@ -10,7 +10,8 @@ obj-m += test_klp_atomic_replace.o \ test_klp_state.o \ test_klp_state2.o \ test_klp_state3.o \ - test_klp_shadow_vars.o + test_klp_shadow_vars.o \ + test_klp_syscall.o modules: $(Q)$(MAKE) -C $(KDIR) modules KBUILD_EXTMOD=$(TESTMODS_DIR) diff --git a/tools/testing/selftests/livepatch/test_modules/test_klp_syscall.c b/tools/testing/selftests/livepatch/test_modules/test_klp_syscall.c new file mode 100644 index 0000000000000..dd802783ea849 --- /dev/null +++ b/tools/testing/selftests/livepatch/test_modules/test_klp_syscall.c @@ -0,0 +1,116 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2017-2023 SUSE + * Authors: Libor Pechacek + * Nicolai Stange + * Marcos Paulo de Souza + */ + +#include +#include +#include +#include +#include + +#if defined(__x86_64__) +#define FN_PREFIX __x64_ +#elif defined(__s390x__) +#define FN_PREFIX __s390x_ +#elif defined(__aarch64__) +#define FN_PREFIX __arm64_ +#else +/* powerpc does not select ARCH_HAS_SYSCALL_WRAPPER */ +#define FN_PREFIX +#endif + +/* Protects klp_pids */ +static DEFINE_MUTEX(kpid_mutex); + +static unsigned int npids, npids_pending; +static int klp_pids[NR_CPUS]; +module_param_array(klp_pids, int, &npids_pending, 0); 
+MODULE_PARM_DESC(klp_pids, "Array of pids to be transitioned to livepatched state."); + +static ssize_t npids_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", npids_pending); +} + +static struct kobj_attribute klp_attr = __ATTR_RO(npids); +static struct kobject *klp_kobj; + +static asmlinkage long lp_sys_getpid(void) +{ + int i; + + mutex_lock(&kpid_mutex); + if (npids_pending > 0) { + for (i = 0; i < npids; i++) { + if (current->pid == klp_pids[i]) { + klp_pids[i] = 0; + npids_pending--; + break; + } + } + } + mutex_unlock(&kpid_mutex); + + return task_tgid_vnr(current); +} + +static struct klp_func vmlinux_funcs[] = { + { + .old_name = __stringify(FN_PREFIX) "sys_getpid", + .new_func = lp_sys_getpid, + }, {} +}; + +static struct klp_object objs[] = { + { + /* name being NULL means vmlinux */ + .funcs = vmlinux_funcs, + }, {} +}; + +static struct klp_patch patch = { + .mod = THIS_MODULE, + .objs = objs, +}; + +static int livepatch_init(void) +{ + int ret; + + klp_kobj = kobject_create_and_add("test_klp_syscall", kernel_kobj); + if (!klp_kobj) + return -ENOMEM; + + ret = sysfs_create_file(klp_kobj, &klp_attr.attr); + if (ret) { + kobject_put(klp_kobj); + return ret; + } + + /* + * Save the number pids to transition to livepatched state before the + * number of pending pids is decremented. + */ + npids = npids_pending; + + return klp_enable_patch(&patch); +} + +static void livepatch_exit(void) +{ + kobject_put(klp_kobj); +} + +module_init(livepatch_init); +module_exit(livepatch_exit); +MODULE_LICENSE("GPL"); +MODULE_INFO(livepatch, "Y"); +MODULE_AUTHOR("Libor Pechacek "); +MODULE_AUTHOR("Nicolai Stange "); +MODULE_AUTHOR("Marcos Paulo de Souza "); +MODULE_DESCRIPTION("Livepatch test: syscall transition"); -- GitLab From 1b908debf53ff3cf0e43e0fa51e7319a23518e6c Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 1 Nov 2023 14:26:15 -0700 Subject: [PATCH 0061/1519] x86/resctrl: Fix unused variable warning in cache_alloc_hsw_probe() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In a "W=1" build gcc throws a warning: arch/x86/kernel/cpu/resctrl/core.c: In function ‘cache_alloc_hsw_probe’: arch/x86/kernel/cpu/resctrl/core.c:139:16: warning: variable ‘h’ set but not used Switch from wrmsr_safe() to wrmsrl_safe(), and from rdmsr() to rdmsrl() using a single u64 argument for the MSR value instead of the pair of u32 for the high and low halves. 
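For reference, the two MSR access styles differ only in how the 64-bit value is passed; a minimal sketch against the same MSR as in the diff below (both forms access the same register contents):

    u32 lo, hi;
    u64 val;

    rdmsr(MSR_IA32_L3_CBM_BASE, lo, hi);   /* two u32 halves; hi holds bits 63:32 */
    rdmsrl(MSR_IA32_L3_CBM_BASE, val);     /* one u64; val == ((u64)hi << 32) | lo */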
Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Babu Moger Acked-by: Reinette Chatre Link: https://lore.kernel.org/r/ZULCd/TGJL9Dmncf@agluck-desk3 --- arch/x86/kernel/cpu/resctrl/core.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 19e0681f04356..d29ebe345de6c 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -136,15 +136,15 @@ static inline void cache_alloc_hsw_probe(void) { struct rdt_hw_resource *hw_res = &rdt_resources_all[RDT_RESOURCE_L3]; struct rdt_resource *r = &hw_res->r_resctrl; - u32 l, h, max_cbm = BIT_MASK(20) - 1; + u64 max_cbm = BIT_ULL_MASK(20) - 1, l3_cbm_0; - if (wrmsr_safe(MSR_IA32_L3_CBM_BASE, max_cbm, 0)) + if (wrmsrl_safe(MSR_IA32_L3_CBM_BASE, max_cbm)) return; - rdmsr(MSR_IA32_L3_CBM_BASE, l, h); + rdmsrl(MSR_IA32_L3_CBM_BASE, l3_cbm_0); /* If all the bits were set in MSR, return success */ - if (l != max_cbm) + if (l3_cbm_0 != max_cbm) return; hw_res->num_closid = 4; -- GitLab From b57c1a1e7effab067a65bab54c5d83a67cffd043 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Tue, 4 Jul 2023 18:18:08 +0800 Subject: [PATCH 0062/1519] EDAC/synopsys: Convert to devm_platform_ioremap_resource() Use devm_platform_ioremap_resource() to simplify code. Signed-off-by: Yangtao Li Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Michal Simek Link: https://lore.kernel.org/r/20230704101811.49637-3-frank.li@vivo.com --- drivers/edac/synopsys_edac.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/edac/synopsys_edac.c b/drivers/edac/synopsys_edac.c index 709babce43ba0..5527055b09641 100644 --- a/drivers/edac/synopsys_edac.c +++ b/drivers/edac/synopsys_edac.c @@ -1324,11 +1324,9 @@ static int mc_probe(struct platform_device *pdev) struct synps_edac_priv *priv; struct mem_ctl_info *mci; void __iomem *baseaddr; - struct resource *res; int rc; - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - baseaddr = devm_ioremap_resource(&pdev->dev, res); + baseaddr = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(baseaddr)) return PTR_ERR(baseaddr); -- GitLab From 0976783bb123f30981bc1e7a14d9626a6f63aeac Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Mon, 15 Jan 2024 16:52:27 -0600 Subject: [PATCH 0063/1519] x86/resctrl: Remove hard-coded memory bandwidth limit The QOS Memory Bandwidth Enforcement Limit is reported by CPUID_Fn80000020_EAX_x01 and CPUID_Fn80000020_EAX_x02: Bits Description 31:0 BW_LEN: Size of the QOS Memory Bandwidth Enforcement Limit. Newer processors can support a higher bandwidth limit than the current hard-coded value. Remove the latter and detect the limit using CPUID instead. Also, update the register variables eax and edx to match the AMD CPUID definition. The CPUID details are documented in the Processor Programming Reference (PPR) Vol 1.1 for AMD Family 19h Model 11h B1 - 55901 Rev 0.25 in the Link tag below.
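As a quick sanity check of the new computation: the removed hard-coded value was 0x800, so a processor whose CPUID leaf reports BW_LEN == 11 ends up with exactly the old limit, while newer processors reporting a larger BW_LEN automatically get a higher one:

    /* eax holds BW_LEN from CPUID_Fn80000020_EAX_x01/x02 */
    r->default_ctrl = 1 << eax;   /* eax == 11  ->  1 << 11 == 0x800 */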
Fixes: 4d05bf71f157 ("x86/resctrl: Introduce AMD QOS feature") Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://bugzilla.kernel.org/show_bug.cgi?id=206537 Link: https://lore.kernel.org/r/c26a8ca79d399ed076cf8bf2e9fbc58048808289.1705359148.git.babu.moger@amd.com --- arch/x86/kernel/cpu/resctrl/core.c | 10 ++++------ arch/x86/kernel/cpu/resctrl/internal.h | 1 - 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index d29ebe345de6c..aa9810a64258e 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -231,9 +231,7 @@ static bool __get_mem_config_intel(struct rdt_resource *r) static bool __rdt_get_mem_config_amd(struct rdt_resource *r) { struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); - union cpuid_0x10_3_eax eax; - union cpuid_0x10_x_edx edx; - u32 ebx, ecx, subleaf; + u32 eax, ebx, ecx, edx, subleaf; /* * Query CPUID_Fn80000020_EDX_x01 for MBA and @@ -241,9 +239,9 @@ static bool __rdt_get_mem_config_amd(struct rdt_resource *r) */ subleaf = (r->rid == RDT_RESOURCE_SMBA) ? 2 : 1; - cpuid_count(0x80000020, subleaf, &eax.full, &ebx, &ecx, &edx.full); - hw_res->num_closid = edx.split.cos_max + 1; - r->default_ctrl = MAX_MBA_BW_AMD; + cpuid_count(0x80000020, subleaf, &eax, &ebx, &ecx, &edx); + hw_res->num_closid = edx + 1; + r->default_ctrl = 1 << eax; /* AMD does not use delay */ r->membw.delay_linear = false; diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index a4f1aa15f0a2a..d2979748fae47 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -18,7 +18,6 @@ #define MBM_OVERFLOW_INTERVAL 1000 #define MAX_MBA_BW 100u #define MBA_IS_LINEAR 0x4 -#define MAX_MBA_BW_AMD 0x800 #define MBM_CNTR_WIDTH_OFFSET_AMD 20 #define RMID_VAL_ERROR BIT_ULL(63) -- GitLab From 54e35eb8611cce5550d3d7689679b1a91c864f28 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Mon, 15 Jan 2024 16:52:28 -0600 Subject: [PATCH 0064/1519] x86/resctrl: Read supported bandwidth sources from CPUID If the BMEC (Bandwidth Monitoring Event Configuration) feature is supported, the bandwidth events can be configured. The maximum supported bandwidth bitmask can be read from CPUID: CPUID_Fn80000020_ECX_x03 [Platform QoS Monitoring Bandwidth Event Configuration] Bits Description 31:7 Reserved 6:0 Identifies the bandwidth sources that can be tracked. While at it, move the mask checking to mon_config_write() before iterating over all the domains. Also, print the valid bitmask when the user tries to configure an invalid event configuration value. The CPUID details are documented in the Processor Programming Reference (PPR) Vol 1.1 for AMD Family 19h Model 11h B1 - 55901 Rev 0.25 in the Link tag.
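To illustrate the new check with made-up numbers: if ECX bits 6:0 report 0x7f and the user writes 0x90, masking changes the value, so the write is rejected:

    /* hw_res->mbm_cfg_mask == 0x7f (illustrative value) */
    val = 0x90;
    if ((val & hw_res->mbm_cfg_mask) != val)   /* 0x10 != 0x90 */
            return -EINVAL;                    /* after printing the valid mask */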
Fixes: dc2a3e857981 ("x86/resctrl: Add interface to read mbm_total_bytes_config") Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://bugzilla.kernel.org/show_bug.cgi?id=206537 Link: https://lore.kernel.org/r/669896fa512c7451319fa5ca2fdb6f7e015b5635.1705359148.git.babu.moger@amd.com --- arch/x86/kernel/cpu/resctrl/internal.h | 3 +++ arch/x86/kernel/cpu/resctrl/monitor.c | 6 ++++++ arch/x86/kernel/cpu/resctrl/rdtgroup.c | 14 ++++++++------ 3 files changed, 17 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index d2979748fae47..e3dc35a00a197 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -394,6 +394,8 @@ struct rdt_parse_data { * @msr_update: Function pointer to update QOS MSRs * @mon_scale: cqm counter * mon_scale = occupancy in bytes * @mbm_width: Monitor width, to detect and correct for overflow. + * @mbm_cfg_mask: Bandwidth sources that can be tracked when Bandwidth + * Monitoring Event Configuration (BMEC) is supported. * @cdp_enabled: CDP state of this resource * * Members of this structure are either private to the architecture @@ -408,6 +410,7 @@ struct rdt_hw_resource { struct rdt_resource *r); unsigned int mon_scale; unsigned int mbm_width; + unsigned int mbm_cfg_mask; bool cdp_enabled; }; diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index f136ac046851c..acca577e2b066 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -813,6 +813,12 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r) return ret; if (rdt_cpu_has(X86_FEATURE_BMEC)) { + u32 eax, ebx, ecx, edx; + + /* Detect list of bandwidth sources that can be tracked */ + cpuid_count(0x80000020, 3, &eax, &ebx, &ecx, &edx); + hw_res->mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS; + if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) { mbm_total_event.configurable = true; mbm_config_rftype_init("mbm_total_bytes_config"); diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 69a1de92384ab..2b69e560b05f1 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -1620,12 +1620,6 @@ static int mbm_config_write_domain(struct rdt_resource *r, struct mon_config_info mon_info = {0}; int ret = 0; - /* mon_config cannot be more than the supported set of events */ - if (val > MAX_EVT_CONFIG_BITS) { - rdt_last_cmd_puts("Invalid event configuration\n"); - return -EINVAL; - } - /* * Read the current config value first. If both are the same then * no need to write it again. 
@@ -1663,6 +1657,7 @@ static int mbm_config_write_domain(struct rdt_resource *r, static int mon_config_write(struct rdt_resource *r, char *tok, u32 evtid) { + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); char *dom_str = NULL, *id_str; unsigned long dom_id, val; struct rdt_domain *d; @@ -1686,6 +1681,13 @@ static int mon_config_write(struct rdt_resource *r, char *tok, u32 evtid) return -EINVAL; } + /* Value from user cannot be more than the supported set of events */ + if ((val & hw_res->mbm_cfg_mask) != val) { + rdt_last_cmd_printf("Invalid event configuration: max valid mask is 0x%02x\n", + hw_res->mbm_cfg_mask); + return -EINVAL; + } + list_for_each_entry(d, &r->domains, list) { if (d->id == dom_id) { ret = mbm_config_write_domain(r, d, evtid, val); -- GitLab From 9473c4450e9c83d890d435577a3776d925fa748c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 28 Dec 2023 15:15:10 -0500 Subject: [PATCH 0065/1519] exportfs: fix the fallback implementation of the get_name export operation The fallback implementation for the get_name export operation uses readdir() to try to match the inode number to a filename. That filename is then used together with lookup_one() to produce a dentry. A problem arises when we match the '.' or '..' entries, since that causes lookup_one() to fail. This has sometimes been seen to occur for filesystems that violate POSIX requirements around uniqueness of inode numbers, something that is common for snapshot directories. This patch just ensures that we skip '.' and '..' rather than allowing a match. Signed-off-by: Trond Myklebust Reviewed-by: Jeff Layton Acked-by: Amir Goldstein Link: https://lore.kernel.org/linux-nfs/CAOQ4uxiOZobN76OKB-VBNXWeFKVwLW_eK5QtthGyYzWU9mjb7Q@mail.gmail.com/ Acked-by: Christian Brauner Signed-off-by: Chuck Lever --- fs/exportfs/expfs.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 3ae0154c5680b..dcf7d86c2ce4f 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -244,6 +244,16 @@ struct getdents_callback { int sequence; /* sequence counter */ }; +/* Copied from lookup_one_common() */ +static inline bool is_dot_dotdot(const char *name, size_t len) +{ + if (unlikely(name[0] == '.')) { + if (len < 2 || (len == 2 && name[1] == '.')) + return true; + } + return false; +} + /* * A rather strange filldir function to capture * the name matching the specified inode number. @@ -255,7 +265,7 @@ static bool filldir_one(struct dir_context *ctx, const char *name, int len, container_of(ctx, struct getdents_callback, ctx); buf->sequence++; - if (buf->ino == ino && len <= NAME_MAX) { + if (buf->ino == ino && len <= NAME_MAX && !is_dot_dotdot(name, len)) { memcpy(buf->name, name, len); buf->name[len] = '\0'; buf->found = 1; -- GitLab From 42c3732fa8073717dd7d924472f1c0bc5b452fdc Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sat, 30 Dec 2023 19:46:00 -0500 Subject: [PATCH 0066/1519] fs: Create a generic is_dot_dotdot() utility De-duplicate the same functionality in several places by hoisting the is_dot_dotdot() utility function into linux/fs.h. 
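The semantics of the hoisted helper are easiest to see from a few sample inputs (illustrative calls; the implementation itself is in the hunk below):

    is_dot_dotdot(".", 1);     /* true  */
    is_dot_dotdot("..", 2);    /* true  */
    is_dot_dotdot(".a", 2);    /* false */
    is_dot_dotdot("a", 1);     /* false */
    is_dot_dotdot("...", 3);   /* false */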
Suggested-by: Amir Goldstein Reviewed-by: Jeff Layton Reviewed-by: Amir Goldstein Acked-by: Christian Brauner Signed-off-by: Chuck Lever --- fs/crypto/fname.c | 8 +------- fs/ecryptfs/crypto.c | 10 ---------- fs/exportfs/expfs.c | 10 ---------- fs/f2fs/f2fs.h | 11 ----------- fs/namei.c | 6 ++---- include/linux/fs.h | 11 +++++++++++ 6 files changed, 14 insertions(+), 42 deletions(-) diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 7b3fc189593a5..0ad52fbe51c94 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -74,13 +74,7 @@ struct fscrypt_nokey_name { static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) { - if (str->len == 1 && str->name[0] == '.') - return true; - - if (str->len == 2 && str->name[0] == '.' && str->name[1] == '.') - return true; - - return false; + return is_dot_dotdot(str->name, str->len); } /** diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 03bd55069d860..2fe0f3af1a08e 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -1949,16 +1949,6 @@ int ecryptfs_encrypt_and_encode_filename( return rc; } -static bool is_dot_dotdot(const char *name, size_t name_size) -{ - if (name_size == 1 && name[0] == '.') - return true; - else if (name_size == 2 && name[0] == '.' && name[1] == '.') - return true; - - return false; -} - /** * ecryptfs_decode_and_decrypt_filename - converts the encoded cipher text name to decoded plaintext * @plaintext_name: The plaintext name diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index dcf7d86c2ce4f..07ea3d62b2982 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -244,16 +244,6 @@ struct getdents_callback { int sequence; /* sequence counter */ }; -/* Copied from lookup_one_common() */ -static inline bool is_dot_dotdot(const char *name, size_t len) -{ - if (unlikely(name[0] == '.')) { - if (len < 2 || (len == 2 && name[1] == '.')) - return true; - } - return false; -} - /* * A rather strange filldir function to capture * the name matching the specified inode number. diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 9043cedfa12ba..322a3b8a35339 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3368,17 +3368,6 @@ static inline bool f2fs_cp_error(struct f2fs_sb_info *sbi) return is_set_ckpt_flags(sbi, CP_ERROR_FLAG); } -static inline bool is_dot_dotdot(const u8 *name, size_t len) -{ - if (len == 1 && name[0] == '.') - return true; - - if (len == 2 && name[0] == '.' && name[1] == '.') - return true; - - return false; -} - static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi, size_t size, gfp_t flags) { diff --git a/fs/namei.c b/fs/namei.c index 71c13b2990b44..2386a70667fa5 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2667,10 +2667,8 @@ static int lookup_one_common(struct mnt_idmap *idmap, if (!len) return -EACCES; - if (unlikely(name[0] == '.')) { - if (len < 2 || (len == 2 && name[1] == '.')) - return -EACCES; - } + if (is_dot_dotdot(name, len)) + return -EACCES; while (len--) { unsigned int c = *(const unsigned char *)name++; diff --git a/include/linux/fs.h b/include/linux/fs.h index 98b7a7a8c42e3..baa64344a3082 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2846,6 +2846,17 @@ extern bool path_is_under(const struct path *, const struct path *); extern char *file_path(struct file *, char *, int); +/** + * is_dot_dotdot - returns true only if @name is "." or ".." 
+ * @name: file name to check + * @len: length of file name, in bytes + */ +static inline bool is_dot_dotdot(const char *name, size_t len) +{ + return len && unlikely(name[0] == '.') && + (len == 1 || (len == 2 && name[1] == '.')); +} + #include /* needed for stackable file system support */ -- GitLab From bd46543d7f9a8b2c03420499b1885287e96aaf28 Mon Sep 17 00:00:00 2001 From: Wen Yang Date: Mon, 15 Jan 2024 23:27:00 +0800 Subject: [PATCH 0067/1519] eventfd: move 'eventfd-count' printing out of spinlock When printing eventfd->count, interrupts will be disabled and a spinlock will be obtained, competing with eventfd_write(). By moving the "eventfd-count" print out of the spinlock and merging multiple seq_printf() into one, it could improve a bit, just like timerfd_show(). Signed-off-by: Wen Yang Link: https://lore.kernel.org/r/tencent_B0B3D2BD9861FD009E03AB18A81783322709@qq.com Cc: Alexander Viro Cc: Jens Axboe Cc: Christian Brauner Cc: Christoph Hellwig Cc: Dylan Yudaken Cc: David Woodhouse Cc: Matthew Wilcox Cc: Eric Biggers Cc: Cc: Signed-off-by: Christian Brauner --- fs/eventfd.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/fs/eventfd.c b/fs/eventfd.c index 0252b71099fbc..fc4d810907639 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -283,13 +283,18 @@ static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c static void eventfd_show_fdinfo(struct seq_file *m, struct file *f) { struct eventfd_ctx *ctx = f->private_data; + __u64 cnt; spin_lock_irq(&ctx->wqh.lock); - seq_printf(m, "eventfd-count: %16llx\n", - (unsigned long long)ctx->count); + cnt = ctx->count; spin_unlock_irq(&ctx->wqh.lock); - seq_printf(m, "eventfd-id: %d\n", ctx->id); - seq_printf(m, "eventfd-semaphore: %d\n", + + seq_printf(m, + "eventfd-count: %16llx\n" + "eventfd-id: %d\n" + "eventfd-semaphore: %d\n", + cnt, + ctx->id, !!(ctx->flags & EFD_SEMAPHORE)); } #endif -- GitLab From 49527ca264341f9b6278089e274012a2db367ebf Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Thu, 18 Jan 2024 17:36:00 +0100 Subject: [PATCH 0068/1519] Documentation/kernel-parameters: Add spec_rstack_overflow to mitigations=off mitigations=off disables the SRSO mitigation too. Add it to the list. Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20240118163600.17857-1-bp@alien8.de --- Documentation/admin-guide/kernel-parameters.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 31b3a25680d08..64960f5d0d35e 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3399,6 +3399,7 @@ nospectre_v1 [X86,PPC] nospectre_v2 [X86,PPC,S390,ARM64] retbleed=off [X86] + spec_rstack_overflow=off [X86] spec_store_bypass_disable=off [X86,PPC] spectre_v2_user=off [X86] srbds=off [X86,INTEL] -- GitLab From 16c31dd7fdf6c7ec88370928d0baf9f45d13c5a6 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 10 Jan 2024 11:58:17 +0900 Subject: [PATCH 0069/1519] Compiler Attributes: counted_by: bump min gcc version GCC is expected to implement this feature in version 15, so bump the version. 
Signed-off-by: Sergey Senozhatsky Reviewed-by: Nathan Chancellor Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/e1c27b64ae7abe2ebe647be11b71cf1bca84f677.1704855495.git.senozhatsky@chromium.org Signed-off-by: Miguel Ojeda --- include/linux/compiler_attributes.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h index 28566624f008f..215882a1341a3 100644 --- a/include/linux/compiler_attributes.h +++ b/include/linux/compiler_attributes.h @@ -95,7 +95,7 @@ #endif /* - * Optional: only supported since gcc >= 14 + * Optional: only supported since gcc >= 15 * Optional: only supported since clang >= 18 * * gcc: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108896 -- GitLab From 2993eb7a8d34aee6165e1f6676e81cdf1d22aa62 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 10 Jan 2024 11:58:18 +0900 Subject: [PATCH 0070/1519] Compiler Attributes: counted_by: fixup clang URL The URL in question 404 now, fix it up (and switch to github). Signed-off-by: Sergey Senozhatsky Reviewed-by: Kees Cook Reviewed-by: Nathan Chancellor Link: https://lore.kernel.org/r/b7babeb9c5b14af9189f0d6225673e6e9a8f4ad3.1704855496.git.senozhatsky@chromium.org Signed-off-by: Miguel Ojeda --- include/linux/compiler_attributes.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h index 215882a1341a3..289810685fc55 100644 --- a/include/linux/compiler_attributes.h +++ b/include/linux/compiler_attributes.h @@ -99,7 +99,7 @@ * Optional: only supported since clang >= 18 * * gcc: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108896 - * clang: https://reviews.llvm.org/D148381 + * clang: https://github.com/llvm/llvm-project/pull/76348 */ #if __has_attribute(__counted_by__) # define __counted_by(member) __attribute__((__counted_by__(member))) -- GitLab From 8d4c171f451d384f3a287eb14bd60825d0b2381b Mon Sep 17 00:00:00 2001 From: Xinyu Li Date: Sun, 21 Jan 2024 14:13:36 +0800 Subject: [PATCH 0071/1519] docs: cgroup-v1: add missing code-block tags Hugetlb.rst lacks two code-block tags, causing a formatting issue. Signed-off-by: Xinyu Li Signed-off-by: Tejun Heo --- .../admin-guide/cgroup-v1/hugetlb.rst | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v1/hugetlb.rst b/Documentation/admin-guide/cgroup-v1/hugetlb.rst index 0fa724d82abb6..493a8e386700a 100644 --- a/Documentation/admin-guide/cgroup-v1/hugetlb.rst +++ b/Documentation/admin-guide/cgroup-v1/hugetlb.rst @@ -65,10 +65,12 @@ files include:: 1. Page fault accounting -hugetlb..limit_in_bytes -hugetlb..max_usage_in_bytes -hugetlb..usage_in_bytes -hugetlb..failcnt +:: + + hugetlb..limit_in_bytes + hugetlb..max_usage_in_bytes + hugetlb..usage_in_bytes + hugetlb..failcnt The HugeTLB controller allows users to limit the HugeTLB usage (page fault) per control group and enforces the limit during page fault. Since HugeTLB @@ -82,10 +84,12 @@ getting SIGBUS. 2. 
Reservation accounting -hugetlb..rsvd.limit_in_bytes -hugetlb..rsvd.max_usage_in_bytes -hugetlb..rsvd.usage_in_bytes -hugetlb..rsvd.failcnt +:: + + hugetlb..rsvd.limit_in_bytes + hugetlb..rsvd.max_usage_in_bytes + hugetlb..rsvd.usage_in_bytes + hugetlb..rsvd.failcnt The HugeTLB controller allows to limit the HugeTLB reservations per control group and enforces the controller limit at reservation time and at the fault of -- GitLab From 7ffa8f3d30236e0ab897c30bdb01224ff1fe1c89 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 15 Jan 2024 07:20:25 +0000 Subject: [PATCH 0072/1519] fs: Remove NTFS classic The replacement, NTFS3, was merged over two years ago. It is now time to remove the original from the tree as it is the last user of several APIs, and it is not worth changing. Signed-off-by: Matthew Wilcox (Oracle) Link: https://lore.kernel.org/r/20240115072025.2071931-1-willy@infradead.org Acked-by: Namjae Jeon Acked-by: Dave Chinner Cc: Anton Altaparmakov Cc: Namjae Jeon Signed-off-by: Christian Brauner --- CREDITS | 5 + Documentation/filesystems/ntfs.rst | 466 ---- MAINTAINERS | 10 - fs/Kconfig | 1 - fs/Makefile | 1 - fs/ntfs/Kconfig | 81 - fs/ntfs/Makefile | 15 - fs/ntfs/aops.c | 1744 --------------- fs/ntfs/aops.h | 88 - fs/ntfs/attrib.c | 2624 ----------------------- fs/ntfs/attrib.h | 102 - fs/ntfs/bitmap.c | 179 -- fs/ntfs/bitmap.h | 104 - fs/ntfs/collate.c | 110 - fs/ntfs/collate.h | 36 - fs/ntfs/compress.c | 950 --------- fs/ntfs/debug.c | 159 -- fs/ntfs/debug.h | 57 - fs/ntfs/dir.c | 1540 ------------- fs/ntfs/dir.h | 34 - fs/ntfs/endian.h | 79 - fs/ntfs/file.c | 1997 ----------------- fs/ntfs/index.c | 440 ---- fs/ntfs/index.h | 134 -- fs/ntfs/inode.c | 3102 --------------------------- fs/ntfs/inode.h | 310 --- fs/ntfs/layout.h | 2421 --------------------- fs/ntfs/lcnalloc.c | 1000 --------- fs/ntfs/lcnalloc.h | 131 -- fs/ntfs/logfile.c | 849 -------- fs/ntfs/logfile.h | 295 --- fs/ntfs/malloc.h | 77 - fs/ntfs/mft.c | 2907 ------------------------- fs/ntfs/mft.h | 110 - fs/ntfs/mst.c | 189 -- fs/ntfs/namei.c | 392 ---- fs/ntfs/ntfs.h | 150 -- fs/ntfs/quota.c | 103 - fs/ntfs/quota.h | 21 - fs/ntfs/runlist.c | 1893 ---------------- fs/ntfs/runlist.h | 88 - fs/ntfs/super.c | 3202 ---------------------------- fs/ntfs/sysctl.c | 58 - fs/ntfs/sysctl.h | 27 - fs/ntfs/time.h | 89 - fs/ntfs/types.h | 55 - fs/ntfs/unistr.c | 384 ---- fs/ntfs/upcase.c | 73 - fs/ntfs/usnjrnl.c | 70 - fs/ntfs/usnjrnl.h | 191 -- fs/ntfs/volume.h | 164 -- 51 files changed, 5 insertions(+), 29302 deletions(-) delete mode 100644 Documentation/filesystems/ntfs.rst delete mode 100644 fs/ntfs/Kconfig delete mode 100644 fs/ntfs/Makefile delete mode 100644 fs/ntfs/aops.c delete mode 100644 fs/ntfs/aops.h delete mode 100644 fs/ntfs/attrib.c delete mode 100644 fs/ntfs/attrib.h delete mode 100644 fs/ntfs/bitmap.c delete mode 100644 fs/ntfs/bitmap.h delete mode 100644 fs/ntfs/collate.c delete mode 100644 fs/ntfs/collate.h delete mode 100644 fs/ntfs/compress.c delete mode 100644 fs/ntfs/debug.c delete mode 100644 fs/ntfs/debug.h delete mode 100644 fs/ntfs/dir.c delete mode 100644 fs/ntfs/dir.h delete mode 100644 fs/ntfs/endian.h delete mode 100644 fs/ntfs/file.c delete mode 100644 fs/ntfs/index.c delete mode 100644 fs/ntfs/index.h delete mode 100644 fs/ntfs/inode.c delete mode 100644 fs/ntfs/inode.h delete mode 100644 fs/ntfs/layout.h delete mode 100644 fs/ntfs/lcnalloc.c delete mode 100644 fs/ntfs/lcnalloc.h delete mode 100644 fs/ntfs/logfile.c delete mode 100644 fs/ntfs/logfile.h delete mode 100644 
From 7ffa8f3d30236e0ab897c30bdb01224ff1fe1c89 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 15 Jan 2024 07:20:25 +0000 Subject: [PATCH 0072/1519] fs: Remove NTFS classic The replacement, NTFS3, was merged over two years ago. It is now time to remove the original from the tree as it is the last user of several APIs, and it is not worth changing. Signed-off-by: Matthew Wilcox (Oracle) Link: https://lore.kernel.org/r/20240115072025.2071931-1-willy@infradead.org Acked-by: Namjae Jeon Acked-by: Dave Chinner Cc: Anton Altaparmakov Cc: Namjae Jeon Signed-off-by: Christian Brauner --- CREDITS | 5 + Documentation/filesystems/ntfs.rst | 466 ---- MAINTAINERS | 10 - fs/Kconfig | 1 - fs/Makefile | 1 - fs/ntfs/Kconfig | 81 - fs/ntfs/Makefile | 15 - fs/ntfs/aops.c | 1744 --------------- fs/ntfs/aops.h | 88 - fs/ntfs/attrib.c | 2624 ----------------------- fs/ntfs/attrib.h | 102 - fs/ntfs/bitmap.c | 179 -- fs/ntfs/bitmap.h | 104 - fs/ntfs/collate.c | 110 - fs/ntfs/collate.h | 36 - fs/ntfs/compress.c | 950 --------- fs/ntfs/debug.c | 159 -- fs/ntfs/debug.h | 57 - fs/ntfs/dir.c | 1540 ------------- fs/ntfs/dir.h | 34 - fs/ntfs/endian.h | 79 - fs/ntfs/file.c | 1997 ----------------- fs/ntfs/index.c | 440 ---- fs/ntfs/index.h | 134 -- fs/ntfs/inode.c | 3102 --------------------------- fs/ntfs/inode.h | 310 --- fs/ntfs/layout.h | 2421 --------------------- fs/ntfs/lcnalloc.c | 1000 --------- fs/ntfs/lcnalloc.h | 131 -- fs/ntfs/logfile.c | 849 -------- fs/ntfs/logfile.h | 295 --- fs/ntfs/malloc.h | 77 - fs/ntfs/mft.c | 2907 ------------------------- fs/ntfs/mft.h | 110 - fs/ntfs/mst.c | 189 -- fs/ntfs/namei.c | 392 ---- fs/ntfs/ntfs.h | 150 -- fs/ntfs/quota.c | 103 - fs/ntfs/quota.h | 21 - fs/ntfs/runlist.c | 1893 ---------------- fs/ntfs/runlist.h | 88 - fs/ntfs/super.c | 3202 ---------------------------- fs/ntfs/sysctl.c | 58 - fs/ntfs/sysctl.h | 27 - fs/ntfs/time.h | 89 - fs/ntfs/types.h | 55 - fs/ntfs/unistr.c | 384 ---- fs/ntfs/upcase.c | 73 - fs/ntfs/usnjrnl.c | 70 - fs/ntfs/usnjrnl.h | 191 -- fs/ntfs/volume.h | 164 -- 51 files changed, 5 insertions(+), 29302 deletions(-) delete mode 100644 Documentation/filesystems/ntfs.rst delete mode 100644 fs/ntfs/Kconfig delete mode 100644 fs/ntfs/Makefile delete mode 100644 fs/ntfs/aops.c delete mode 100644 fs/ntfs/aops.h delete mode 100644 fs/ntfs/attrib.c delete mode 100644 fs/ntfs/attrib.h delete mode 100644 fs/ntfs/bitmap.c delete mode 100644 fs/ntfs/bitmap.h delete mode 100644 fs/ntfs/collate.c delete mode 100644 fs/ntfs/collate.h delete mode 100644 fs/ntfs/compress.c delete mode 100644 fs/ntfs/debug.c delete mode 100644 fs/ntfs/debug.h delete mode 100644 fs/ntfs/dir.c delete mode 100644 fs/ntfs/dir.h delete mode 100644 fs/ntfs/endian.h delete mode 100644 fs/ntfs/file.c delete mode 100644 fs/ntfs/index.c delete mode 100644 fs/ntfs/index.h delete mode 100644 fs/ntfs/inode.c delete mode 100644 fs/ntfs/inode.h delete mode 100644 fs/ntfs/layout.h delete mode 100644 fs/ntfs/lcnalloc.c delete mode 100644 fs/ntfs/lcnalloc.h delete mode 100644 fs/ntfs/logfile.c delete mode 100644 fs/ntfs/logfile.h delete mode 100644 fs/ntfs/malloc.h delete mode 100644 fs/ntfs/mft.c delete mode 100644 fs/ntfs/mft.h delete mode 100644 fs/ntfs/mst.c delete mode 100644 fs/ntfs/namei.c delete mode 100644 fs/ntfs/ntfs.h delete mode 100644 fs/ntfs/quota.c delete mode 100644 fs/ntfs/quota.h delete mode 100644 fs/ntfs/runlist.c delete mode 100644 fs/ntfs/runlist.h delete mode 100644 fs/ntfs/super.c delete mode 100644 fs/ntfs/sysctl.c delete mode 100644 fs/ntfs/sysctl.h delete mode 100644 fs/ntfs/time.h delete mode 100644 fs/ntfs/types.h delete mode 100644 fs/ntfs/unistr.c delete mode 100644 fs/ntfs/upcase.c delete mode 100644 fs/ntfs/usnjrnl.c delete mode 100644 fs/ntfs/usnjrnl.h delete mode 100644 fs/ntfs/volume.h diff --git a/CREDITS b/CREDITS index 5797e8f7e92b0..12e829ccc1abf 100644 --- a/CREDITS +++ b/CREDITS @@ -63,6 +63,11 @@ D: dosfs, LILO, some fd features, ATM, various other hacks here and there S: Buenos Aires S: Argentina +NTFS FILESYSTEM +N: Anton Altaparmakov +E: anton@tuxera.com +D: NTFS filesystem + N: Tim Alpaerts E: tim_alpaerts@toyota-motor-europe.com D: 802.2 class II logical link control layer, diff --git a/Documentation/filesystems/ntfs.rst b/Documentation/filesystems/ntfs.rst deleted file mode 100644 index 5bb093a26485e..0000000000000 --- a/Documentation/filesystems/ntfs.rst +++ /dev/null @@ -1,466 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -================================ -The Linux NTFS filesystem driver -================================ - - -.. Table of contents - - - Overview - - Web site - - Features - - Supported mount options - - Known bugs and (mis-)features - - Using NTFS volume and stripe sets - - The Device-Mapper driver - - The Software RAID / MD driver - - Limitations when using the MD driver - - -Overview -======== - -Linux-NTFS comes with a number of user-space programs known as ntfsprogs. -These include mkntfs, a full-featured ntfs filesystem format utility, -ntfsundelete used for recovering files that were unintentionally deleted -from an NTFS volume and ntfsresize which is used to resize an NTFS partition. -See the web site for more information. - -To mount an NTFS 1.2/3.x (Windows NT4/2000/XP/2003) volume, use the file -system type 'ntfs'. The driver currently supports read-only mode (with no -fault-tolerance, encryption or journalling) and very limited, but safe, write -support. - -For fault tolerance and raid support (i.e. volume and stripe sets), you can -use the kernel's Software RAID / MD driver. See section "Using Software RAID -with NTFS" for details. - - -Web site -======== - -There is plenty of additional information on the linux-ntfs web site -at http://www.linux-ntfs.org/ - -The web site has a lot of additional information, such as a comprehensive -FAQ, documentation on the NTFS on-disk format, information on the Linux-NTFS -userspace utilities, etc. - - -Features -======== - -- This is a complete rewrite of the NTFS driver that used to be in the 2.4 and - earlier kernels. This new driver implements NTFS read support and is - functionally equivalent to the old ntfs driver and it also implements limited - write support. The biggest limitation at present is that files/directories - cannot be created or deleted. See below for the list of write features that - are so far supported. Another limitation is that writing to compressed files - is not implemented at all. Also, neither read nor write access to encrypted - files is so far implemented. -- The new driver has full support for sparse files on NTFS 3.x volumes which - the old driver isn't happy with.
-- The new driver supports execution of binaries due to mmap() now being - supported. -- The new driver supports loopback mounting of files on NTFS which is used by - some Linux distributions to enable the user to run Linux from an NTFS - partition by creating a large file while in Windows and then loopback - mounting the file while in Linux and creating a Linux filesystem on it that - is used to install Linux on it. -- A comparison of the two drivers using:: - - time find . -type f -exec md5sum "{}" \; - - run three times in sequence with each driver (after a reboot) on a 1.4GiB - NTFS partition, showed the new driver to be 20% faster in total time elapsed - (from 9:43 minutes on average down to 7:53). The time spent in user space - was unchanged but the time spent in the kernel was decreased by a factor of - 2.5 (from 85 CPU seconds down to 33). -- The driver does not support short file names in general. For backwards - compatibility, we implement access to files using their short file names if - they exist. The driver will not create short file names however, and a - rename will discard any existing short file name. -- The new driver supports exporting of mounted NTFS volumes via NFS. -- The new driver supports async io (aio). -- The new driver supports fsync(2), fdatasync(2), and msync(2). -- The new driver supports readv(2) and writev(2). -- The new driver supports access time updates (including mtime and ctime). -- The new driver supports truncate(2) and open(2) with O_TRUNC. But at present - only very limited support for highly fragmented files, i.e. ones which have - their data attribute split across multiple extents, is included. Another - limitation is that at present truncate(2) will never create sparse files, - since to mark a file sparse we need to modify the directory entry for the - file and we do not implement directory modifications yet. -- The new driver supports write(2) which can both overwrite existing data and - extend the file size so that you can write beyond the existing data. Also, - writing into sparse regions is supported and the holes are filled in with - clusters. But at present only limited support for highly fragmented files, - i.e. ones which have their data attribute split across multiple extents, is - included. Another limitation is that write(2) will never create sparse - files, since to mark a file sparse we need to modify the directory entry for - the file and we do not implement directory modifications yet. - -Supported mount options -======================= - -In addition to the generic mount options described by the manual page for the -mount command (man 8 mount, also see man 5 fstab), the NTFS driver supports the -following mount options: - -======================= ======================================================= -iocharset=name Deprecated option. Still supported but please use - nls=name in the future. See description for nls=name. - -nls=name Character set to use when returning file names. - Unlike VFAT, NTFS suppresses names that contain - unconvertible characters. Note that most character - sets contain insufficient characters to represent all - possible Unicode characters that can exist on NTFS. - To be sure you are not missing any files, you are - advised to use nls=utf8 which is capable of - representing all Unicode characters. - -utf8= Option no longer supported. Currently mapped to - nls=utf8 but please use nls=utf8 in the future and - make sure utf8 is compiled either as module or into - the kernel. See description for nls=name. 
- -uid= -gid= -umask= Provide default owner, group, and access mode mask. - These options work as documented in mount(8). By - default, the files/directories are owned by root and - he/she has read and write permissions, as well as - browse permission for directories. No one else has any - access permissions. I.e. the mode on all files is by - default rw------- and for directories rwx------, a - consequence of the default fmask=0177 and dmask=0077. - Using a umask of zero will grant all permissions to - everyone, i.e. all files and directories will have mode - rwxrwxrwx. - -fmask= -dmask= Instead of specifying umask which applies both to - files and directories, fmask applies only to files and - dmask only to directories. - -sloppy= If sloppy is specified, ignore unknown mount options. - Otherwise the default behaviour is to abort mount if - any unknown options are found. - -show_sys_files= If show_sys_files is specified, show the system files - in directory listings. Otherwise the default behaviour - is to hide the system files. - Note that even when show_sys_files is specified, "$MFT" - will not be visible due to bugs/mis-features in glibc. - Further, note that irrespective of show_sys_files, all - files are accessible by name, i.e. you can always do - "ls -l \$UpCase" for example to specifically show the - system file containing the Unicode upcase table. - -case_sensitive= If case_sensitive is specified, treat all file names as - case sensitive and create file names in the POSIX - namespace. Otherwise the default behaviour is to treat - file names as case insensitive and to create file names - in the WIN32/LONG name space. Note, the Linux NTFS - driver will never create short file names and will - remove them on rename/delete of the corresponding long - file name. - Note that files remain accessible via their short file - name, if it exists. If case_sensitive, you will need - to provide the correct case of the short file name. - -disable_sparse= If disable_sparse is specified, creation of sparse - regions, i.e. holes, inside files is disabled for the - volume (for the duration of this mount only). By - default, creation of sparse regions is enabled, which - is consistent with the behaviour of traditional Unix - filesystems. - -errors=opt What to do when critical filesystem errors are found. - Following values can be used for "opt": - - ======== ========================================= - continue DEFAULT, try to clean-up as much as - possible, e.g. marking a corrupt inode as - bad so it is no longer accessed, and then - continue. - recover At present only supported is recovery of - the boot sector from the backup copy. - If read-only mount, the recovery is done - in memory only and not written to disk. - ======== ========================================= - - Note that the options are additive, i.e. specifying:: - - errors=continue,errors=recover - - means the driver will attempt to recover and if that - fails it will clean-up as much as possible and - continue. - -mft_zone_multiplier= Set the MFT zone multiplier for the volume (this - setting is not persistent across mounts and can be - changed from mount to mount but cannot be changed on - remount). Values of 1 to 4 are allowed, 1 being the - default. The MFT zone multiplier determines how much - space is reserved for the MFT on the volume. If all - other space is used up, then the MFT zone will be - shrunk dynamically, so this has no impact on the - amount of free space. 
However, it can have an impact - on performance by affecting fragmentation of the MFT. - In general use the default. If you have a lot of small - files then use a higher value. The values have the - following meaning: - - ===== ================================= - Value MFT zone size (% of volume size) - ===== ================================= - 1 12.5% - 2 25% - 3 37.5% - 4 50% - ===== ================================= - - Note this option is irrelevant for read-only mounts. -======================= ======================================================= - - -Known bugs and (mis-)features -============================= - -- The link count on each directory inode entry is set to 1, due to Linux not - supporting directory hard links. This may well confuse some user space - applications, since the directory names will have the same inode numbers. - This also speeds up ntfs_read_inode() immensely. And we haven't found any - problems with this approach so far. If you find a problem with this, please - let us know. - - -Please send bug reports/comments/feedback/abuse to the Linux-NTFS development -list at sourceforge: linux-ntfs-dev@lists.sourceforge.net - - -Using NTFS volume and stripe sets -================================= - -For support of volume and stripe sets, you can either use the kernel's -Device-Mapper driver or the kernel's Software RAID / MD driver. The former is -the recommended one to use for linear raid. But the latter is required for -raid level 5. For striping and mirroring, either driver should work fine. - - -The Device-Mapper driver ------------------------- - -You will need to create a table of the components of the volume/stripe set and -how they fit together and load this into the kernel using the dmsetup utility -(see man 8 dmsetup). - -Linear volume sets, i.e. linear raid, has been tested and works fine. Even -though untested, there is no reason why stripe sets, i.e. raid level 0, and -mirrors, i.e. raid level 1 should not work, too. Stripes with parity, i.e. -raid level 5, unfortunately cannot work yet because the current version of the -Device-Mapper driver does not support raid level 5. You may be able to use the -Software RAID / MD driver for raid level 5, see the next section for details. - -To create the table describing your volume you will need to know each of its -components and their sizes in sectors, i.e. multiples of 512-byte blocks. - -For NT4 fault tolerant volumes you can obtain the sizes using fdisk. So for -example if one of your partitions is /dev/hda2 you would do:: - - $ fdisk -ul /dev/hda - - Disk /dev/hda: 81.9 GB, 81964302336 bytes - 255 heads, 63 sectors/track, 9964 cylinders, total 160086528 sectors - Units = sectors of 1 * 512 = 512 bytes - - Device Boot Start End Blocks Id System - /dev/hda1 * 63 4209029 2104483+ 83 Linux - /dev/hda2 4209030 37768814 16779892+ 86 NTFS - /dev/hda3 37768815 46170809 4200997+ 83 Linux - -And you would know that /dev/hda2 has a size of 37768814 - 4209030 + 1 = -33559785 sectors. - -For Win2k and later dynamic disks, you can for example use the ldminfo utility -which is part of the Linux LDM tools (the latest version at the time of -writing is linux-ldm-0.0.8.tar.bz2). You can download it from: - - http://www.linux-ntfs.org/ - -Simply extract the downloaded archive (tar xvjf linux-ldm-0.0.8.tar.bz2), go -into it (cd linux-ldm-0.0.8) and change to the test directory (cd test). You -will find the precompiled (i386) ldminfo utility there. 
NOTE: You will not be -able to compile this yourself easily so use the binary version! - -Then you would use ldminfo in dump mode to obtain the necessary information:: - - $ ./ldminfo --dump /dev/hda - -This would dump the LDM database found on /dev/hda which describes all of your -dynamic disks and all the volumes on them. At the bottom you will see the -VOLUME DEFINITIONS section which is all you really need. You may need to look -further above to determine which of the disks in the volume definitions is -which device in Linux. Hint: Run ldminfo on each of your dynamic disks and -look at the Disk Id close to the top of the output for each (the PRIVATE HEADER -section). You can then find these Disk Ids in the VBLK DATABASE section in the - components where you will get the LDM Name for the disk that is found in -the VOLUME DEFINITIONS section. - -Note you will also need to enable the LDM driver in the Linux kernel. If your -distribution did not enable it, you will need to recompile the kernel with it -enabled. This will create the LDM partitions on each device at boot time. You -would then use those devices (for /dev/hda they would be /dev/hda1, 2, 3, etc) -in the Device-Mapper table. - -You can also bypass using the LDM driver by using the main device (e.g. -/dev/hda) and then using the offsets of the LDM partitions into this device as -the "Start sector of device" when creating the table. Once again ldminfo would -give you the correct information to do this. - -Assuming you know all your devices and their sizes things are easy. - -For a linear raid the table would look like this (note all values are in -512-byte sectors):: - - # Offset into Size of this Raid type Device Start sector - # volume device of device - 0 1028161 linear /dev/hda1 0 - 1028161 3903762 linear /dev/hdb2 0 - 4931923 2103211 linear /dev/hdc1 0 - -For a striped volume, i.e. raid level 0, you will need to know the chunk size -you used when creating the volume. Windows uses 64kiB as the default, so it -will probably be this unless you changes the defaults when creating the array. - -For a raid level 0 the table would look like this (note all values are in -512-byte sectors):: - - # Offset Size Raid Number Chunk 1st Start 2nd Start - # into of the type of size Device in Device in - # volume volume stripes device device - 0 2056320 striped 2 128 /dev/hda1 0 /dev/hdb1 0 - -If there are more than two devices, just add each of them to the end of the -line. - -Finally, for a mirrored volume, i.e. raid level 1, the table would look like -this (note all values are in 512-byte sectors):: - - # Ofs Size Raid Log Number Region Should Number Source Start Target Start - # in of the type type of log size sync? of Device in Device in - # vol volume params mirrors Device Device - 0 2056320 mirror core 2 16 nosync 2 /dev/hda1 0 /dev/hdb1 0 - -If you are mirroring to multiple devices you can specify further targets at the -end of the line. - -Note the "Should sync?" parameter "nosync" means that the two mirrors are -already in sync which will be the case on a clean shutdown of Windows. If the -mirrors are not clean, you can specify the "sync" option instead of "nosync" -and the Device-Mapper driver will then copy the entirety of the "Source Device" -to the "Target Device" or if you specified multiple target devices to all of -them. - -Once you have your table, save it in a file somewhere (e.g. 
/etc/ntfsvolume1), -and hand it over to dmsetup to work with, like so:: - - $ dmsetup create myvolume1 /etc/ntfsvolume1 - -You can obviously replace "myvolume1" with whatever name you like. - -If it all worked, you will now have the device /dev/device-mapper/myvolume1 -which you can then just use as an argument to the mount command as usual to -mount the ntfs volume. For example:: - - $ mount -t ntfs -o ro /dev/device-mapper/myvolume1 /mnt/myvol1 - -(You need to create the directory /mnt/myvol1 first and of course you can use -anything you like instead of /mnt/myvol1 as long as it is an existing -directory.) - -It is advisable to do the mount read-only to see if the volume has been setup -correctly to avoid the possibility of causing damage to the data on the ntfs -volume. - - -The Software RAID / MD driver ------------------------------ - -An alternative to using the Device-Mapper driver is to use the kernel's -Software RAID / MD driver. For which you need to set up your /etc/raidtab -appropriately (see man 5 raidtab). - -Linear volume sets, i.e. linear raid, as well as stripe sets, i.e. raid level -0, have been tested and work fine (though see section "Limitations when using -the MD driver with NTFS volumes" especially if you want to use linear raid). -Even though untested, there is no reason why mirrors, i.e. raid level 1, and -stripes with parity, i.e. raid level 5, should not work, too. - -You have to use the "persistent-superblock 0" option for each raid-disk in the -NTFS volume/stripe you are configuring in /etc/raidtab as the persistent -superblock used by the MD driver would damage the NTFS volume. - -Windows by default uses a stripe chunk size of 64k, so you probably want the -"chunk-size 64k" option for each raid-disk, too. - -For example, if you have a stripe set consisting of two partitions /dev/hda5 -and /dev/hdb1 your /etc/raidtab would look like this:: - - raiddev /dev/md0 - raid-level 0 - nr-raid-disks 2 - nr-spare-disks 0 - persistent-superblock 0 - chunk-size 64k - device /dev/hda5 - raid-disk 0 - device /dev/hdb1 - raid-disk 1 - -For linear raid, just change the raid-level above to "raid-level linear", for -mirrors, change it to "raid-level 1", and for stripe sets with parity, change -it to "raid-level 5". - -Note for stripe sets with parity you will also need to tell the MD driver -which parity algorithm to use by specifying the option "parity-algorithm -which", where you need to replace "which" with the name of the algorithm to -use (see man 5 raidtab for available algorithms) and you will have to try the -different available algorithms until you find one that works. Make sure you -are working read-only when playing with this as you may damage your data -otherwise. If you find which algorithm works please let us know (email the -linux-ntfs developers list linux-ntfs-dev@lists.sourceforge.net or drop in on -IRC in channel #ntfs on the irc.freenode.net network) so we can update this -documentation. - -Once the raidtab is setup, run for example raid0run -a to start all devices or -raid0run /dev/md0 to start a particular md device, in this case /dev/md0. - -Then just use the mount command as usual to mount the ntfs volume using for -example:: - - mount -t ntfs -o ro /dev/md0 /mnt/myntfsvolume - -It is advisable to do the mount read-only to see if the md volume has been -setup correctly to avoid the possibility of causing damage to the data on the -ntfs volume. 
- - -Limitations when using the Software RAID / MD driver ------------------------------------------------------ - -Using the md driver will not work properly if any of your NTFS partitions have -an odd number of sectors. This is especially important for linear raid as all -data after the first partition with an odd number of sectors will be offset by -one or more sectors so if you mount such a partition with write support you -will cause massive damage to the data on the volume which will only become -apparent when you try to use the volume again under Windows. - -So when using linear raid, make sure that all your partitions have an even -number of sectors BEFORE attempting to use it. You have been warned! - -Even better is to simply use the Device-Mapper for linear raid and then you do -not have this problem with odd numbers of sectors. diff --git a/MAINTAINERS b/MAINTAINERS index 8d1052fa6a692..a137641f10967 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15566,16 +15566,6 @@ W: https://github.com/davejiang/linux/wiki T: git https://github.com/davejiang/linux.git F: drivers/ntb/hw/intel/ -NTFS FILESYSTEM -M: Anton Altaparmakov -R: Namjae Jeon -L: linux-ntfs-dev@lists.sourceforge.net -S: Supported -W: http://www.tuxera.com/ -T: git git://git.kernel.org/pub/scm/linux/kernel/git/aia21/ntfs.git -F: Documentation/filesystems/ntfs.rst -F: fs/ntfs/ - NTFS3 FILESYSTEM M: Konstantin Komarov L: ntfs3@lists.linux.dev diff --git a/fs/Kconfig b/fs/Kconfig index 89fdbefd1075f..ea2f77446080e 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -162,7 +162,6 @@ menu "DOS/FAT/EXFAT/NT Filesystems" source "fs/fat/Kconfig" source "fs/exfat/Kconfig" -source "fs/ntfs/Kconfig" source "fs/ntfs3/Kconfig" endmenu diff --git a/fs/Makefile b/fs/Makefile index c09016257f05e..c32b8c586800a 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -91,7 +91,6 @@ obj-y += unicode/ obj-$(CONFIG_SYSV_FS) += sysv/ obj-$(CONFIG_SMBFS) += smb/ obj-$(CONFIG_HPFS_FS) += hpfs/ -obj-$(CONFIG_NTFS_FS) += ntfs/ obj-$(CONFIG_NTFS3_FS) += ntfs3/ obj-$(CONFIG_UFS_FS) += ufs/ obj-$(CONFIG_EFS_FS) += efs/ diff --git a/fs/ntfs/Kconfig b/fs/ntfs/Kconfig deleted file mode 100644 index 7b2509741735a..0000000000000 --- a/fs/ntfs/Kconfig +++ /dev/null @@ -1,81 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -config NTFS_FS - tristate "NTFS file system support" - select BUFFER_HEAD - select NLS - help - NTFS is the file system of Microsoft Windows NT, 2000, XP and 2003. - - Saying Y or M here enables read support. There is partial, but - safe, write support available. For write support you must also - say Y to "NTFS write support" below. - - There are also a number of user-space tools available, called - ntfsprogs. These include ntfsundelete and ntfsresize, that work - without NTFS support enabled in the kernel. - - This is a rewrite from scratch of Linux NTFS support and replaced - the old NTFS code starting with Linux 2.5.11. A backport to - the Linux 2.4 kernel series is separately available as a patch - from the project web site. - - For more information see - and . - - To compile this file system support as a module, choose M here: the - module will be called ntfs. - - If you are not using Windows NT, 2000, XP or 2003 in addition to - Linux on your computer it is safe to say N. - -config NTFS_DEBUG - bool "NTFS debugging support" - depends on NTFS_FS - help - If you are experiencing any problems with the NTFS file system, say - Y here. 
This will result in additional consistency checks to be - performed by the driver as well as additional debugging messages to - be written to the system log. Note that debugging messages are - disabled by default. To enable them, supply the option debug_msgs=1 - at the kernel command line when booting the kernel or as an option - to insmod when loading the ntfs module. Once the driver is active, - you can enable debugging messages by doing (as root): - echo 1 > /proc/sys/fs/ntfs-debug - Replacing the "1" with "0" would disable debug messages. - - If you leave debugging messages disabled, this results in little - overhead, but enabling debug messages results in very significant - slowdown of the system. - - When reporting bugs, please try to have available a full dump of - debugging messages while the misbehaviour was occurring. - -config NTFS_RW - bool "NTFS write support" - depends on NTFS_FS - depends on PAGE_SIZE_LESS_THAN_64KB - help - This enables the partial, but safe, write support in the NTFS driver. - - The only supported operation is overwriting existing files, without - changing the file length. No file or directory creation, deletion or - renaming is possible. Note only non-resident files can be written to - so you may find that some very small files (<500 bytes or so) cannot - be written to. - - While we cannot guarantee that it will not damage any data, we have - so far not received a single report where the driver would have - damaged someones data so we assume it is perfectly safe to use. - - Note: While write support is safe in this version (a rewrite from - scratch of the NTFS support), it should be noted that the old NTFS - write support, included in Linux 2.5.10 and before (since 1997), - is not safe. - - This is currently useful with TopologiLinux. TopologiLinux is run - on top of any DOS/Microsoft Windows system without partitioning your - hard disk. Unlike other Linux distributions TopologiLinux does not - need its own partition. For more information see - - - It is perfectly safe to say N here. diff --git a/fs/ntfs/Makefile b/fs/ntfs/Makefile deleted file mode 100644 index 3e736572ed005..0000000000000 --- a/fs/ntfs/Makefile +++ /dev/null @@ -1,15 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -# Rules for making the NTFS driver. - -obj-$(CONFIG_NTFS_FS) += ntfs.o - -ntfs-y := aops.o attrib.o collate.o compress.o debug.o dir.o file.o \ - index.o inode.o mft.o mst.o namei.o runlist.o super.o sysctl.o \ - unistr.o upcase.o - -ntfs-$(CONFIG_NTFS_RW) += bitmap.o lcnalloc.o logfile.o quota.o usnjrnl.o - -ccflags-y := -DNTFS_VERSION=\"2.1.32\" -ccflags-$(CONFIG_NTFS_DEBUG) += -DDEBUG -ccflags-$(CONFIG_NTFS_RW) += -DNTFS_RW - diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c deleted file mode 100644 index 2d01517a2d59a..0000000000000 --- a/fs/ntfs/aops.c +++ /dev/null @@ -1,1744 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * aops.c - NTFS kernel address space operations and page cache handling. - * - * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc. 
- * Copyright (c) 2002 Richard Russon - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "aops.h" -#include "attrib.h" -#include "debug.h" -#include "inode.h" -#include "mft.h" -#include "runlist.h" -#include "types.h" -#include "ntfs.h" - -/** - * ntfs_end_buffer_async_read - async io completion for reading attributes - * @bh: buffer head on which io is completed - * @uptodate: whether @bh is now uptodate or not - * - * Asynchronous I/O completion handler for reading pages belonging to the - * attribute address space of an inode. The inodes can either be files or - * directories or they can be fake inodes describing some attribute. - * - * If NInoMstProtected(), perform the post read mst fixups when all IO on the - * page has been completed and mark the page uptodate or set the error bit on - * the page. To determine the size of the records that need fixing up, we - * cheat a little bit by setting the index_block_size in ntfs_inode to the ntfs - * record size, and index_block_size_bits, to the log(base 2) of the ntfs - * record size. - */ -static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate) -{ - unsigned long flags; - struct buffer_head *first, *tmp; - struct page *page; - struct inode *vi; - ntfs_inode *ni; - int page_uptodate = 1; - - page = bh->b_page; - vi = page->mapping->host; - ni = NTFS_I(vi); - - if (likely(uptodate)) { - loff_t i_size; - s64 file_ofs, init_size; - - set_buffer_uptodate(bh); - - file_ofs = ((s64)page->index << PAGE_SHIFT) + - bh_offset(bh); - read_lock_irqsave(&ni->size_lock, flags); - init_size = ni->initialized_size; - i_size = i_size_read(vi); - read_unlock_irqrestore(&ni->size_lock, flags); - if (unlikely(init_size > i_size)) { - /* Race with shrinking truncate. */ - init_size = i_size; - } - /* Check for the current buffer head overflowing. */ - if (unlikely(file_ofs + bh->b_size > init_size)) { - int ofs; - void *kaddr; - - ofs = 0; - if (file_ofs < init_size) - ofs = init_size - file_ofs; - kaddr = kmap_atomic(page); - memset(kaddr + bh_offset(bh) + ofs, 0, - bh->b_size - ofs); - flush_dcache_page(page); - kunmap_atomic(kaddr); - } - } else { - clear_buffer_uptodate(bh); - SetPageError(page); - ntfs_error(ni->vol->sb, "Buffer I/O error, logical block " - "0x%llx.", (unsigned long long)bh->b_blocknr); - } - first = page_buffers(page); - spin_lock_irqsave(&first->b_uptodate_lock, flags); - clear_buffer_async_read(bh); - unlock_buffer(bh); - tmp = bh; - do { - if (!buffer_uptodate(tmp)) - page_uptodate = 0; - if (buffer_async_read(tmp)) { - if (likely(buffer_locked(tmp))) - goto still_busy; - /* Async buffers must be locked. */ - BUG(); - } - tmp = tmp->b_this_page; - } while (tmp != bh); - spin_unlock_irqrestore(&first->b_uptodate_lock, flags); - /* - * If none of the buffers had errors then we can set the page uptodate, - * but we first have to perform the post read mst fixups, if the - * attribute is mst protected, i.e. if NInoMstProteced(ni) is true. - * Note we ignore fixup errors as those are detected when - * map_mft_record() is called which gives us per record granularity - * rather than per page granularity. - */ - if (!NInoMstProtected(ni)) { - if (likely(page_uptodate && !PageError(page))) - SetPageUptodate(page); - } else { - u8 *kaddr; - unsigned int i, recs; - u32 rec_size; - - rec_size = ni->itype.index.block_size; - recs = PAGE_SIZE / rec_size; - /* Should have been verified before we got here... 
*/ - BUG_ON(!recs); - kaddr = kmap_atomic(page); - for (i = 0; i < recs; i++) - post_read_mst_fixup((NTFS_RECORD*)(kaddr + - i * rec_size), rec_size); - kunmap_atomic(kaddr); - flush_dcache_page(page); - if (likely(page_uptodate && !PageError(page))) - SetPageUptodate(page); - } - unlock_page(page); - return; -still_busy: - spin_unlock_irqrestore(&first->b_uptodate_lock, flags); - return; -} - -/** - * ntfs_read_block - fill a @folio of an address space with data - * @folio: page cache folio to fill with data - * - * We read each buffer asynchronously and when all buffers are read in, our io - * completion handler ntfs_end_buffer_read_async(), if required, automatically - * applies the mst fixups to the folio before finally marking it uptodate and - * unlocking it. - * - * We only enforce allocated_size limit because i_size is checked for in - * generic_file_read(). - * - * Return 0 on success and -errno on error. - * - * Contains an adapted version of fs/buffer.c::block_read_full_folio(). - */ -static int ntfs_read_block(struct folio *folio) -{ - loff_t i_size; - VCN vcn; - LCN lcn; - s64 init_size; - struct inode *vi; - ntfs_inode *ni; - ntfs_volume *vol; - runlist_element *rl; - struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; - sector_t iblock, lblock, zblock; - unsigned long flags; - unsigned int blocksize, vcn_ofs; - int i, nr; - unsigned char blocksize_bits; - - vi = folio->mapping->host; - ni = NTFS_I(vi); - vol = ni->vol; - - /* $MFT/$DATA must have its complete runlist in memory at all times. */ - BUG_ON(!ni->runlist.rl && !ni->mft_no && !NInoAttr(ni)); - - blocksize = vol->sb->s_blocksize; - blocksize_bits = vol->sb->s_blocksize_bits; - - head = folio_buffers(folio); - if (!head) - head = create_empty_buffers(folio, blocksize, 0); - bh = head; - - /* - * We may be racing with truncate. To avoid some of the problems we - * now take a snapshot of the various sizes and use those for the whole - * of the function. In case of an extending truncate it just means we - * may leave some buffers unmapped which are now allocated. This is - * not a problem since these buffers will just get mapped when a write - * occurs. In case of a shrinking truncate, we will detect this later - * on due to the runlist being incomplete and if the folio is being - * fully truncated, truncate will throw it away as soon as we unlock - * it so no need to worry what we do with it. - */ - iblock = (s64)folio->index << (PAGE_SHIFT - blocksize_bits); - read_lock_irqsave(&ni->size_lock, flags); - lblock = (ni->allocated_size + blocksize - 1) >> blocksize_bits; - init_size = ni->initialized_size; - i_size = i_size_read(vi); - read_unlock_irqrestore(&ni->size_lock, flags); - if (unlikely(init_size > i_size)) { - /* Race with shrinking truncate. */ - init_size = i_size; - } - zblock = (init_size + blocksize - 1) >> blocksize_bits; - - /* Loop through all the buffers in the folio. */ - rl = NULL; - nr = i = 0; - do { - int err = 0; - - if (unlikely(buffer_uptodate(bh))) - continue; - if (unlikely(buffer_mapped(bh))) { - arr[nr++] = bh; - continue; - } - bh->b_bdev = vol->sb->s_bdev; - /* Is the block within the allowed limits? */ - if (iblock < lblock) { - bool is_retry = false; - - /* Convert iblock into corresponding vcn and offset. 
*/ - vcn = (VCN)iblock << blocksize_bits >> - vol->cluster_size_bits; - vcn_ofs = ((VCN)iblock << blocksize_bits) & - vol->cluster_size_mask; - if (!rl) { -lock_retry_remap: - down_read(&ni->runlist.lock); - rl = ni->runlist.rl; - } - if (likely(rl != NULL)) { - /* Seek to element containing target vcn. */ - while (rl->length && rl[1].vcn <= vcn) - rl++; - lcn = ntfs_rl_vcn_to_lcn(rl, vcn); - } else - lcn = LCN_RL_NOT_MAPPED; - /* Successful remap. */ - if (lcn >= 0) { - /* Setup buffer head to correct block. */ - bh->b_blocknr = ((lcn << vol->cluster_size_bits) - + vcn_ofs) >> blocksize_bits; - set_buffer_mapped(bh); - /* Only read initialized data blocks. */ - if (iblock < zblock) { - arr[nr++] = bh; - continue; - } - /* Fully non-initialized data block, zero it. */ - goto handle_zblock; - } - /* It is a hole, need to zero it. */ - if (lcn == LCN_HOLE) - goto handle_hole; - /* If first try and runlist unmapped, map and retry. */ - if (!is_retry && lcn == LCN_RL_NOT_MAPPED) { - is_retry = true; - /* - * Attempt to map runlist, dropping lock for - * the duration. - */ - up_read(&ni->runlist.lock); - err = ntfs_map_runlist(ni, vcn); - if (likely(!err)) - goto lock_retry_remap; - rl = NULL; - } else if (!rl) - up_read(&ni->runlist.lock); - /* - * If buffer is outside the runlist, treat it as a - * hole. This can happen due to concurrent truncate - * for example. - */ - if (err == -ENOENT || lcn == LCN_ENOENT) { - err = 0; - goto handle_hole; - } - /* Hard error, zero out region. */ - if (!err) - err = -EIO; - bh->b_blocknr = -1; - folio_set_error(folio); - ntfs_error(vol->sb, "Failed to read from inode 0x%lx, " - "attribute type 0x%x, vcn 0x%llx, " - "offset 0x%x because its location on " - "disk could not be determined%s " - "(error code %i).", ni->mft_no, - ni->type, (unsigned long long)vcn, - vcn_ofs, is_retry ? " even after " - "retrying" : "", err); - } - /* - * Either iblock was outside lblock limits or - * ntfs_rl_vcn_to_lcn() returned error. Just zero that portion - * of the folio and set the buffer uptodate. - */ -handle_hole: - bh->b_blocknr = -1UL; - clear_buffer_mapped(bh); -handle_zblock: - folio_zero_range(folio, i * blocksize, blocksize); - if (likely(!err)) - set_buffer_uptodate(bh); - } while (i++, iblock++, (bh = bh->b_this_page) != head); - - /* Release the lock if we took it. */ - if (rl) - up_read(&ni->runlist.lock); - - /* Check we have at least one buffer ready for i/o. */ - if (nr) { - struct buffer_head *tbh; - - /* Lock the buffers. */ - for (i = 0; i < nr; i++) { - tbh = arr[i]; - lock_buffer(tbh); - tbh->b_end_io = ntfs_end_buffer_async_read; - set_buffer_async_read(tbh); - } - /* Finally, start i/o on the buffers. */ - for (i = 0; i < nr; i++) { - tbh = arr[i]; - if (likely(!buffer_uptodate(tbh))) - submit_bh(REQ_OP_READ, tbh); - else - ntfs_end_buffer_async_read(tbh, 1); - } - return 0; - } - /* No i/o was scheduled on any of the buffers. */ - if (likely(!folio_test_error(folio))) - folio_mark_uptodate(folio); - else /* Signal synchronous i/o error. 
*/ - nr = -EIO; - folio_unlock(folio); - return nr; -} - -/** - * ntfs_read_folio - fill a @folio of a @file with data from the device - * @file: open file to which the folio @folio belongs or NULL - * @folio: page cache folio to fill with data - * - * For non-resident attributes, ntfs_read_folio() fills the @folio of the open - * file @file by calling the ntfs version of the generic block_read_full_folio() - * function, ntfs_read_block(), which in turn creates and reads in the buffers - * associated with the folio asynchronously. - * - * For resident attributes, OTOH, ntfs_read_folio() fills @folio by copying the - * data from the mft record (which at this stage is most likely in memory) and - * fills the remainder with zeroes. Thus, in this case, I/O is synchronous, as - * even if the mft record is not cached at this point in time, we need to wait - * for it to be read in before we can do the copy. - * - * Return 0 on success and -errno on error. - */ -static int ntfs_read_folio(struct file *file, struct folio *folio) -{ - struct page *page = &folio->page; - loff_t i_size; - struct inode *vi; - ntfs_inode *ni, *base_ni; - u8 *addr; - ntfs_attr_search_ctx *ctx; - MFT_RECORD *mrec; - unsigned long flags; - u32 attr_len; - int err = 0; - -retry_readpage: - BUG_ON(!PageLocked(page)); - vi = page->mapping->host; - i_size = i_size_read(vi); - /* Is the page fully outside i_size? (truncate in progress) */ - if (unlikely(page->index >= (i_size + PAGE_SIZE - 1) >> - PAGE_SHIFT)) { - zero_user(page, 0, PAGE_SIZE); - ntfs_debug("Read outside i_size - truncated?"); - goto done; - } - /* - * This can potentially happen because we clear PageUptodate() during - * ntfs_writepage() of MstProtected() attributes. - */ - if (PageUptodate(page)) { - unlock_page(page); - return 0; - } - ni = NTFS_I(vi); - /* - * Only $DATA attributes can be encrypted and only unnamed $DATA - * attributes can be compressed. Index root can have the flags set but - * this means to create compressed/encrypted files, not that the - * attribute is compressed/encrypted. Note we need to check for - * AT_INDEX_ALLOCATION since this is the type of both directory and - * index inodes. - */ - if (ni->type != AT_INDEX_ALLOCATION) { - /* If attribute is encrypted, deny access, just like NT4. */ - if (NInoEncrypted(ni)) { - BUG_ON(ni->type != AT_DATA); - err = -EACCES; - goto err_out; - } - /* Compressed data streams are handled in compress.c. */ - if (NInoNonResident(ni) && NInoCompressed(ni)) { - BUG_ON(ni->type != AT_DATA); - BUG_ON(ni->name_len); - return ntfs_read_compressed_block(page); - } - } - /* NInoNonResident() == NInoIndexAllocPresent() */ - if (NInoNonResident(ni)) { - /* Normal, non-resident data stream. */ - return ntfs_read_block(folio); - } - /* - * Attribute is resident, implying it is not compressed or encrypted. - * This also means the attribute is smaller than an mft record and - * hence smaller than a page, so can simply zero out any pages with - * index above 0. Note the attribute can actually be marked compressed - * but if it is resident the actual data is not compressed so we are - * ok to ignore the compressed flag here. - */ - if (unlikely(page->index > 0)) { - zero_user(page, 0, PAGE_SIZE); - goto done; - } - if (!NInoAttr(ni)) - base_ni = ni; - else - base_ni = ni->ext.base_ntfs_ino; - /* Map, pin, and lock the mft record. 
*/ - mrec = map_mft_record(base_ni); - if (IS_ERR(mrec)) { - err = PTR_ERR(mrec); - goto err_out; - } - /* - * If a parallel write made the attribute non-resident, drop the mft - * record and retry the read_folio. - */ - if (unlikely(NInoNonResident(ni))) { - unmap_mft_record(base_ni); - goto retry_readpage; - } - ctx = ntfs_attr_get_search_ctx(base_ni, mrec); - if (unlikely(!ctx)) { - err = -ENOMEM; - goto unm_err_out; - } - err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, - CASE_SENSITIVE, 0, NULL, 0, ctx); - if (unlikely(err)) - goto put_unm_err_out; - attr_len = le32_to_cpu(ctx->attr->data.resident.value_length); - read_lock_irqsave(&ni->size_lock, flags); - if (unlikely(attr_len > ni->initialized_size)) - attr_len = ni->initialized_size; - i_size = i_size_read(vi); - read_unlock_irqrestore(&ni->size_lock, flags); - if (unlikely(attr_len > i_size)) { - /* Race with shrinking truncate. */ - attr_len = i_size; - } - addr = kmap_atomic(page); - /* Copy the data to the page. */ - memcpy(addr, (u8*)ctx->attr + - le16_to_cpu(ctx->attr->data.resident.value_offset), - attr_len); - /* Zero the remainder of the page. */ - memset(addr + attr_len, 0, PAGE_SIZE - attr_len); - flush_dcache_page(page); - kunmap_atomic(addr); -put_unm_err_out: - ntfs_attr_put_search_ctx(ctx); -unm_err_out: - unmap_mft_record(base_ni); -done: - SetPageUptodate(page); -err_out: - unlock_page(page); - return err; -} - -#ifdef NTFS_RW - -/** - * ntfs_write_block - write a @folio to the backing store - * @folio: page cache folio to write out - * @wbc: writeback control structure - * - * This function is for writing folios belonging to non-resident, non-mst - * protected attributes to their backing store. - * - * For a folio with buffers, map and write the dirty buffers asynchronously - * under folio writeback. For a folio without buffers, create buffers for the - * folio, then proceed as above. - * - * If a folio doesn't have buffers the folio dirty state is definitive. If - * a folio does have buffers, the folio dirty state is just a hint, - * and the buffer dirty state is definitive. (A hint which has rules: - * dirty buffers against a clean folio is illegal. Other combinations are - * legal and need to be handled. In particular a dirty folio containing - * clean buffers for example.) - * - * Return 0 on success and -errno on error. - * - * Based on ntfs_read_block() and __block_write_full_folio(). - */ -static int ntfs_write_block(struct folio *folio, struct writeback_control *wbc) -{ - VCN vcn; - LCN lcn; - s64 initialized_size; - loff_t i_size; - sector_t block, dblock, iblock; - struct inode *vi; - ntfs_inode *ni; - ntfs_volume *vol; - runlist_element *rl; - struct buffer_head *bh, *head; - unsigned long flags; - unsigned int blocksize, vcn_ofs; - int err; - bool need_end_writeback; - unsigned char blocksize_bits; - - vi = folio->mapping->host; - ni = NTFS_I(vi); - vol = ni->vol; - - ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index " - "0x%lx.", ni->mft_no, ni->type, folio->index); - - BUG_ON(!NInoNonResident(ni)); - BUG_ON(NInoMstProtected(ni)); - blocksize = vol->sb->s_blocksize; - blocksize_bits = vol->sb->s_blocksize_bits; - head = folio_buffers(folio); - if (!head) { - BUG_ON(!folio_test_uptodate(folio)); - head = create_empty_buffers(folio, blocksize, - (1 << BH_Uptodate) | (1 << BH_Dirty)); - } - bh = head; - - /* NOTE: Different naming scheme to ntfs_read_block()! */ - - /* The first block in the folio. 
*/ - block = (s64)folio->index << (PAGE_SHIFT - blocksize_bits); - - read_lock_irqsave(&ni->size_lock, flags); - i_size = i_size_read(vi); - initialized_size = ni->initialized_size; - read_unlock_irqrestore(&ni->size_lock, flags); - - /* The first out of bounds block for the data size. */ - dblock = (i_size + blocksize - 1) >> blocksize_bits; - - /* The last (fully or partially) initialized block. */ - iblock = initialized_size >> blocksize_bits; - - /* - * Be very careful. We have no exclusion from block_dirty_folio - * here, and the (potentially unmapped) buffers may become dirty at - * any time. If a buffer becomes dirty here after we've inspected it - * then we just miss that fact, and the folio stays dirty. - * - * Buffers outside i_size may be dirtied by block_dirty_folio; - * handle that here by just cleaning them. - */ - - /* - * Loop through all the buffers in the folio, mapping all the dirty - * buffers to disk addresses and handling any aliases from the - * underlying block device's mapping. - */ - rl = NULL; - err = 0; - do { - bool is_retry = false; - - if (unlikely(block >= dblock)) { - /* - * Mapped buffers outside i_size will occur, because - * this folio can be outside i_size when there is a - * truncate in progress. The contents of such buffers - * were zeroed by ntfs_writepage(). - * - * FIXME: What about the small race window where - * ntfs_writepage() has not done any clearing because - * the folio was within i_size but before we get here, - * vmtruncate() modifies i_size? - */ - clear_buffer_dirty(bh); - set_buffer_uptodate(bh); - continue; - } - - /* Clean buffers are not written out, so no need to map them. */ - if (!buffer_dirty(bh)) - continue; - - /* Make sure we have enough initialized size. */ - if (unlikely((block >= iblock) && - (initialized_size < i_size))) { - /* - * If this folio is fully outside initialized - * size, zero out all folios between the current - * initialized size and the current folio. Just - * use ntfs_read_folio() to do the zeroing - * transparently. - */ - if (block > iblock) { - // TODO: - // For each folio do: - // - read_cache_folio() - // Again for each folio do: - // - wait_on_folio_locked() - // - Check (folio_test_uptodate(folio) && - // !folio_test_error(folio)) - // Update initialized size in the attribute and - // in the inode. - // Again, for each folio do: - // block_dirty_folio(); - // folio_put() - // We don't need to wait on the writes. - // Update iblock. - } - /* - * The current folio straddles initialized size. Zero - * all non-uptodate buffers and set them uptodate (and - * dirty?). Note, there aren't any non-uptodate buffers - * if the folio is uptodate. - * FIXME: For an uptodate folio, the buffers may need to - * be written out because they were not initialized on - * disk before. - */ - if (!folio_test_uptodate(folio)) { - // TODO: - // Zero any non-uptodate buffers up to i_size. - // Set them uptodate and dirty. - } - // TODO: - // Update initialized size in the attribute and in the - // inode (up to i_size). - // Update iblock. - // FIXME: This is inefficient. Try to batch the two - // size changes to happen in one go. - ntfs_error(vol->sb, "Writing beyond initialized size " - "is not supported yet. Sorry."); - err = -EOPNOTSUPP; - break; - // Do NOT set_buffer_new() BUT DO clear buffer range - // outside write request range. - // set_buffer_uptodate() on complete buffers as well as - // set_buffer_dirty(). - } - - /* No need to map buffers that are already mapped. 
*/ - if (buffer_mapped(bh)) - continue; - - /* Unmapped, dirty buffer. Need to map it. */ - bh->b_bdev = vol->sb->s_bdev; - - /* Convert block into corresponding vcn and offset. */ - vcn = (VCN)block << blocksize_bits; - vcn_ofs = vcn & vol->cluster_size_mask; - vcn >>= vol->cluster_size_bits; - if (!rl) { -lock_retry_remap: - down_read(&ni->runlist.lock); - rl = ni->runlist.rl; - } - if (likely(rl != NULL)) { - /* Seek to element containing target vcn. */ - while (rl->length && rl[1].vcn <= vcn) - rl++; - lcn = ntfs_rl_vcn_to_lcn(rl, vcn); - } else - lcn = LCN_RL_NOT_MAPPED; - /* Successful remap. */ - if (lcn >= 0) { - /* Setup buffer head to point to correct block. */ - bh->b_blocknr = ((lcn << vol->cluster_size_bits) + - vcn_ofs) >> blocksize_bits; - set_buffer_mapped(bh); - continue; - } - /* It is a hole, need to instantiate it. */ - if (lcn == LCN_HOLE) { - u8 *kaddr; - unsigned long *bpos, *bend; - - /* Check if the buffer is zero. */ - kaddr = kmap_local_folio(folio, bh_offset(bh)); - bpos = (unsigned long *)kaddr; - bend = (unsigned long *)(kaddr + blocksize); - do { - if (unlikely(*bpos)) - break; - } while (likely(++bpos < bend)); - kunmap_local(kaddr); - if (bpos == bend) { - /* - * Buffer is zero and sparse, no need to write - * it. - */ - bh->b_blocknr = -1; - clear_buffer_dirty(bh); - continue; - } - // TODO: Instantiate the hole. - // clear_buffer_new(bh); - // clean_bdev_bh_alias(bh); - ntfs_error(vol->sb, "Writing into sparse regions is " - "not supported yet. Sorry."); - err = -EOPNOTSUPP; - break; - } - /* If first try and runlist unmapped, map and retry. */ - if (!is_retry && lcn == LCN_RL_NOT_MAPPED) { - is_retry = true; - /* - * Attempt to map runlist, dropping lock for - * the duration. - */ - up_read(&ni->runlist.lock); - err = ntfs_map_runlist(ni, vcn); - if (likely(!err)) - goto lock_retry_remap; - rl = NULL; - } else if (!rl) - up_read(&ni->runlist.lock); - /* - * If buffer is outside the runlist, truncate has cut it out - * of the runlist. Just clean and clear the buffer and set it - * uptodate so it can get discarded by the VM. - */ - if (err == -ENOENT || lcn == LCN_ENOENT) { - bh->b_blocknr = -1; - clear_buffer_dirty(bh); - folio_zero_range(folio, bh_offset(bh), blocksize); - set_buffer_uptodate(bh); - err = 0; - continue; - } - /* Failed to map the buffer, even after retrying. */ - if (!err) - err = -EIO; - bh->b_blocknr = -1; - ntfs_error(vol->sb, "Failed to write to inode 0x%lx, " - "attribute type 0x%x, vcn 0x%llx, offset 0x%x " - "because its location on disk could not be " - "determined%s (error code %i).", ni->mft_no, - ni->type, (unsigned long long)vcn, - vcn_ofs, is_retry ? " even after " - "retrying" : "", err); - break; - } while (block++, (bh = bh->b_this_page) != head); - - /* Release the lock if we took it. */ - if (rl) - up_read(&ni->runlist.lock); - - /* For the error case, need to reset bh to the beginning. */ - bh = head; - - /* Just an optimization, so ->read_folio() is not called later. */ - if (unlikely(!folio_test_uptodate(folio))) { - int uptodate = 1; - do { - if (!buffer_uptodate(bh)) { - uptodate = 0; - bh = head; - break; - } - } while ((bh = bh->b_this_page) != head); - if (uptodate) - folio_mark_uptodate(folio); - } - - /* Setup all mapped, dirty buffers for async write i/o. 
*/ - do { - if (buffer_mapped(bh) && buffer_dirty(bh)) { - lock_buffer(bh); - if (test_clear_buffer_dirty(bh)) { - BUG_ON(!buffer_uptodate(bh)); - mark_buffer_async_write(bh); - } else - unlock_buffer(bh); - } else if (unlikely(err)) { - /* - * For the error case. The buffer may have been set - * dirty during attachment to a dirty folio. - */ - if (err != -ENOMEM) - clear_buffer_dirty(bh); - } - } while ((bh = bh->b_this_page) != head); - - if (unlikely(err)) { - // TODO: Remove the -EOPNOTSUPP check later on... - if (unlikely(err == -EOPNOTSUPP)) - err = 0; - else if (err == -ENOMEM) { - ntfs_warning(vol->sb, "Error allocating memory. " - "Redirtying folio so we try again " - "later."); - /* - * Put the folio back on mapping->dirty_pages, but - * leave its buffer's dirty state as-is. - */ - folio_redirty_for_writepage(wbc, folio); - err = 0; - } else - folio_set_error(folio); - } - - BUG_ON(folio_test_writeback(folio)); - folio_start_writeback(folio); /* Keeps try_to_free_buffers() away. */ - - /* Submit the prepared buffers for i/o. */ - need_end_writeback = true; - do { - struct buffer_head *next = bh->b_this_page; - if (buffer_async_write(bh)) { - submit_bh(REQ_OP_WRITE, bh); - need_end_writeback = false; - } - bh = next; - } while (bh != head); - folio_unlock(folio); - - /* If no i/o was started, need to end writeback here. */ - if (unlikely(need_end_writeback)) - folio_end_writeback(folio); - - ntfs_debug("Done."); - return err; -} - -/** - * ntfs_write_mst_block - write a @page to the backing store - * @page: page cache page to write out - * @wbc: writeback control structure - * - * This function is for writing pages belonging to non-resident, mst protected - * attributes to their backing store. The only supported attributes are index - * allocation and $MFT/$DATA. Both directory inodes and index inodes are - * supported for the index allocation case. - * - * The page must remain locked for the duration of the write because we apply - * the mst fixups, write, and then undo the fixups, so if we were to unlock the - * page before undoing the fixups, any other user of the page will see the - * page contents as corrupt. - * - * We clear the page uptodate flag for the duration of the function to ensure - * exclusion for the $MFT/$DATA case against someone mapping an mft record we - * are about to apply the mst fixups to. - * - * Return 0 on success and -errno on error. - * - * Based on ntfs_write_block(), ntfs_mft_writepage(), and - * write_mft_record_nolock(). 
- */ -static int ntfs_write_mst_block(struct page *page, - struct writeback_control *wbc) -{ - sector_t block, dblock, rec_block; - struct inode *vi = page->mapping->host; - ntfs_inode *ni = NTFS_I(vi); - ntfs_volume *vol = ni->vol; - u8 *kaddr; - unsigned int rec_size = ni->itype.index.block_size; - ntfs_inode *locked_nis[PAGE_SIZE / NTFS_BLOCK_SIZE]; - struct buffer_head *bh, *head, *tbh, *rec_start_bh; - struct buffer_head *bhs[MAX_BUF_PER_PAGE]; - runlist_element *rl; - int i, nr_locked_nis, nr_recs, nr_bhs, max_bhs, bhs_per_rec, err, err2; - unsigned bh_size, rec_size_bits; - bool sync, is_mft, page_is_dirty, rec_is_dirty; - unsigned char bh_size_bits; - - if (WARN_ON(rec_size < NTFS_BLOCK_SIZE)) - return -EINVAL; - - ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index " - "0x%lx.", vi->i_ino, ni->type, page->index); - BUG_ON(!NInoNonResident(ni)); - BUG_ON(!NInoMstProtected(ni)); - is_mft = (S_ISREG(vi->i_mode) && !vi->i_ino); - /* - * NOTE: ntfs_write_mst_block() would be called for $MFTMirr if a page - * in its page cache were to be marked dirty. However this should - * never happen with the current driver and considering we do not - * handle this case here we do want to BUG(), at least for now. - */ - BUG_ON(!(is_mft || S_ISDIR(vi->i_mode) || - (NInoAttr(ni) && ni->type == AT_INDEX_ALLOCATION))); - bh_size = vol->sb->s_blocksize; - bh_size_bits = vol->sb->s_blocksize_bits; - max_bhs = PAGE_SIZE / bh_size; - BUG_ON(!max_bhs); - BUG_ON(max_bhs > MAX_BUF_PER_PAGE); - - /* Were we called for sync purposes? */ - sync = (wbc->sync_mode == WB_SYNC_ALL); - - /* Make sure we have mapped buffers. */ - bh = head = page_buffers(page); - BUG_ON(!bh); - - rec_size_bits = ni->itype.index.block_size_bits; - BUG_ON(!(PAGE_SIZE >> rec_size_bits)); - bhs_per_rec = rec_size >> bh_size_bits; - BUG_ON(!bhs_per_rec); - - /* The first block in the page. */ - rec_block = block = (sector_t)page->index << - (PAGE_SHIFT - bh_size_bits); - - /* The first out of bounds block for the data size. */ - dblock = (i_size_read(vi) + bh_size - 1) >> bh_size_bits; - - rl = NULL; - err = err2 = nr_bhs = nr_recs = nr_locked_nis = 0; - page_is_dirty = rec_is_dirty = false; - rec_start_bh = NULL; - do { - bool is_retry = false; - - if (likely(block < rec_block)) { - if (unlikely(block >= dblock)) { - clear_buffer_dirty(bh); - set_buffer_uptodate(bh); - continue; - } - /* - * This block is not the first one in the record. We - * ignore the buffer's dirty state because we could - * have raced with a parallel mark_ntfs_record_dirty(). - */ - if (!rec_is_dirty) - continue; - if (unlikely(err2)) { - if (err2 != -ENOMEM) - clear_buffer_dirty(bh); - continue; - } - } else /* if (block == rec_block) */ { - BUG_ON(block > rec_block); - /* This block is the first one in the record. */ - rec_block += bhs_per_rec; - err2 = 0; - if (unlikely(block >= dblock)) { - clear_buffer_dirty(bh); - continue; - } - if (!buffer_dirty(bh)) { - /* Clean records are not written out. */ - rec_is_dirty = false; - continue; - } - rec_is_dirty = true; - rec_start_bh = bh; - } - /* Need to map the buffer if it is not mapped already. */ - if (unlikely(!buffer_mapped(bh))) { - VCN vcn; - LCN lcn; - unsigned int vcn_ofs; - - bh->b_bdev = vol->sb->s_bdev; - /* Obtain the vcn and offset of the current block. 
*/ - vcn = (VCN)block << bh_size_bits; - vcn_ofs = vcn & vol->cluster_size_mask; - vcn >>= vol->cluster_size_bits; - if (!rl) { -lock_retry_remap: - down_read(&ni->runlist.lock); - rl = ni->runlist.rl; - } - if (likely(rl != NULL)) { - /* Seek to element containing target vcn. */ - while (rl->length && rl[1].vcn <= vcn) - rl++; - lcn = ntfs_rl_vcn_to_lcn(rl, vcn); - } else - lcn = LCN_RL_NOT_MAPPED; - /* Successful remap. */ - if (likely(lcn >= 0)) { - /* Setup buffer head to correct block. */ - bh->b_blocknr = ((lcn << - vol->cluster_size_bits) + - vcn_ofs) >> bh_size_bits; - set_buffer_mapped(bh); - } else { - /* - * Remap failed. Retry to map the runlist once - * unless we are working on $MFT which always - * has the whole of its runlist in memory. - */ - if (!is_mft && !is_retry && - lcn == LCN_RL_NOT_MAPPED) { - is_retry = true; - /* - * Attempt to map runlist, dropping - * lock for the duration. - */ - up_read(&ni->runlist.lock); - err2 = ntfs_map_runlist(ni, vcn); - if (likely(!err2)) - goto lock_retry_remap; - if (err2 == -ENOMEM) - page_is_dirty = true; - lcn = err2; - } else { - err2 = -EIO; - if (!rl) - up_read(&ni->runlist.lock); - } - /* Hard error. Abort writing this record. */ - if (!err || err == -ENOMEM) - err = err2; - bh->b_blocknr = -1; - ntfs_error(vol->sb, "Cannot write ntfs record " - "0x%llx (inode 0x%lx, " - "attribute type 0x%x) because " - "its location on disk could " - "not be determined (error " - "code %lli).", - (long long)block << - bh_size_bits >> - vol->mft_record_size_bits, - ni->mft_no, ni->type, - (long long)lcn); - /* - * If this is not the first buffer, remove the - * buffers in this record from the list of - * buffers to write and clear their dirty bit - * if not error -ENOMEM. - */ - if (rec_start_bh != bh) { - while (bhs[--nr_bhs] != rec_start_bh) - ; - if (err2 != -ENOMEM) { - do { - clear_buffer_dirty( - rec_start_bh); - } while ((rec_start_bh = - rec_start_bh-> - b_this_page) != - bh); - } - } - continue; - } - } - BUG_ON(!buffer_uptodate(bh)); - BUG_ON(nr_bhs >= max_bhs); - bhs[nr_bhs++] = bh; - } while (block++, (bh = bh->b_this_page) != head); - if (unlikely(rl)) - up_read(&ni->runlist.lock); - /* If there were no dirty buffers, we are done. */ - if (!nr_bhs) - goto done; - /* Map the page so we can access its contents. */ - kaddr = kmap(page); - /* Clear the page uptodate flag whilst the mst fixups are applied. */ - BUG_ON(!PageUptodate(page)); - ClearPageUptodate(page); - for (i = 0; i < nr_bhs; i++) { - unsigned int ofs; - - /* Skip buffers which are not at the beginning of records. */ - if (i % bhs_per_rec) - continue; - tbh = bhs[i]; - ofs = bh_offset(tbh); - if (is_mft) { - ntfs_inode *tni; - unsigned long mft_no; - - /* Get the mft record number. */ - mft_no = (((s64)page->index << PAGE_SHIFT) + ofs) - >> rec_size_bits; - /* Check whether to write this mft record. */ - tni = NULL; - if (!ntfs_may_write_mft_record(vol, mft_no, - (MFT_RECORD*)(kaddr + ofs), &tni)) { - /* - * The record should not be written. This - * means we need to redirty the page before - * returning. - */ - page_is_dirty = true; - /* - * Remove the buffers in this mft record from - * the list of buffers to write. - */ - do { - bhs[i] = NULL; - } while (++i % bhs_per_rec); - continue; - } - /* - * The record should be written. If a locked ntfs - * inode was returned, add it to the array of locked - * ntfs inodes. - */ - if (tni) - locked_nis[nr_locked_nis++] = tni; - } - /* Apply the mst protection fixups. 
*/ - err2 = pre_write_mst_fixup((NTFS_RECORD*)(kaddr + ofs), - rec_size); - if (unlikely(err2)) { - if (!err || err == -ENOMEM) - err = -EIO; - ntfs_error(vol->sb, "Failed to apply mst fixups " - "(inode 0x%lx, attribute type 0x%x, " - "page index 0x%lx, page offset 0x%x)!" - " Unmount and run chkdsk.", vi->i_ino, - ni->type, page->index, ofs); - /* - * Mark all the buffers in this record clean as we do - * not want to write corrupt data to disk. - */ - do { - clear_buffer_dirty(bhs[i]); - bhs[i] = NULL; - } while (++i % bhs_per_rec); - continue; - } - nr_recs++; - } - /* If no records are to be written out, we are done. */ - if (!nr_recs) - goto unm_done; - flush_dcache_page(page); - /* Lock buffers and start synchronous write i/o on them. */ - for (i = 0; i < nr_bhs; i++) { - tbh = bhs[i]; - if (!tbh) - continue; - if (!trylock_buffer(tbh)) - BUG(); - /* The buffer dirty state is now irrelevant, just clean it. */ - clear_buffer_dirty(tbh); - BUG_ON(!buffer_uptodate(tbh)); - BUG_ON(!buffer_mapped(tbh)); - get_bh(tbh); - tbh->b_end_io = end_buffer_write_sync; - submit_bh(REQ_OP_WRITE, tbh); - } - /* Synchronize the mft mirror now if not @sync. */ - if (is_mft && !sync) - goto do_mirror; -do_wait: - /* Wait on i/o completion of buffers. */ - for (i = 0; i < nr_bhs; i++) { - tbh = bhs[i]; - if (!tbh) - continue; - wait_on_buffer(tbh); - if (unlikely(!buffer_uptodate(tbh))) { - ntfs_error(vol->sb, "I/O error while writing ntfs " - "record buffer (inode 0x%lx, " - "attribute type 0x%x, page index " - "0x%lx, page offset 0x%lx)! Unmount " - "and run chkdsk.", vi->i_ino, ni->type, - page->index, bh_offset(tbh)); - if (!err || err == -ENOMEM) - err = -EIO; - /* - * Set the buffer uptodate so the page and buffer - * states do not become out of sync. - */ - set_buffer_uptodate(tbh); - } - } - /* If @sync, now synchronize the mft mirror. */ - if (is_mft && sync) { -do_mirror: - for (i = 0; i < nr_bhs; i++) { - unsigned long mft_no; - unsigned int ofs; - - /* - * Skip buffers which are not at the beginning of - * records. - */ - if (i % bhs_per_rec) - continue; - tbh = bhs[i]; - /* Skip removed buffers (and hence records). */ - if (!tbh) - continue; - ofs = bh_offset(tbh); - /* Get the mft record number. */ - mft_no = (((s64)page->index << PAGE_SHIFT) + ofs) - >> rec_size_bits; - if (mft_no < vol->mftmirr_size) - ntfs_sync_mft_mirror(vol, mft_no, - (MFT_RECORD*)(kaddr + ofs), - sync); - } - if (!sync) - goto do_wait; - } - /* Remove the mst protection fixups again. */ - for (i = 0; i < nr_bhs; i++) { - if (!(i % bhs_per_rec)) { - tbh = bhs[i]; - if (!tbh) - continue; - post_write_mst_fixup((NTFS_RECORD*)(kaddr + - bh_offset(tbh))); - } - } - flush_dcache_page(page); -unm_done: - /* Unlock any locked inodes. */ - while (nr_locked_nis-- > 0) { - ntfs_inode *tni, *base_tni; - - tni = locked_nis[nr_locked_nis]; - /* Get the base inode. */ - mutex_lock(&tni->extent_lock); - if (tni->nr_extents >= 0) - base_tni = tni; - else { - base_tni = tni->ext.base_ntfs_ino; - BUG_ON(!base_tni); - } - mutex_unlock(&tni->extent_lock); - ntfs_debug("Unlocking %s inode 0x%lx.", - tni == base_tni ? "base" : "extent", - tni->mft_no); - mutex_unlock(&tni->mrec_lock); - atomic_dec(&tni->count); - iput(VFS_I(base_tni)); - } - SetPageUptodate(page); - kunmap(page); -done: - if (unlikely(err && err != -ENOMEM)) { - /* - * Set page error if there is only one ntfs record in the page. - * Otherwise we would loose per-record granularity. 
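pre_write_mst_fixup() and post_write_mst_fixup() live in fs/ntfs/mst.c and are not part of this file. The following is a simplified sketch of the protection scheme they implement, assuming the canonical NTFS update sequence array layout and the kernel's fixed-width types from <linux/types.h>; the real code also validates the record header and handles wrap-around of the sequence number at 0xffff:

#define NTFS_BLOCK_SIZE 512	/* one protected sector */

static void mst_protect(u8 *rec, u16 usa_ofs, u16 nr_sectors)
{
	__le16 *usa = (__le16 *)(rec + usa_ofs);	/* update sequence array */
	u16 usn = le16_to_cpu(usa[0]) + 1;		/* bump the sequence no. */
	u16 i;

	usa[0] = cpu_to_le16(usn);
	for (i = 1; i <= nr_sectors; i++) {
		/* Last two bytes of the i-th 512-byte sector of the record. */
		__le16 *pos = (__le16 *)(rec + i * NTFS_BLOCK_SIZE) - 1;

		usa[i] = *pos;			/* save the real data... */
		*pos = cpu_to_le16(usn);	/* ...and stamp the usn */
	}
}

post_write_mst_fixup() reverses this by copying usa[1..n] back over the sector tails, which is why the code above strips the fixups again once the i/o has been submitted: a torn write is later detectable because some sector tail no longer matches usa[0].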
- */ - if (ni->itype.index.block_size == PAGE_SIZE) - SetPageError(page); - NVolSetErrors(vol); - } - if (page_is_dirty) { - ntfs_debug("Page still contains one or more dirty ntfs " - "records. Redirtying the page starting at " - "record 0x%lx.", page->index << - (PAGE_SHIFT - rec_size_bits)); - redirty_page_for_writepage(wbc, page); - unlock_page(page); - } else { - /* - * Keep the VM happy. This must be done otherwise the - * radix-tree tag PAGECACHE_TAG_DIRTY remains set even though - * the page is clean. - */ - BUG_ON(PageWriteback(page)); - set_page_writeback(page); - unlock_page(page); - end_page_writeback(page); - } - if (likely(!err)) - ntfs_debug("Done."); - return err; -} - -/** - * ntfs_writepage - write a @page to the backing store - * @page: page cache page to write out - * @wbc: writeback control structure - * - * This is called from the VM when it wants to have a dirty ntfs page cache - * page cleaned. The VM has already locked the page and marked it clean. - * - * For non-resident attributes, ntfs_writepage() writes the @page by calling - * the ntfs version of the generic block_write_full_folio() function, - * ntfs_write_block(), which in turn if necessary creates and writes the - * buffers associated with the page asynchronously. - * - * For resident attributes, OTOH, ntfs_writepage() writes the @page by copying - * the data to the mft record (which at this stage is most likely in memory). - * The mft record is then marked dirty and written out asynchronously via the - * vfs inode dirty code path for the inode the mft record belongs to or via the - * vm page dirty code path for the page the mft record is in. - * - * Based on ntfs_read_folio() and fs/buffer.c::block_write_full_folio(). - * - * Return 0 on success and -errno on error. - */ -static int ntfs_writepage(struct page *page, struct writeback_control *wbc) -{ - struct folio *folio = page_folio(page); - loff_t i_size; - struct inode *vi = folio->mapping->host; - ntfs_inode *base_ni = NULL, *ni = NTFS_I(vi); - char *addr; - ntfs_attr_search_ctx *ctx = NULL; - MFT_RECORD *m = NULL; - u32 attr_len; - int err; - -retry_writepage: - BUG_ON(!folio_test_locked(folio)); - i_size = i_size_read(vi); - /* Is the folio fully outside i_size? (truncate in progress) */ - if (unlikely(folio->index >= (i_size + PAGE_SIZE - 1) >> - PAGE_SHIFT)) { - /* - * The folio may have dirty, unmapped buffers. Make them - * freeable here, so the page does not leak. - */ - block_invalidate_folio(folio, 0, folio_size(folio)); - folio_unlock(folio); - ntfs_debug("Write outside i_size - truncated?"); - return 0; - } - /* - * Only $DATA attributes can be encrypted and only unnamed $DATA - * attributes can be compressed. Index root can have the flags set but - * this means to create compressed/encrypted files, not that the - * attribute is compressed/encrypted. Note we need to check for - * AT_INDEX_ALLOCATION since this is the type of both directory and - * index inodes. - */ - if (ni->type != AT_INDEX_ALLOCATION) { - /* If file is encrypted, deny access, just like NT4. */ - if (NInoEncrypted(ni)) { - folio_unlock(folio); - BUG_ON(ni->type != AT_DATA); - ntfs_debug("Denying write access to encrypted file."); - return -EACCES; - } - /* Compressed data streams are handled in compress.c. 
*/ - if (NInoNonResident(ni) && NInoCompressed(ni)) { - BUG_ON(ni->type != AT_DATA); - BUG_ON(ni->name_len); - // TODO: Implement and replace this with - // return ntfs_write_compressed_block(page); - folio_unlock(folio); - ntfs_error(vi->i_sb, "Writing to compressed files is " - "not supported yet. Sorry."); - return -EOPNOTSUPP; - } - // TODO: Implement and remove this check. - if (NInoNonResident(ni) && NInoSparse(ni)) { - folio_unlock(folio); - ntfs_error(vi->i_sb, "Writing to sparse files is not " - "supported yet. Sorry."); - return -EOPNOTSUPP; - } - } - /* NInoNonResident() == NInoIndexAllocPresent() */ - if (NInoNonResident(ni)) { - /* We have to zero every time due to mmap-at-end-of-file. */ - if (folio->index >= (i_size >> PAGE_SHIFT)) { - /* The folio straddles i_size. */ - unsigned int ofs = i_size & (folio_size(folio) - 1); - folio_zero_segment(folio, ofs, folio_size(folio)); - } - /* Handle mst protected attributes. */ - if (NInoMstProtected(ni)) - return ntfs_write_mst_block(page, wbc); - /* Normal, non-resident data stream. */ - return ntfs_write_block(folio, wbc); - } - /* - * Attribute is resident, implying it is not compressed, encrypted, or - * mst protected. This also means the attribute is smaller than an mft - * record and hence smaller than a folio, so can simply return error on - * any folios with index above 0. Note the attribute can actually be - * marked compressed but if it is resident the actual data is not - * compressed so we are ok to ignore the compressed flag here. - */ - BUG_ON(folio_buffers(folio)); - BUG_ON(!folio_test_uptodate(folio)); - if (unlikely(folio->index > 0)) { - ntfs_error(vi->i_sb, "BUG()! folio->index (0x%lx) > 0. " - "Aborting write.", folio->index); - BUG_ON(folio_test_writeback(folio)); - folio_start_writeback(folio); - folio_unlock(folio); - folio_end_writeback(folio); - return -EIO; - } - if (!NInoAttr(ni)) - base_ni = ni; - else - base_ni = ni->ext.base_ntfs_ino; - /* Map, pin, and lock the mft record. */ - m = map_mft_record(base_ni); - if (IS_ERR(m)) { - err = PTR_ERR(m); - m = NULL; - ctx = NULL; - goto err_out; - } - /* - * If a parallel write made the attribute non-resident, drop the mft - * record and retry the writepage. - */ - if (unlikely(NInoNonResident(ni))) { - unmap_mft_record(base_ni); - goto retry_writepage; - } - ctx = ntfs_attr_get_search_ctx(base_ni, m); - if (unlikely(!ctx)) { - err = -ENOMEM; - goto err_out; - } - err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, - CASE_SENSITIVE, 0, NULL, 0, ctx); - if (unlikely(err)) - goto err_out; - /* - * Keep the VM happy. This must be done otherwise - * PAGECACHE_TAG_DIRTY remains set even though the folio is clean. - */ - BUG_ON(folio_test_writeback(folio)); - folio_start_writeback(folio); - folio_unlock(folio); - attr_len = le32_to_cpu(ctx->attr->data.resident.value_length); - i_size = i_size_read(vi); - if (unlikely(attr_len > i_size)) { - /* Race with shrinking truncate or a failed truncate. */ - attr_len = i_size; - /* - * If the truncate failed, fix it up now. If a concurrent - * truncate, we do its job, so it does not have to do anything. - */ - err = ntfs_resident_attr_value_resize(ctx->mrec, ctx->attr, - attr_len); - /* Shrinking cannot fail. */ - BUG_ON(err); - } - addr = kmap_local_folio(folio, 0); - /* Copy the data from the folio to the mft record. */ - memcpy((u8*)ctx->attr + - le16_to_cpu(ctx->attr->data.resident.value_offset), - addr, attr_len); - /* Zero out of bounds area in the page cache folio. 
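The memcpy() above relies on the resident attribute layout: the value is stored inline in the attribute record itself, at value_offset, for value_length bytes. A hypothetical read-side helper (not part of the driver) makes the layout explicit:

static void *resident_value(ATTR_RECORD *a, u32 *len)
{
	*len = le32_to_cpu(a->data.resident.value_length);
	return (u8 *)a + le16_to_cpu(a->data.resident.value_offset);
}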
*/ - memset(addr + attr_len, 0, folio_size(folio) - attr_len); - kunmap_local(addr); - flush_dcache_folio(folio); - flush_dcache_mft_record_page(ctx->ntfs_ino); - /* We are done with the folio. */ - folio_end_writeback(folio); - /* Finally, mark the mft record dirty, so it gets written back. */ - mark_mft_record_dirty(ctx->ntfs_ino); - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(base_ni); - return 0; -err_out: - if (err == -ENOMEM) { - ntfs_warning(vi->i_sb, "Error allocating memory. Redirtying " - "page so we try again later."); - /* - * Put the folio back on mapping->dirty_pages, but leave its - * buffers' dirty state as-is. - */ - folio_redirty_for_writepage(wbc, folio); - err = 0; - } else { - ntfs_error(vi->i_sb, "Resident attribute write failed with " - "error %i.", err); - folio_set_error(folio); - NVolSetErrors(ni->vol); - } - folio_unlock(folio); - if (ctx) - ntfs_attr_put_search_ctx(ctx); - if (m) - unmap_mft_record(base_ni); - return err; -} - -#endif /* NTFS_RW */ - -/** - * ntfs_bmap - map logical file block to physical device block - * @mapping: address space mapping to which the block to be mapped belongs - * @block: logical block to map to its physical device block - * - * For regular, non-resident files (i.e. not compressed and not encrypted), map - * the logical @block belonging to the file described by the address space - * mapping @mapping to its physical device block. - * - * The size of the block is equal to the @s_blocksize field of the super block - * of the mounted file system which is guaranteed to be smaller than or equal - * to the cluster size thus the block is guaranteed to fit entirely inside the - * cluster which means we do not need to care how many contiguous bytes are - * available after the beginning of the block. - * - * Return the physical device block if the mapping succeeded or 0 if the block - * is sparse or there was an error. - * - * Note: This is a problem if someone tries to run bmap() on $Boot system file - * as that really is in block zero but there is nothing we can do. bmap() is - * just broken in that respect (just like it cannot distinguish sparse from - * not available or error). - */ -static sector_t ntfs_bmap(struct address_space *mapping, sector_t block) -{ - s64 ofs, size; - loff_t i_size; - LCN lcn; - unsigned long blocksize, flags; - ntfs_inode *ni = NTFS_I(mapping->host); - ntfs_volume *vol = ni->vol; - unsigned delta; - unsigned char blocksize_bits, cluster_size_shift; - - ntfs_debug("Entering for mft_no 0x%lx, logical block 0x%llx.", - ni->mft_no, (unsigned long long)block); - if (ni->type != AT_DATA || !NInoNonResident(ni) || NInoEncrypted(ni)) { - ntfs_error(vol->sb, "BMAP does not make sense for %s " - "attributes, returning 0.", - (ni->type != AT_DATA) ? "non-data" : - (!NInoNonResident(ni) ? "resident" : - "encrypted")); - return 0; - } - /* None of these can happen. */ - BUG_ON(NInoCompressed(ni)); - BUG_ON(NInoMstProtected(ni)); - blocksize = vol->sb->s_blocksize; - blocksize_bits = vol->sb->s_blocksize_bits; - ofs = (s64)block << blocksize_bits; - read_lock_irqsave(&ni->size_lock, flags); - size = ni->initialized_size; - i_size = i_size_read(VFS_I(ni)); - read_unlock_irqrestore(&ni->size_lock, flags); - /* - * If the offset is outside the initialized size or the block straddles - * the initialized size then pretend it is a hole unless the - * initialized size equals the file size. 
- */ - if (unlikely(ofs >= size || (ofs + blocksize > size && size < i_size))) - goto hole; - cluster_size_shift = vol->cluster_size_bits; - down_read(&ni->runlist.lock); - lcn = ntfs_attr_vcn_to_lcn_nolock(ni, ofs >> cluster_size_shift, false); - up_read(&ni->runlist.lock); - if (unlikely(lcn < LCN_HOLE)) { - /* - * Step down to an integer to avoid gcc doing a long long - * comparision in the switch when we know @lcn is between - * LCN_HOLE and LCN_EIO (i.e. -1 to -5). - * - * Otherwise older gcc (at least on some architectures) will - * try to use __cmpdi2() which is of course not available in - * the kernel. - */ - switch ((int)lcn) { - case LCN_ENOENT: - /* - * If the offset is out of bounds then pretend it is a - * hole. - */ - goto hole; - case LCN_ENOMEM: - ntfs_error(vol->sb, "Not enough memory to complete " - "mapping for inode 0x%lx. " - "Returning 0.", ni->mft_no); - break; - default: - ntfs_error(vol->sb, "Failed to complete mapping for " - "inode 0x%lx. Run chkdsk. " - "Returning 0.", ni->mft_no); - break; - } - return 0; - } - if (lcn < 0) { - /* It is a hole. */ -hole: - ntfs_debug("Done (returning hole)."); - return 0; - } - /* - * The block is really allocated and fullfils all our criteria. - * Convert the cluster to units of block size and return the result. - */ - delta = ofs & vol->cluster_size_mask; - if (unlikely(sizeof(block) < sizeof(lcn))) { - block = lcn = ((lcn << cluster_size_shift) + delta) >> - blocksize_bits; - /* If the block number was truncated return 0. */ - if (unlikely(block != lcn)) { - ntfs_error(vol->sb, "Physical block 0x%llx is too " - "large to be returned, returning 0.", - (long long)lcn); - return 0; - } - } else - block = ((lcn << cluster_size_shift) + delta) >> - blocksize_bits; - ntfs_debug("Done (returning block 0x%llx).", (unsigned long long)lcn); - return block; -} - -/* - * ntfs_normal_aops - address space operations for normal inodes and attributes - * - * Note these are not used for compressed or mst protected inodes and - * attributes. - */ -const struct address_space_operations ntfs_normal_aops = { - .read_folio = ntfs_read_folio, -#ifdef NTFS_RW - .writepage = ntfs_writepage, - .dirty_folio = block_dirty_folio, -#endif /* NTFS_RW */ - .bmap = ntfs_bmap, - .migrate_folio = buffer_migrate_folio, - .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_folio = generic_error_remove_folio, -}; - -/* - * ntfs_compressed_aops - address space operations for compressed inodes - */ -const struct address_space_operations ntfs_compressed_aops = { - .read_folio = ntfs_read_folio, -#ifdef NTFS_RW - .writepage = ntfs_writepage, - .dirty_folio = block_dirty_folio, -#endif /* NTFS_RW */ - .migrate_folio = buffer_migrate_folio, - .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_folio = generic_error_remove_folio, -}; - -/* - * ntfs_mst_aops - general address space operations for mst protecteed inodes - * and attributes - */ -const struct address_space_operations ntfs_mst_aops = { - .read_folio = ntfs_read_folio, /* Fill page with data. */ -#ifdef NTFS_RW - .writepage = ntfs_writepage, /* Write dirty page to disk. 
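The sparse-versus-error ambiguity that the ntfs_bmap() comment above points out is visible from user space through the FIBMAP ioctl, which is the usual route into ->bmap(). A hypothetical test program (FIBMAP requires CAP_SYS_RAWIO):

#include <stdio.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

int main(int argc, char **argv)
{
	int block = 0;		/* in: logical block; out: physical block */
	int fd;

	if (argc < 2 || (fd = open(argv[1], O_RDONLY)) < 0)
		return 1;
	if (ioctl(fd, FIBMAP, &block) < 0) {
		perror("FIBMAP");
		return 1;
	}
	/* 0 can mean "sparse", "$Boot at block zero" or "error" alike. */
	printf("logical block 0 -> physical block %d\n", block);
	return 0;
}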
*/ - .dirty_folio = filemap_dirty_folio, -#endif /* NTFS_RW */ - .migrate_folio = buffer_migrate_folio, - .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_folio = generic_error_remove_folio, -}; - -#ifdef NTFS_RW - -/** - * mark_ntfs_record_dirty - mark an ntfs record dirty - * @page: page containing the ntfs record to mark dirty - * @ofs: byte offset within @page at which the ntfs record begins - * - * Set the buffers and the page in which the ntfs record is located dirty. - * - * The latter also marks the vfs inode the ntfs record belongs to dirty - * (I_DIRTY_PAGES only). - * - * If the page does not have buffers, we create them and set them uptodate. - * The page may not be locked which is why we need to handle the buffers under - * the mapping->i_private_lock. Once the buffers are marked dirty we no longer - * need the lock since try_to_free_buffers() does not free dirty buffers. - */ -void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) { - struct address_space *mapping = page->mapping; - ntfs_inode *ni = NTFS_I(mapping->host); - struct buffer_head *bh, *head, *buffers_to_free = NULL; - unsigned int end, bh_size, bh_ofs; - - BUG_ON(!PageUptodate(page)); - end = ofs + ni->itype.index.block_size; - bh_size = VFS_I(ni)->i_sb->s_blocksize; - spin_lock(&mapping->i_private_lock); - if (unlikely(!page_has_buffers(page))) { - spin_unlock(&mapping->i_private_lock); - bh = head = alloc_page_buffers(page, bh_size, true); - spin_lock(&mapping->i_private_lock); - if (likely(!page_has_buffers(page))) { - struct buffer_head *tail; - - do { - set_buffer_uptodate(bh); - tail = bh; - bh = bh->b_this_page; - } while (bh); - tail->b_this_page = head; - attach_page_private(page, head); - } else - buffers_to_free = bh; - } - bh = head = page_buffers(page); - BUG_ON(!bh); - do { - bh_ofs = bh_offset(bh); - if (bh_ofs + bh_size <= ofs) - continue; - if (unlikely(bh_ofs >= end)) - break; - set_buffer_dirty(bh); - } while ((bh = bh->b_this_page) != head); - spin_unlock(&mapping->i_private_lock); - filemap_dirty_folio(mapping, page_folio(page)); - if (unlikely(buffers_to_free)) { - do { - bh = buffers_to_free->b_this_page; - free_buffer_head(buffers_to_free); - buffers_to_free = bh; - } while (buffers_to_free); - } -} - -#endif /* NTFS_RW */ diff --git a/fs/ntfs/aops.h b/fs/ntfs/aops.h deleted file mode 100644 index 8d0958a149cb2..0000000000000 --- a/fs/ntfs/aops.h +++ /dev/null @@ -1,88 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * aops.h - Defines for NTFS kernel address space operations and page cache - * handling. Part of the Linux-NTFS project. - * - * Copyright (c) 2001-2004 Anton Altaparmakov - * Copyright (c) 2002 Richard Russon - */ - -#ifndef _LINUX_NTFS_AOPS_H -#define _LINUX_NTFS_AOPS_H - -#include -#include -#include -#include - -#include "inode.h" - -/** - * ntfs_unmap_page - release a page that was mapped using ntfs_map_page() - * @page: the page to release - * - * Unpin, unmap and release a page that was obtained from ntfs_map_page(). - */ -static inline void ntfs_unmap_page(struct page *page) -{ - kunmap(page); - put_page(page); -} - -/** - * ntfs_map_page - map a page into accessible memory, reading it if necessary - * @mapping: address space for which to obtain the page - * @index: index into the page cache for @mapping of the page to map - * - * Read a page from the page cache of the address space @mapping at position - * @index, where @index is in units of PAGE_SIZE, and not in bytes. 
- * - * If the page is not in memory it is loaded from disk first using the - * read_folio method defined in the address space operations of @mapping - * and the page is added to the page cache of @mapping in the process. - * - * If the page belongs to an mst protected attribute and it is marked as such - * in its ntfs inode (NInoMstProtected()) the mst fixups are applied but no - * error checking is performed. This means the caller has to verify whether - * the ntfs record(s) contained in the page are valid or not using one of the - * ntfs_is_XXXX_record{,p}() macros, where XXXX is the record type you are - * expecting to see. (For details of the macros, see fs/ntfs/layout.h.) - * - * If the page is in high memory it is mapped into memory directly addressible - * by the kernel. - * - * Finally the page count is incremented, thus pinning the page into place. - * - * The above means that page_address(page) can be used on all pages obtained - * with ntfs_map_page() to get the kernel virtual address of the page. - * - * When finished with the page, the caller has to call ntfs_unmap_page() to - * unpin, unmap and release the page. - * - * Note this does not grant exclusive access. If such is desired, the caller - * must provide it independently of the ntfs_{un}map_page() calls by using - * a {rw_}semaphore or other means of serialization. A spin lock cannot be - * used as ntfs_map_page() can block. - * - * The unlocked and uptodate page is returned on success or an encoded error - * on failure. Caller has to test for error using the IS_ERR() macro on the - * return value. If that evaluates to 'true', the negative error code can be - * obtained using PTR_ERR() on the return value of ntfs_map_page(). - */ -static inline struct page *ntfs_map_page(struct address_space *mapping, - unsigned long index) -{ - struct page *page = read_mapping_page(mapping, index, NULL); - - if (!IS_ERR(page)) - kmap(page); - return page; -} - -#ifdef NTFS_RW - -extern void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs); - -#endif /* NTFS_RW */ - -#endif /* _LINUX_NTFS_AOPS_H */ diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c deleted file mode 100644 index f79408f9127af..0000000000000 --- a/fs/ntfs/attrib.c +++ /dev/null @@ -1,2624 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * attrib.c - NTFS attribute operations. Part of the Linux-NTFS project. - * - * Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc. - * Copyright (c) 2002 Richard Russon - */ - -#include -#include -#include -#include -#include - -#include "attrib.h" -#include "debug.h" -#include "layout.h" -#include "lcnalloc.h" -#include "malloc.h" -#include "mft.h" -#include "ntfs.h" -#include "types.h" - -/** - * ntfs_map_runlist_nolock - map (a part of) a runlist of an ntfs inode - * @ni: ntfs inode for which to map (part of) a runlist - * @vcn: map runlist part containing this vcn - * @ctx: active attribute search context if present or NULL if not - * - * Map the part of a runlist containing the @vcn of the ntfs inode @ni. - * - * If @ctx is specified, it is an active search context of @ni and its base mft - * record. This is needed when ntfs_map_runlist_nolock() encounters unmapped - * runlist fragments and allows their mapping. If you do not have the mft - * record mapped, you can specify @ctx as NULL and ntfs_map_runlist_nolock() - * will perform the necessary mapping and unmapping. - * - * Note, ntfs_map_runlist_nolock() saves the state of @ctx on entry and - * restores it before returning. 
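A hypothetical caller of the ntfs_map_page()/ntfs_unmap_page() pair from aops.h above, following the IS_ERR()/PTR_ERR() protocol its comment spells out:

static int peek_first_byte(struct address_space *mapping, u8 *val)
{
	struct page *page = ntfs_map_page(mapping, 0);

	if (IS_ERR(page))
		return PTR_ERR(page);
	/* Page is pinned, mapped and uptodate; page_address() is valid. */
	*val = *(u8 *)page_address(page);
	ntfs_unmap_page(page);	/* kunmap() + put_page() */
	return 0;
}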
Thus, @ctx will be left pointing to the same - * attribute on return as on entry. However, the actual pointers in @ctx may - * point to different memory locations on return, so you must remember to reset - * any cached pointers from the @ctx, i.e. after the call to - * ntfs_map_runlist_nolock(), you will probably want to do: - * m = ctx->mrec; - * a = ctx->attr; - * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that - * you cache ctx->mrec in a variable @m of type MFT_RECORD *. - * - * Return 0 on success and -errno on error. There is one special error code - * which is not an error as such. This is -ENOENT. It means that @vcn is out - * of bounds of the runlist. - * - * Note the runlist can be NULL after this function returns if @vcn is zero and - * the attribute has zero allocated size, i.e. there simply is no runlist. - * - * WARNING: If @ctx is supplied, regardless of whether success or failure is - * returned, you need to check IS_ERR(@ctx->mrec) and if 'true' the @ctx - * is no longer valid, i.e. you need to either call - * ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it. - * In that case PTR_ERR(@ctx->mrec) will give you the error code for - * why the mapping of the old inode failed. - * - * Locking: - The runlist described by @ni must be locked for writing on entry - * and is locked on return. Note the runlist will be modified. - * - If @ctx is NULL, the base mft record of @ni must not be mapped on - * entry and it will be left unmapped on return. - * - If @ctx is not NULL, the base mft record must be mapped on entry - * and it will be left mapped on return. - */ -int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn, ntfs_attr_search_ctx *ctx) -{ - VCN end_vcn; - unsigned long flags; - ntfs_inode *base_ni; - MFT_RECORD *m; - ATTR_RECORD *a; - runlist_element *rl; - struct page *put_this_page = NULL; - int err = 0; - bool ctx_is_temporary, ctx_needs_reset; - ntfs_attr_search_ctx old_ctx = { NULL, }; - - ntfs_debug("Mapping runlist part containing vcn 0x%llx.", - (unsigned long long)vcn); - if (!NInoAttr(ni)) - base_ni = ni; - else - base_ni = ni->ext.base_ntfs_ino; - if (!ctx) { - ctx_is_temporary = ctx_needs_reset = true; - m = map_mft_record(base_ni); - if (IS_ERR(m)) - return PTR_ERR(m); - ctx = ntfs_attr_get_search_ctx(base_ni, m); - if (unlikely(!ctx)) { - err = -ENOMEM; - goto err_out; - } - } else { - VCN allocated_size_vcn; - - BUG_ON(IS_ERR(ctx->mrec)); - a = ctx->attr; - BUG_ON(!a->non_resident); - ctx_is_temporary = false; - end_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn); - read_lock_irqsave(&ni->size_lock, flags); - allocated_size_vcn = ni->allocated_size >> - ni->vol->cluster_size_bits; - read_unlock_irqrestore(&ni->size_lock, flags); - if (!a->data.non_resident.lowest_vcn && end_vcn <= 0) - end_vcn = allocated_size_vcn - 1; - /* - * If we already have the attribute extent containing @vcn in - * @ctx, no need to look it up again. We slightly cheat in - * that if vcn exceeds the allocated size, we will refuse to - * map the runlist below, so there is definitely no need to get - * the right attribute extent. - */ - if (vcn >= allocated_size_vcn || (a->type == ni->type && - a->name_length == ni->name_len && - !memcmp((u8*)a + le16_to_cpu(a->name_offset), - ni->name, ni->name_len) && - sle64_to_cpu(a->data.non_resident.lowest_vcn) - <= vcn && end_vcn >= vcn)) - ctx_needs_reset = false; - else { - /* Save the old search context. 
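The caller-side discipline that the comment at the top of ntfs_map_runlist_nolock() prescribes, as a hypothetical wrapper: revalidate @ctx on every outcome and refresh any cached pointers on success:

static int map_and_refresh(ntfs_inode *ni, VCN vcn,
		ntfs_attr_search_ctx *ctx, MFT_RECORD **m, ATTR_RECORD **a)
{
	int err = ntfs_map_runlist_nolock(ni, vcn, ctx);

	/* @ctx must be checked whether the call succeeded or not. */
	if (IS_ERR(ctx->mrec))
		return PTR_ERR(ctx->mrec);	/* reinit or put @ctx next */
	if (!err) {
		*m = ctx->mrec;	/* the pointers may have moved... */
		*a = ctx->attr;	/* ...so refresh anything cached */
	}
	return err;
}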
*/ - old_ctx = *ctx; - /* - * If the currently mapped (extent) inode is not the - * base inode we will unmap it when we reinitialize the - * search context which means we need to get a - * reference to the page containing the mapped mft - * record so we do not accidentally drop changes to the - * mft record when it has not been marked dirty yet. - */ - if (old_ctx.base_ntfs_ino && old_ctx.ntfs_ino != - old_ctx.base_ntfs_ino) { - put_this_page = old_ctx.ntfs_ino->page; - get_page(put_this_page); - } - /* - * Reinitialize the search context so we can lookup the - * needed attribute extent. - */ - ntfs_attr_reinit_search_ctx(ctx); - ctx_needs_reset = true; - } - } - if (ctx_needs_reset) { - err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, - CASE_SENSITIVE, vcn, NULL, 0, ctx); - if (unlikely(err)) { - if (err == -ENOENT) - err = -EIO; - goto err_out; - } - BUG_ON(!ctx->attr->non_resident); - } - a = ctx->attr; - /* - * Only decompress the mapping pairs if @vcn is inside it. Otherwise - * we get into problems when we try to map an out of bounds vcn because - * we then try to map the already mapped runlist fragment and - * ntfs_mapping_pairs_decompress() fails. - */ - end_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn) + 1; - if (unlikely(vcn && vcn >= end_vcn)) { - err = -ENOENT; - goto err_out; - } - rl = ntfs_mapping_pairs_decompress(ni->vol, a, ni->runlist.rl); - if (IS_ERR(rl)) - err = PTR_ERR(rl); - else - ni->runlist.rl = rl; -err_out: - if (ctx_is_temporary) { - if (likely(ctx)) - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(base_ni); - } else if (ctx_needs_reset) { - /* - * If there is no attribute list, restoring the search context - * is accomplished simply by copying the saved context back over - * the caller supplied context. If there is an attribute list, - * things are more complicated as we need to deal with mapping - * of mft records and resulting potential changes in pointers. - */ - if (NInoAttrList(base_ni)) { - /* - * If the currently mapped (extent) inode is not the - * one we had before, we need to unmap it and map the - * old one. - */ - if (ctx->ntfs_ino != old_ctx.ntfs_ino) { - /* - * If the currently mapped inode is not the - * base inode, unmap it. - */ - if (ctx->base_ntfs_ino && ctx->ntfs_ino != - ctx->base_ntfs_ino) { - unmap_extent_mft_record(ctx->ntfs_ino); - ctx->mrec = ctx->base_mrec; - BUG_ON(!ctx->mrec); - } - /* - * If the old mapped inode is not the base - * inode, map it. - */ - if (old_ctx.base_ntfs_ino && - old_ctx.ntfs_ino != - old_ctx.base_ntfs_ino) { -retry_map: - ctx->mrec = map_mft_record( - old_ctx.ntfs_ino); - /* - * Something bad has happened. If out - * of memory retry till it succeeds. - * Any other errors are fatal and we - * return the error code in ctx->mrec. - * Let the caller deal with it... We - * just need to fudge things so the - * caller can reinit and/or put the - * search context safely. - */ - if (IS_ERR(ctx->mrec)) { - if (PTR_ERR(ctx->mrec) == - -ENOMEM) { - schedule(); - goto retry_map; - } else - old_ctx.ntfs_ino = - old_ctx. - base_ntfs_ino; - } - } - } - /* Update the changed pointers in the saved context. */ - if (ctx->mrec != old_ctx.mrec) { - if (!IS_ERR(ctx->mrec)) - old_ctx.attr = (ATTR_RECORD*)( - (u8*)ctx->mrec + - ((u8*)old_ctx.attr - - (u8*)old_ctx.mrec)); - old_ctx.mrec = ctx->mrec; - } - } - /* Restore the search context to the saved one. */ - *ctx = old_ctx; - /* - * We drop the reference on the page we took earlier. 
In the - * case that IS_ERR(ctx->mrec) is true this means we might lose - * some changes to the mft record that had been made between - * the last time it was marked dirty/written out and now. This - * at this stage is not a problem as the mapping error is fatal - * enough that the mft record cannot be written out anyway and - * the caller is very likely to shutdown the whole inode - * immediately and mark the volume dirty for chkdsk to pick up - * the pieces anyway. - */ - if (put_this_page) - put_page(put_this_page); - } - return err; -} - -/** - * ntfs_map_runlist - map (a part of) a runlist of an ntfs inode - * @ni: ntfs inode for which to map (part of) a runlist - * @vcn: map runlist part containing this vcn - * - * Map the part of a runlist containing the @vcn of the ntfs inode @ni. - * - * Return 0 on success and -errno on error. There is one special error code - * which is not an error as such. This is -ENOENT. It means that @vcn is out - * of bounds of the runlist. - * - * Locking: - The runlist must be unlocked on entry and is unlocked on return. - * - This function takes the runlist lock for writing and may modify - * the runlist. - */ -int ntfs_map_runlist(ntfs_inode *ni, VCN vcn) -{ - int err = 0; - - down_write(&ni->runlist.lock); - /* Make sure someone else didn't do the work while we were sleeping. */ - if (likely(ntfs_rl_vcn_to_lcn(ni->runlist.rl, vcn) <= - LCN_RL_NOT_MAPPED)) - err = ntfs_map_runlist_nolock(ni, vcn, NULL); - up_write(&ni->runlist.lock); - return err; -} - -/** - * ntfs_attr_vcn_to_lcn_nolock - convert a vcn into a lcn given an ntfs inode - * @ni: ntfs inode of the attribute whose runlist to search - * @vcn: vcn to convert - * @write_locked: true if the runlist is locked for writing - * - * Find the virtual cluster number @vcn in the runlist of the ntfs attribute - * described by the ntfs inode @ni and return the corresponding logical cluster - * number (lcn). - * - * If the @vcn is not mapped yet, the attempt is made to map the attribute - * extent containing the @vcn and the vcn to lcn conversion is retried. - * - * If @write_locked is true the caller has locked the runlist for writing and - * if false for reading. - * - * Since lcns must be >= 0, we use negative return codes with special meaning: - * - * Return code Meaning / Description - * ========================================== - * LCN_HOLE Hole / not allocated on disk. - * LCN_ENOENT There is no such vcn in the runlist, i.e. @vcn is out of bounds. - * LCN_ENOMEM Not enough memory to map runlist. - * LCN_EIO Critical error (runlist/file is corrupt, i/o error, etc). - * - * Locking: - The runlist must be locked on entry and is left locked on return. - * - If @write_locked is 'false', i.e. the runlist is locked for reading, - * the lock may be dropped inside the function so you cannot rely on - * the runlist still being the same when this function returns. - */ -LCN ntfs_attr_vcn_to_lcn_nolock(ntfs_inode *ni, const VCN vcn, - const bool write_locked) -{ - LCN lcn; - unsigned long flags; - bool is_retry = false; - - BUG_ON(!ni); - ntfs_debug("Entering for i_ino 0x%lx, vcn 0x%llx, %s_locked.", - ni->mft_no, (unsigned long long)vcn, - write_locked ? "write" : "read"); - BUG_ON(!NInoNonResident(ni)); - BUG_ON(vcn < 0); - if (!ni->runlist.rl) { - read_lock_irqsave(&ni->size_lock, flags); - if (!ni->allocated_size) { - read_unlock_irqrestore(&ni->size_lock, flags); - return LCN_ENOENT; - } - read_unlock_irqrestore(&ni->size_lock, flags); - } -retry_remap: - /* Convert vcn to lcn. 
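A hypothetical consumer of the LCN return-code convention tabulated in the comment above; the (int) cast mirrors the one ntfs_bmap() uses so that gcc does not emit a 64-bit comparison helper for the switch:

static const char *lcn_outcome(LCN lcn)
{
	if (lcn >= 0)
		return "allocated on disk";
	switch ((int)lcn) {
	case LCN_HOLE:
		return "hole: zero on read, allocate on write";
	case LCN_ENOENT:
		return "vcn out of bounds of the runlist";
	case LCN_ENOMEM:
		return "out of memory while mapping the runlist";
	default:	/* LCN_EIO */
		return "critical error: corrupt runlist or i/o error";
	}
}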
If that fails map the runlist and retry once. */ - lcn = ntfs_rl_vcn_to_lcn(ni->runlist.rl, vcn); - if (likely(lcn >= LCN_HOLE)) { - ntfs_debug("Done, lcn 0x%llx.", (long long)lcn); - return lcn; - } - if (lcn != LCN_RL_NOT_MAPPED) { - if (lcn != LCN_ENOENT) - lcn = LCN_EIO; - } else if (!is_retry) { - int err; - - if (!write_locked) { - up_read(&ni->runlist.lock); - down_write(&ni->runlist.lock); - if (unlikely(ntfs_rl_vcn_to_lcn(ni->runlist.rl, vcn) != - LCN_RL_NOT_MAPPED)) { - up_write(&ni->runlist.lock); - down_read(&ni->runlist.lock); - goto retry_remap; - } - } - err = ntfs_map_runlist_nolock(ni, vcn, NULL); - if (!write_locked) { - up_write(&ni->runlist.lock); - down_read(&ni->runlist.lock); - } - if (likely(!err)) { - is_retry = true; - goto retry_remap; - } - if (err == -ENOENT) - lcn = LCN_ENOENT; - else if (err == -ENOMEM) - lcn = LCN_ENOMEM; - else - lcn = LCN_EIO; - } - if (lcn != LCN_ENOENT) - ntfs_error(ni->vol->sb, "Failed with error code %lli.", - (long long)lcn); - return lcn; -} - -/** - * ntfs_attr_find_vcn_nolock - find a vcn in the runlist of an ntfs inode - * @ni: ntfs inode describing the runlist to search - * @vcn: vcn to find - * @ctx: active attribute search context if present or NULL if not - * - * Find the virtual cluster number @vcn in the runlist described by the ntfs - * inode @ni and return the address of the runlist element containing the @vcn. - * - * If the @vcn is not mapped yet, the attempt is made to map the attribute - * extent containing the @vcn and the vcn to lcn conversion is retried. - * - * If @ctx is specified, it is an active search context of @ni and its base mft - * record. This is needed when ntfs_attr_find_vcn_nolock() encounters unmapped - * runlist fragments and allows their mapping. If you do not have the mft - * record mapped, you can specify @ctx as NULL and ntfs_attr_find_vcn_nolock() - * will perform the necessary mapping and unmapping. - * - * Note, ntfs_attr_find_vcn_nolock() saves the state of @ctx on entry and - * restores it before returning. Thus, @ctx will be left pointing to the same - * attribute on return as on entry. However, the actual pointers in @ctx may - * point to different memory locations on return, so you must remember to reset - * any cached pointers from the @ctx, i.e. after the call to - * ntfs_attr_find_vcn_nolock(), you will probably want to do: - * m = ctx->mrec; - * a = ctx->attr; - * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that - * you cache ctx->mrec in a variable @m of type MFT_RECORD *. - * Note you need to distinguish between the lcn of the returned runlist element - * being >= 0 and LCN_HOLE. In the later case you have to return zeroes on - * read and allocate clusters on write. - * - * Return the runlist element containing the @vcn on success and - * ERR_PTR(-errno) on error. You need to test the return value with IS_ERR() - * to decide if the return is success or failure and PTR_ERR() to get to the - * error code if IS_ERR() is true. - * - * The possible error return codes are: - * -ENOENT - No such vcn in the runlist, i.e. @vcn is out of bounds. - * -ENOMEM - Not enough memory to map runlist. - * -EIO - Critical error (runlist/file is corrupt, i/o error, etc). - * - * WARNING: If @ctx is supplied, regardless of whether success or failure is - * returned, you need to check IS_ERR(@ctx->mrec) and if 'true' the @ctx - * is no longer valid, i.e. you need to either call - * ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it. 
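The read-to-write lock promotion in ntfs_attr_vcn_to_lcn_nolock() above is the standard rwsem pattern: the lock cannot be upgraded in place, so the condition must be re-checked once the write lock is held. In generic, hypothetical form (needs_mapping() and map_it() are stand-ins, not driver functions):

static bool needs_mapping(void);	/* stand-in predicate */
static void map_it(void);		/* stand-in worker */

static void lookup_with_read_lock(struct rw_semaphore *sem)
{
	down_read(sem);
	if (needs_mapping()) {
		/* An rwsem cannot be upgraded in place: drop, then retake. */
		up_read(sem);
		down_write(sem);
		/* Re-check: another task may have mapped it while we slept. */
		if (needs_mapping())
			map_it();
		up_write(sem);
		down_read(sem);
	}
	/* ... perform the lookup under the read lock ... */
	up_read(sem);
}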
- * In that case PTR_ERR(@ctx->mrec) will give you the error code for - * why the mapping of the old inode failed. - * - * Locking: - The runlist described by @ni must be locked for writing on entry - * and is locked on return. Note the runlist may be modified when - * needed runlist fragments need to be mapped. - * - If @ctx is NULL, the base mft record of @ni must not be mapped on - * entry and it will be left unmapped on return. - * - If @ctx is not NULL, the base mft record must be mapped on entry - * and it will be left mapped on return. - */ -runlist_element *ntfs_attr_find_vcn_nolock(ntfs_inode *ni, const VCN vcn, - ntfs_attr_search_ctx *ctx) -{ - unsigned long flags; - runlist_element *rl; - int err = 0; - bool is_retry = false; - - BUG_ON(!ni); - ntfs_debug("Entering for i_ino 0x%lx, vcn 0x%llx, with%s ctx.", - ni->mft_no, (unsigned long long)vcn, ctx ? "" : "out"); - BUG_ON(!NInoNonResident(ni)); - BUG_ON(vcn < 0); - if (!ni->runlist.rl) { - read_lock_irqsave(&ni->size_lock, flags); - if (!ni->allocated_size) { - read_unlock_irqrestore(&ni->size_lock, flags); - return ERR_PTR(-ENOENT); - } - read_unlock_irqrestore(&ni->size_lock, flags); - } -retry_remap: - rl = ni->runlist.rl; - if (likely(rl && vcn >= rl[0].vcn)) { - while (likely(rl->length)) { - if (unlikely(vcn < rl[1].vcn)) { - if (likely(rl->lcn >= LCN_HOLE)) { - ntfs_debug("Done."); - return rl; - } - break; - } - rl++; - } - if (likely(rl->lcn != LCN_RL_NOT_MAPPED)) { - if (likely(rl->lcn == LCN_ENOENT)) - err = -ENOENT; - else - err = -EIO; - } - } - if (!err && !is_retry) { - /* - * If the search context is invalid we cannot map the unmapped - * region. - */ - if (IS_ERR(ctx->mrec)) - err = PTR_ERR(ctx->mrec); - else { - /* - * The @vcn is in an unmapped region, map the runlist - * and retry. - */ - err = ntfs_map_runlist_nolock(ni, vcn, ctx); - if (likely(!err)) { - is_retry = true; - goto retry_remap; - } - } - if (err == -EINVAL) - err = -EIO; - } else if (!err) - err = -EIO; - if (err != -ENOENT) - ntfs_error(ni->vol->sb, "Failed with error code %i.", err); - return ERR_PTR(err); -} - -/** - * ntfs_attr_find - find (next) attribute in mft record - * @type: attribute type to find - * @name: attribute name to find (optional, i.e. NULL means don't care) - * @name_len: attribute name length (only needed if @name present) - * @ic: IGNORE_CASE or CASE_SENSITIVE (ignored if @name not present) - * @val: attribute value to find (optional, resident attributes only) - * @val_len: attribute value length - * @ctx: search context with mft record and attribute to search from - * - * You should not need to call this function directly. Use ntfs_attr_lookup() - * instead. - * - * ntfs_attr_find() takes a search context @ctx as parameter and searches the - * mft record specified by @ctx->mrec, beginning at @ctx->attr, for an - * attribute of @type, optionally @name and @val. - * - * If the attribute is found, ntfs_attr_find() returns 0 and @ctx->attr will - * point to the found attribute. - * - * If the attribute is not found, ntfs_attr_find() returns -ENOENT and - * @ctx->attr will point to the attribute before which the attribute being - * searched for would need to be inserted if such an action were to be desired. - * - * On actual error, ntfs_attr_find() returns -EIO. In this case @ctx->attr is - * undefined and in particular do not rely on it not changing. - * - * If @ctx->is_first is 'true', the search begins with @ctx->attr itself. If it - * is 'false', the search begins after @ctx->attr. 
- * - * If @ic is IGNORE_CASE, the @name comparisson is not case sensitive and - * @ctx->ntfs_ino must be set to the ntfs inode to which the mft record - * @ctx->mrec belongs. This is so we can get at the ntfs volume and hence at - * the upcase table. If @ic is CASE_SENSITIVE, the comparison is case - * sensitive. When @name is present, @name_len is the @name length in Unicode - * characters. - * - * If @name is not present (NULL), we assume that the unnamed attribute is - * being searched for. - * - * Finally, the resident attribute value @val is looked for, if present. If - * @val is not present (NULL), @val_len is ignored. - * - * ntfs_attr_find() only searches the specified mft record and it ignores the - * presence of an attribute list attribute (unless it is the one being searched - * for, obviously). If you need to take attribute lists into consideration, - * use ntfs_attr_lookup() instead (see below). This also means that you cannot - * use ntfs_attr_find() to search for extent records of non-resident - * attributes, as extents with lowest_vcn != 0 are usually described by the - * attribute list attribute only. - Note that it is possible that the first - * extent is only in the attribute list while the last extent is in the base - * mft record, so do not rely on being able to find the first extent in the - * base mft record. - * - * Warning: Never use @val when looking for attribute types which can be - * non-resident as this most likely will result in a crash! - */ -static int ntfs_attr_find(const ATTR_TYPE type, const ntfschar *name, - const u32 name_len, const IGNORE_CASE_BOOL ic, - const u8 *val, const u32 val_len, ntfs_attr_search_ctx *ctx) -{ - ATTR_RECORD *a; - ntfs_volume *vol = ctx->ntfs_ino->vol; - ntfschar *upcase = vol->upcase; - u32 upcase_len = vol->upcase_len; - - /* - * Iterate over attributes in mft record starting at @ctx->attr, or the - * attribute following that, if @ctx->is_first is 'true'. - */ - if (ctx->is_first) { - a = ctx->attr; - ctx->is_first = false; - } else - a = (ATTR_RECORD*)((u8*)ctx->attr + - le32_to_cpu(ctx->attr->length)); - for (;; a = (ATTR_RECORD*)((u8*)a + le32_to_cpu(a->length))) { - u8 *mrec_end = (u8 *)ctx->mrec + - le32_to_cpu(ctx->mrec->bytes_allocated); - u8 *name_end; - - /* check whether ATTR_RECORD wrap */ - if ((u8 *)a < (u8 *)ctx->mrec) - break; - - /* check whether Attribute Record Header is within bounds */ - if ((u8 *)a > mrec_end || - (u8 *)a + sizeof(ATTR_RECORD) > mrec_end) - break; - - /* check whether ATTR_RECORD's name is within bounds */ - name_end = (u8 *)a + le16_to_cpu(a->name_offset) + - a->name_length * sizeof(ntfschar); - if (name_end > mrec_end) - break; - - ctx->attr = a; - if (unlikely(le32_to_cpu(a->type) > le32_to_cpu(type) || - a->type == AT_END)) - return -ENOENT; - if (unlikely(!a->length)) - break; - - /* check whether ATTR_RECORD's length wrap */ - if ((u8 *)a + le32_to_cpu(a->length) < (u8 *)a) - break; - /* check whether ATTR_RECORD's length is within bounds */ - if ((u8 *)a + le32_to_cpu(a->length) > mrec_end) - break; - - if (a->type != type) - continue; - /* - * If @name is present, compare the two names. If @name is - * missing, assume we want an unnamed attribute. - */ - if (!name) { - /* The search failed if the found attribute is named. 
*/ - if (a->name_length) - return -ENOENT; - } else if (!ntfs_are_names_equal(name, name_len, - (ntfschar*)((u8*)a + le16_to_cpu(a->name_offset)), - a->name_length, ic, upcase, upcase_len)) { - register int rc; - - rc = ntfs_collate_names(name, name_len, - (ntfschar*)((u8*)a + - le16_to_cpu(a->name_offset)), - a->name_length, 1, IGNORE_CASE, - upcase, upcase_len); - /* - * If @name collates before a->name, there is no - * matching attribute. - */ - if (rc == -1) - return -ENOENT; - /* If the strings are not equal, continue search. */ - if (rc) - continue; - rc = ntfs_collate_names(name, name_len, - (ntfschar*)((u8*)a + - le16_to_cpu(a->name_offset)), - a->name_length, 1, CASE_SENSITIVE, - upcase, upcase_len); - if (rc == -1) - return -ENOENT; - if (rc) - continue; - } - /* - * The names match or @name not present and attribute is - * unnamed. If no @val specified, we have found the attribute - * and are done. - */ - if (!val) - return 0; - /* @val is present; compare values. */ - else { - register int rc; - - rc = memcmp(val, (u8*)a + le16_to_cpu( - a->data.resident.value_offset), - min_t(u32, val_len, le32_to_cpu( - a->data.resident.value_length))); - /* - * If @val collates before the current attribute's - * value, there is no matching attribute. - */ - if (!rc) { - register u32 avl; - - avl = le32_to_cpu( - a->data.resident.value_length); - if (val_len == avl) - return 0; - if (val_len < avl) - return -ENOENT; - } else if (rc < 0) - return -ENOENT; - } - } - ntfs_error(vol->sb, "Inode is corrupt. Run chkdsk."); - NVolSetErrors(vol); - return -EIO; -} - -/** - * load_attribute_list - load an attribute list into memory - * @vol: ntfs volume from which to read - * @runlist: runlist of the attribute list - * @al_start: destination buffer - * @size: size of the destination buffer in bytes - * @initialized_size: initialized size of the attribute list - * - * Walk the runlist @runlist and load all clusters from it copying them into - * the linear buffer @al. The maximum number of bytes copied to @al is @size - * bytes. Note, @size does not need to be a multiple of the cluster size. If - * @initialized_size is less than @size, the region in @al between - * @initialized_size and @size will be zeroed and not read from disk. - * - * Return 0 on success or -errno on error. - */ -int load_attribute_list(ntfs_volume *vol, runlist *runlist, u8 *al_start, - const s64 size, const s64 initialized_size) -{ - LCN lcn; - u8 *al = al_start; - u8 *al_end = al + initialized_size; - runlist_element *rl; - struct buffer_head *bh; - struct super_block *sb; - unsigned long block_size; - unsigned long block, max_block; - int err = 0; - unsigned char block_size_bits; - - ntfs_debug("Entering."); - if (!vol || !runlist || !al || size <= 0 || initialized_size < 0 || - initialized_size > size) - return -EINVAL; - if (!initialized_size) { - memset(al, 0, size); - return 0; - } - sb = vol->sb; - block_size = sb->s_blocksize; - block_size_bits = sb->s_blocksize_bits; - down_read(&runlist->lock); - rl = runlist->rl; - if (!rl) { - ntfs_error(sb, "Cannot read attribute list since runlist is " - "missing."); - goto err_out; - } - /* Read all clusters specified by the runlist one run at a time. */ - while (rl->length) { - lcn = ntfs_rl_vcn_to_lcn(rl, rl->vcn); - ntfs_debug("Reading vcn = 0x%llx, lcn = 0x%llx.", - (unsigned long long)rl->vcn, - (unsigned long long)lcn); - /* The attribute list cannot be sparse. */ - if (lcn < 0) { - ntfs_error(sb, "ntfs_rl_vcn_to_lcn() failed. 
Cannot " - "read attribute list."); - goto err_out; - } - block = lcn << vol->cluster_size_bits >> block_size_bits; - /* Read the run from device in chunks of block_size bytes. */ - max_block = block + (rl->length << vol->cluster_size_bits >> - block_size_bits); - ntfs_debug("max_block = 0x%lx.", max_block); - do { - ntfs_debug("Reading block = 0x%lx.", block); - bh = sb_bread(sb, block); - if (!bh) { - ntfs_error(sb, "sb_bread() failed. Cannot " - "read attribute list."); - goto err_out; - } - if (al + block_size >= al_end) - goto do_final; - memcpy(al, bh->b_data, block_size); - brelse(bh); - al += block_size; - } while (++block < max_block); - rl++; - } - if (initialized_size < size) { -initialize: - memset(al_start + initialized_size, 0, size - initialized_size); - } -done: - up_read(&runlist->lock); - return err; -do_final: - if (al < al_end) { - /* - * Partial block. - * - * Note: The attribute list can be smaller than its allocation - * by multiple clusters. This has been encountered by at least - * two people running Windows XP, thus we cannot do any - * truncation sanity checking here. (AIA) - */ - memcpy(al, bh->b_data, al_end - al); - brelse(bh); - if (initialized_size < size) - goto initialize; - goto done; - } - brelse(bh); - /* Real overflow! */ - ntfs_error(sb, "Attribute list buffer overflow. Read attribute list " - "is truncated."); -err_out: - err = -EIO; - goto done; -} - -/** - * ntfs_external_attr_find - find an attribute in the attribute list of an inode - * @type: attribute type to find - * @name: attribute name to find (optional, i.e. NULL means don't care) - * @name_len: attribute name length (only needed if @name present) - * @ic: IGNORE_CASE or CASE_SENSITIVE (ignored if @name not present) - * @lowest_vcn: lowest vcn to find (optional, non-resident attributes only) - * @val: attribute value to find (optional, resident attributes only) - * @val_len: attribute value length - * @ctx: search context with mft record and attribute to search from - * - * You should not need to call this function directly. Use ntfs_attr_lookup() - * instead. - * - * Find an attribute by searching the attribute list for the corresponding - * attribute list entry. Having found the entry, map the mft record if the - * attribute is in a different mft record/inode, ntfs_attr_find() the attribute - * in there and return it. - * - * On first search @ctx->ntfs_ino must be the base mft record and @ctx must - * have been obtained from a call to ntfs_attr_get_search_ctx(). On subsequent - * calls @ctx->ntfs_ino can be any extent inode, too (@ctx->base_ntfs_ino is - * then the base inode). - * - * After finishing with the attribute/mft record you need to call - * ntfs_attr_put_search_ctx() to cleanup the search context (unmapping any - * mapped inodes, etc). - * - * If the attribute is found, ntfs_external_attr_find() returns 0 and - * @ctx->attr will point to the found attribute. @ctx->mrec will point to the - * mft record in which @ctx->attr is located and @ctx->al_entry will point to - * the attribute list entry for the attribute. - * - * If the attribute is not found, ntfs_external_attr_find() returns -ENOENT and - * @ctx->attr will point to the attribute in the base mft record before which - * the attribute being searched for would need to be inserted if such an action - * were to be desired. 
@ctx->mrec will point to the mft record in which - * @ctx->attr is located and @ctx->al_entry will point to the attribute list - * entry of the attribute before which the attribute being searched for would - * need to be inserted if such an action were to be desired. - * - * Thus to insert the not found attribute, one wants to add the attribute to - * @ctx->mrec (the base mft record) and if there is not enough space, the - * attribute should be placed in a newly allocated extent mft record. The - * attribute list entry for the inserted attribute should be inserted in the - * attribute list attribute at @ctx->al_entry. - * - * On actual error, ntfs_external_attr_find() returns -EIO. In this case - * @ctx->attr is undefined and in particular do not rely on it not changing. - */ -static int ntfs_external_attr_find(const ATTR_TYPE type, - const ntfschar *name, const u32 name_len, - const IGNORE_CASE_BOOL ic, const VCN lowest_vcn, - const u8 *val, const u32 val_len, ntfs_attr_search_ctx *ctx) -{ - ntfs_inode *base_ni, *ni; - ntfs_volume *vol; - ATTR_LIST_ENTRY *al_entry, *next_al_entry; - u8 *al_start, *al_end; - ATTR_RECORD *a; - ntfschar *al_name; - u32 al_name_len; - int err = 0; - static const char *es = " Unmount and run chkdsk."; - - ni = ctx->ntfs_ino; - base_ni = ctx->base_ntfs_ino; - ntfs_debug("Entering for inode 0x%lx, type 0x%x.", ni->mft_no, type); - if (!base_ni) { - /* First call happens with the base mft record. */ - base_ni = ctx->base_ntfs_ino = ctx->ntfs_ino; - ctx->base_mrec = ctx->mrec; - } - if (ni == base_ni) - ctx->base_attr = ctx->attr; - if (type == AT_END) - goto not_found; - vol = base_ni->vol; - al_start = base_ni->attr_list; - al_end = al_start + base_ni->attr_list_size; - if (!ctx->al_entry) - ctx->al_entry = (ATTR_LIST_ENTRY*)al_start; - /* - * Iterate over entries in attribute list starting at @ctx->al_entry, - * or the entry following that, if @ctx->is_first is 'true'. - */ - if (ctx->is_first) { - al_entry = ctx->al_entry; - ctx->is_first = false; - } else - al_entry = (ATTR_LIST_ENTRY*)((u8*)ctx->al_entry + - le16_to_cpu(ctx->al_entry->length)); - for (;; al_entry = next_al_entry) { - /* Out of bounds check. */ - if ((u8*)al_entry < base_ni->attr_list || - (u8*)al_entry > al_end) - break; /* Inode is corrupt. */ - ctx->al_entry = al_entry; - /* Catch the end of the attribute list. */ - if ((u8*)al_entry == al_end) - goto not_found; - if (!al_entry->length) - break; - if ((u8*)al_entry + 6 > al_end || (u8*)al_entry + - le16_to_cpu(al_entry->length) > al_end) - break; - next_al_entry = (ATTR_LIST_ENTRY*)((u8*)al_entry + - le16_to_cpu(al_entry->length)); - if (le32_to_cpu(al_entry->type) > le32_to_cpu(type)) - goto not_found; - if (type != al_entry->type) - continue; - /* - * If @name is present, compare the two names. If @name is - * missing, assume we want an unnamed attribute. - */ - al_name_len = al_entry->name_length; - al_name = (ntfschar*)((u8*)al_entry + al_entry->name_offset); - if (!name) { - if (al_name_len) - goto not_found; - } else if (!ntfs_are_names_equal(al_name, al_name_len, name, - name_len, ic, vol->upcase, vol->upcase_len)) { - register int rc; - - rc = ntfs_collate_names(name, name_len, al_name, - al_name_len, 1, IGNORE_CASE, - vol->upcase, vol->upcase_len); - /* - * If @name collates before al_name, there is no - * matching attribute. - */ - if (rc == -1) - goto not_found; - /* If the strings are not equal, continue search. 
*/ - if (rc) - continue; - /* - * FIXME: Reverse engineering showed 0, IGNORE_CASE but - * that is inconsistent with ntfs_attr_find(). The - * subsequent rc checks were also different. Perhaps I - * made a mistake in one of the two. Need to recheck - * which is correct or at least see what is going on... - * (AIA) - */ - rc = ntfs_collate_names(name, name_len, al_name, - al_name_len, 1, CASE_SENSITIVE, - vol->upcase, vol->upcase_len); - if (rc == -1) - goto not_found; - if (rc) - continue; - } - /* - * The names match or @name not present and attribute is - * unnamed. Now check @lowest_vcn. Continue search if the - * next attribute list entry still fits @lowest_vcn. Otherwise - * we have reached the right one or the search has failed. - */ - if (lowest_vcn && (u8*)next_al_entry >= al_start && - (u8*)next_al_entry + 6 < al_end && - (u8*)next_al_entry + le16_to_cpu( - next_al_entry->length) <= al_end && - sle64_to_cpu(next_al_entry->lowest_vcn) <= - lowest_vcn && - next_al_entry->type == al_entry->type && - next_al_entry->name_length == al_name_len && - ntfs_are_names_equal((ntfschar*)((u8*) - next_al_entry + - next_al_entry->name_offset), - next_al_entry->name_length, - al_name, al_name_len, CASE_SENSITIVE, - vol->upcase, vol->upcase_len)) - continue; - if (MREF_LE(al_entry->mft_reference) == ni->mft_no) { - if (MSEQNO_LE(al_entry->mft_reference) != ni->seq_no) { - ntfs_error(vol->sb, "Found stale mft " - "reference in attribute list " - "of base inode 0x%lx.%s", - base_ni->mft_no, es); - err = -EIO; - break; - } - } else { /* Mft references do not match. */ - /* If there is a mapped record unmap it first. */ - if (ni != base_ni) - unmap_extent_mft_record(ni); - /* Do we want the base record back? */ - if (MREF_LE(al_entry->mft_reference) == - base_ni->mft_no) { - ni = ctx->ntfs_ino = base_ni; - ctx->mrec = ctx->base_mrec; - } else { - /* We want an extent record. */ - ctx->mrec = map_extent_mft_record(base_ni, - le64_to_cpu( - al_entry->mft_reference), &ni); - if (IS_ERR(ctx->mrec)) { - ntfs_error(vol->sb, "Failed to map " - "extent mft record " - "0x%lx of base inode " - "0x%lx.%s", - MREF_LE(al_entry-> - mft_reference), - base_ni->mft_no, es); - err = PTR_ERR(ctx->mrec); - if (err == -ENOENT) - err = -EIO; - /* Cause @ctx to be sanitized below. */ - ni = NULL; - break; - } - ctx->ntfs_ino = ni; - } - ctx->attr = (ATTR_RECORD*)((u8*)ctx->mrec + - le16_to_cpu(ctx->mrec->attrs_offset)); - } - /* - * ctx->vfs_ino, ctx->mrec, and ctx->attr now point to the - * mft record containing the attribute represented by the - * current al_entry. - */ - /* - * We could call into ntfs_attr_find() to find the right - * attribute in this mft record but this would be less - * efficient and not quite accurate as ntfs_attr_find() ignores - * the attribute instance numbers for example which become - * important when one plays with attribute lists. Also, - * because a proper match has been found in the attribute list - * entry above, the comparison can now be optimized. So it is - * worth re-implementing a simplified ntfs_attr_find() here. - */ - a = ctx->attr; - /* - * Use a manual loop so we can still use break and continue - * with the same meanings as above. 
- */ -do_next_attr_loop: - if ((u8*)a < (u8*)ctx->mrec || (u8*)a > (u8*)ctx->mrec + - le32_to_cpu(ctx->mrec->bytes_allocated)) - break; - if (a->type == AT_END) - break; - if (!a->length) - break; - if (al_entry->instance != a->instance) - goto do_next_attr; - /* - * If the type and/or the name are mismatched between the - * attribute list entry and the attribute record, there is - * corruption so we break and return error EIO. - */ - if (al_entry->type != a->type) - break; - if (!ntfs_are_names_equal((ntfschar*)((u8*)a + - le16_to_cpu(a->name_offset)), a->name_length, - al_name, al_name_len, CASE_SENSITIVE, - vol->upcase, vol->upcase_len)) - break; - ctx->attr = a; - /* - * If no @val specified or @val specified and it matches, we - * have found it! - */ - if (!val || (!a->non_resident && le32_to_cpu( - a->data.resident.value_length) == val_len && - !memcmp((u8*)a + - le16_to_cpu(a->data.resident.value_offset), - val, val_len))) { - ntfs_debug("Done, found."); - return 0; - } -do_next_attr: - /* Proceed to the next attribute in the current mft record. */ - a = (ATTR_RECORD*)((u8*)a + le32_to_cpu(a->length)); - goto do_next_attr_loop; - } - if (!err) { - ntfs_error(vol->sb, "Base inode 0x%lx contains corrupt " - "attribute list attribute.%s", base_ni->mft_no, - es); - err = -EIO; - } - if (ni != base_ni) { - if (ni) - unmap_extent_mft_record(ni); - ctx->ntfs_ino = base_ni; - ctx->mrec = ctx->base_mrec; - ctx->attr = ctx->base_attr; - } - if (err != -ENOMEM) - NVolSetErrors(vol); - return err; -not_found: - /* - * If we were looking for AT_END, we reset the search context @ctx and - * use ntfs_attr_find() to seek to the end of the base mft record. - */ - if (type == AT_END) { - ntfs_attr_reinit_search_ctx(ctx); - return ntfs_attr_find(AT_END, name, name_len, ic, val, val_len, - ctx); - } - /* - * The attribute was not found. Before we return, we want to ensure - * @ctx->mrec and @ctx->attr indicate the position at which the - * attribute should be inserted in the base mft record. Since we also - * want to preserve @ctx->al_entry we cannot reinitialize the search - * context using ntfs_attr_reinit_search_ctx() as this would set - * @ctx->al_entry to NULL. Thus we do the necessary bits manually (see - * ntfs_attr_init_search_ctx() below). Note, we _only_ preserve - * @ctx->al_entry as the remaining fields (base_*) are identical to - * their non base_ counterparts and we cannot set @ctx->base_attr - * correctly yet as we do not know what @ctx->attr will be set to by - * the call to ntfs_attr_find() below. - */ - if (ni != base_ni) - unmap_extent_mft_record(ni); - ctx->mrec = ctx->base_mrec; - ctx->attr = (ATTR_RECORD*)((u8*)ctx->mrec + - le16_to_cpu(ctx->mrec->attrs_offset)); - ctx->is_first = true; - ctx->ntfs_ino = base_ni; - ctx->base_ntfs_ino = NULL; - ctx->base_mrec = NULL; - ctx->base_attr = NULL; - /* - * In case there are multiple matches in the base mft record, need to - * keep enumerating until we get an attribute not found response (or - * another error), otherwise we would keep returning the same attribute - * over and over again and all programs using us for enumeration would - * lock up in a tight loop. - */ - do { - err = ntfs_attr_find(type, name, name_len, ic, val, val_len, - ctx); - } while (!err); - ntfs_debug("Done, not found."); - return err; -} - -/** - * ntfs_attr_lookup - find an attribute in an ntfs inode - * @type: attribute type to find - * @name: attribute name to find (optional, i.e. 
NULL means don't care) - * @name_len: attribute name length (only needed if @name present) - * @ic: IGNORE_CASE or CASE_SENSITIVE (ignored if @name not present) - * @lowest_vcn: lowest vcn to find (optional, non-resident attributes only) - * @val: attribute value to find (optional, resident attributes only) - * @val_len: attribute value length - * @ctx: search context with mft record and attribute to search from - * - * Find an attribute in an ntfs inode. On first search @ctx->ntfs_ino must - * be the base mft record and @ctx must have been obtained from a call to - * ntfs_attr_get_search_ctx(). - * - * This function transparently handles attribute lists and @ctx is used to - * continue searches where they were left off at. - * - * After finishing with the attribute/mft record you need to call - * ntfs_attr_put_search_ctx() to cleanup the search context (unmapping any - * mapped inodes, etc). - * - * Return 0 if the search was successful and -errno if not. - * - * When 0, @ctx->attr is the found attribute and it is in mft record - * @ctx->mrec. If an attribute list attribute is present, @ctx->al_entry is - * the attribute list entry of the found attribute. - * - * When -ENOENT, @ctx->attr is the attribute which collates just after the - * attribute being searched for, i.e. if one wants to add the attribute to the - * mft record this is the correct place to insert it into. If an attribute - * list attribute is present, @ctx->al_entry is the attribute list entry which - * collates just after the attribute list entry of the attribute being searched - * for, i.e. if one wants to add the attribute to the mft record this is the - * correct place to insert its attribute list entry into. - * - * When -errno != -ENOENT, an error occurred during the lookup. @ctx->attr is - * then undefined and in particular you should not rely on it not changing. - */ -int ntfs_attr_lookup(const ATTR_TYPE type, const ntfschar *name, - const u32 name_len, const IGNORE_CASE_BOOL ic, - const VCN lowest_vcn, const u8 *val, const u32 val_len, - ntfs_attr_search_ctx *ctx) -{ - ntfs_inode *base_ni; - - ntfs_debug("Entering."); - BUG_ON(IS_ERR(ctx->mrec)); - if (ctx->base_ntfs_ino) - base_ni = ctx->base_ntfs_ino; - else - base_ni = ctx->ntfs_ino; - /* Sanity check, just for debugging really. */ - BUG_ON(!base_ni); - if (!NInoAttrList(base_ni) || type == AT_ATTRIBUTE_LIST) - return ntfs_attr_find(type, name, name_len, ic, val, val_len, - ctx); - return ntfs_external_attr_find(type, name, name_len, ic, lowest_vcn, - val, val_len, ctx); -} - -/** - * ntfs_attr_init_search_ctx - initialize an attribute search context - * @ctx: attribute search context to initialize - * @ni: ntfs inode with which to initialize the search context - * @mrec: mft record with which to initialize the search context - * - * Initialize the attribute search context @ctx with @ni and @mrec. - */ -static inline void ntfs_attr_init_search_ctx(ntfs_attr_search_ctx *ctx, - ntfs_inode *ni, MFT_RECORD *mrec) -{ - *ctx = (ntfs_attr_search_ctx) { - .mrec = mrec, - /* Sanity checks are performed elsewhere. */ - .attr = (ATTR_RECORD*)((u8*)mrec + - le16_to_cpu(mrec->attrs_offset)), - .is_first = true, - .ntfs_ino = ni, - }; -} - -/** - * ntfs_attr_reinit_search_ctx - reinitialize an attribute search context - * @ctx: attribute search context to reinitialize - * - * Reinitialize the attribute search context @ctx, unmapping an associated - * extent mft record if present, and initialize the search context again. 
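Callers in this file always bracket ntfs_attr_lookup() the same way: map the base mft record, get a search context, look up, then release both. Condensed from the callers further down (not compilable on its own; error handling trimmed to the essentials):

  MFT_RECORD *m;
  ntfs_attr_search_ctx *ctx;
  ATTR_RECORD *a;
  int err;

  m = map_mft_record(base_ni);
  if (IS_ERR(m))
          return PTR_ERR(m);
  ctx = ntfs_attr_get_search_ctx(base_ni, m);
  if (!ctx) {
          unmap_mft_record(base_ni);
          return -ENOMEM;
  }
  err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
                  CASE_SENSITIVE, 0, NULL, 0, ctx);
  if (!err)
          a = ctx->attr;        /* found; it lives in ctx->mrec */
  ntfs_attr_put_search_ctx(ctx);
  unmap_mft_record(base_ni);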
- * - * This is used when a search for a new attribute is being started to reset - * the search context to the beginning. - */ -void ntfs_attr_reinit_search_ctx(ntfs_attr_search_ctx *ctx) -{ - if (likely(!ctx->base_ntfs_ino)) { - /* No attribute list. */ - ctx->is_first = true; - /* Sanity checks are performed elsewhere. */ - ctx->attr = (ATTR_RECORD*)((u8*)ctx->mrec + - le16_to_cpu(ctx->mrec->attrs_offset)); - /* - * This needs resetting due to ntfs_external_attr_find() which - * can leave it set despite having zeroed ctx->base_ntfs_ino. - */ - ctx->al_entry = NULL; - return; - } /* Attribute list. */ - if (ctx->ntfs_ino != ctx->base_ntfs_ino) - unmap_extent_mft_record(ctx->ntfs_ino); - ntfs_attr_init_search_ctx(ctx, ctx->base_ntfs_ino, ctx->base_mrec); - return; -} - -/** - * ntfs_attr_get_search_ctx - allocate/initialize a new attribute search context - * @ni: ntfs inode with which to initialize the search context - * @mrec: mft record with which to initialize the search context - * - * Allocate a new attribute search context, initialize it with @ni and @mrec, - * and return it. Return NULL if allocation failed. - */ -ntfs_attr_search_ctx *ntfs_attr_get_search_ctx(ntfs_inode *ni, MFT_RECORD *mrec) -{ - ntfs_attr_search_ctx *ctx; - - ctx = kmem_cache_alloc(ntfs_attr_ctx_cache, GFP_NOFS); - if (ctx) - ntfs_attr_init_search_ctx(ctx, ni, mrec); - return ctx; -} - -/** - * ntfs_attr_put_search_ctx - release an attribute search context - * @ctx: attribute search context to free - * - * Release the attribute search context @ctx, unmapping an associated extent - * mft record if present. - */ -void ntfs_attr_put_search_ctx(ntfs_attr_search_ctx *ctx) -{ - if (ctx->base_ntfs_ino && ctx->ntfs_ino != ctx->base_ntfs_ino) - unmap_extent_mft_record(ctx->ntfs_ino); - kmem_cache_free(ntfs_attr_ctx_cache, ctx); - return; -} - -#ifdef NTFS_RW - -/** - * ntfs_attr_find_in_attrdef - find an attribute in the $AttrDef system file - * @vol: ntfs volume to which the attribute belongs - * @type: attribute type which to find - * - * Search for the attribute definition record corresponding to the attribute - * @type in the $AttrDef system file. - * - * Return the attribute type definition record if found and NULL if not found. - */ -static ATTR_DEF *ntfs_attr_find_in_attrdef(const ntfs_volume *vol, - const ATTR_TYPE type) -{ - ATTR_DEF *ad; - - BUG_ON(!vol->attrdef); - BUG_ON(!type); - for (ad = vol->attrdef; (u8*)ad - (u8*)vol->attrdef < - vol->attrdef_size && ad->type; ++ad) { - /* We have not found it yet, carry on searching. */ - if (likely(le32_to_cpu(ad->type) < le32_to_cpu(type))) - continue; - /* We found the attribute; return it. */ - if (likely(ad->type == type)) - return ad; - /* We have gone too far already. No point in continuing. */ - break; - } - /* Attribute not found. */ - ntfs_debug("Attribute type 0x%x not found in $AttrDef.", - le32_to_cpu(type)); - return NULL; -} - -/** - * ntfs_attr_size_bounds_check - check a size of an attribute type for validity - * @vol: ntfs volume to which the attribute belongs - * @type: attribute type which to check - * @size: size which to check - * - * Check whether the @size in bytes is valid for an attribute of @type on the - * ntfs volume @vol. This information is obtained from $AttrDef system file. - * - * Return 0 if valid, -ERANGE if not valid, or -ENOENT if the attribute is not - * listed in $AttrDef. 
- */ -int ntfs_attr_size_bounds_check(const ntfs_volume *vol, const ATTR_TYPE type, - const s64 size) -{ - ATTR_DEF *ad; - - BUG_ON(size < 0); - /* - * $ATTRIBUTE_LIST has a maximum size of 256kiB, but this is not - * listed in $AttrDef. - */ - if (unlikely(type == AT_ATTRIBUTE_LIST && size > 256 * 1024)) - return -ERANGE; - /* Get the $AttrDef entry for the attribute @type. */ - ad = ntfs_attr_find_in_attrdef(vol, type); - if (unlikely(!ad)) - return -ENOENT; - /* Do the bounds check. */ - if (((sle64_to_cpu(ad->min_size) > 0) && - size < sle64_to_cpu(ad->min_size)) || - ((sle64_to_cpu(ad->max_size) > 0) && size > - sle64_to_cpu(ad->max_size))) - return -ERANGE; - return 0; -} - -/** - * ntfs_attr_can_be_non_resident - check if an attribute can be non-resident - * @vol: ntfs volume to which the attribute belongs - * @type: attribute type which to check - * - * Check whether the attribute of @type on the ntfs volume @vol is allowed to - * be non-resident. This information is obtained from $AttrDef system file. - * - * Return 0 if the attribute is allowed to be non-resident, -EPERM if not, and - * -ENOENT if the attribute is not listed in $AttrDef. - */ -int ntfs_attr_can_be_non_resident(const ntfs_volume *vol, const ATTR_TYPE type) -{ - ATTR_DEF *ad; - - /* Find the attribute definition record in $AttrDef. */ - ad = ntfs_attr_find_in_attrdef(vol, type); - if (unlikely(!ad)) - return -ENOENT; - /* Check the flags and return the result. */ - if (ad->flags & ATTR_DEF_RESIDENT) - return -EPERM; - return 0; -} - -/** - * ntfs_attr_can_be_resident - check if an attribute can be resident - * @vol: ntfs volume to which the attribute belongs - * @type: attribute type which to check - * - * Check whether the attribute of @type on the ntfs volume @vol is allowed to - * be resident. This information is derived from our ntfs knowledge and may - * not be completely accurate, especially when user defined attributes are - * present. Basically we allow everything to be resident except for index - * allocation and $EA attributes. - * - * Return 0 if the attribute is allowed to be non-resident and -EPERM if not. - * - * Warning: In the system file $MFT the attribute $Bitmap must be non-resident - * otherwise windows will not boot (blue screen of death)! We cannot - * check for this here as we do not know which inode's $Bitmap is - * being asked about so the caller needs to special case this. - */ -int ntfs_attr_can_be_resident(const ntfs_volume *vol, const ATTR_TYPE type) -{ - if (type == AT_INDEX_ALLOCATION) - return -EPERM; - return 0; -} - -/** - * ntfs_attr_record_resize - resize an attribute record - * @m: mft record containing attribute record - * @a: attribute record to resize - * @new_size: new size in bytes to which to resize the attribute record @a - * - * Resize the attribute record @a, i.e. the resident part of the attribute, in - * the mft record @m to @new_size bytes. - * - * Return 0 on success and -errno on error. The following error codes are - * defined: - * -ENOSPC - Not enough space in the mft record @m to perform the resize. - * - * Note: On error, no modifications have been performed whatsoever. - * - * Warning: If you make a record smaller without having copied all the data you - * are interested in the data may be overwritten. - */ -int ntfs_attr_record_resize(MFT_RECORD *m, ATTR_RECORD *a, u32 new_size) -{ - ntfs_debug("Entering for new_size %u.", new_size); - /* Align to 8 bytes if it is not already done. 
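The round-up just below is the usual power-of-two alignment idiom; a self-contained check of the arithmetic (the constant 8 is the alignment NTFS requires for attribute records):

  #include <assert.h>
  #include <stdint.h>

  /* Round v up to alignment a, where a is a power of two. */
  static inline uint32_t align_up(uint32_t v, uint32_t a)
  {
          return (v + a - 1) & ~(a - 1);
  }

  int main(void)
  {
          assert(align_up(1, 8) == 8);
          assert(align_up(8, 8) == 8);    /* already aligned: unchanged */
          assert(align_up(13, 8) == 16);
          return 0;
  }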
*/ - if (new_size & 7) - new_size = (new_size + 7) & ~7; - /* If the actual attribute length has changed, move things around. */ - if (new_size != le32_to_cpu(a->length)) { - u32 new_muse = le32_to_cpu(m->bytes_in_use) - - le32_to_cpu(a->length) + new_size; - /* Not enough space in this mft record. */ - if (new_muse > le32_to_cpu(m->bytes_allocated)) - return -ENOSPC; - /* Move attributes following @a to their new location. */ - memmove((u8*)a + new_size, (u8*)a + le32_to_cpu(a->length), - le32_to_cpu(m->bytes_in_use) - ((u8*)a - - (u8*)m) - le32_to_cpu(a->length)); - /* Adjust @m to reflect the change in used space. */ - m->bytes_in_use = cpu_to_le32(new_muse); - /* Adjust @a to reflect the new size. */ - if (new_size >= offsetof(ATTR_REC, length) + sizeof(a->length)) - a->length = cpu_to_le32(new_size); - } - return 0; -} - -/** - * ntfs_resident_attr_value_resize - resize the value of a resident attribute - * @m: mft record containing attribute record - * @a: attribute record whose value to resize - * @new_size: new size in bytes to which to resize the attribute value of @a - * - * Resize the value of the attribute @a in the mft record @m to @new_size bytes. - * If the value is made bigger, the newly allocated space is cleared. - * - * Return 0 on success and -errno on error. The following error codes are - * defined: - * -ENOSPC - Not enough space in the mft record @m to perform the resize. - * - * Note: On error, no modifications have been performed whatsoever. - * - * Warning: If you make a record smaller without having copied all the data you - * are interested in the data may be overwritten. - */ -int ntfs_resident_attr_value_resize(MFT_RECORD *m, ATTR_RECORD *a, - const u32 new_size) -{ - u32 old_size; - - /* Resize the resident part of the attribute record. */ - if (ntfs_attr_record_resize(m, a, - le16_to_cpu(a->data.resident.value_offset) + new_size)) - return -ENOSPC; - /* - * The resize succeeded! If we made the attribute value bigger, clear - * the area between the old size and @new_size. - */ - old_size = le32_to_cpu(a->data.resident.value_length); - if (new_size > old_size) - memset((u8*)a + le16_to_cpu(a->data.resident.value_offset) + - old_size, 0, new_size - old_size); - /* Finally update the length of the attribute value. */ - a->data.resident.value_length = cpu_to_le32(new_size); - return 0; -} - -/** - * ntfs_attr_make_non_resident - convert a resident to a non-resident attribute - * @ni: ntfs inode describing the attribute to convert - * @data_size: size of the resident data to copy to the non-resident attribute - * - * Convert the resident ntfs attribute described by the ntfs inode @ni to a - * non-resident one. - * - * @data_size must be equal to the attribute value size. This is needed since - * we need to know the size before we can map the mft record and our callers - * always know it. The reason we cannot simply read the size from the vfs - * inode i_size is that this is not necessarily uptodate. This happens when - * ntfs_attr_make_non_resident() is called in the ->truncate call path(s). - * - * Return 0 on success and -errno on error. The following error return codes - * are defined: - * -EPERM - The attribute is not allowed to be non-resident. - * -ENOMEM - Not enough memory. - * -ENOSPC - Not enough disk space. - * -EINVAL - Attribute not defined on the volume. - * -EIO - I/o error or other error. - * Note that -ENOSPC is also returned in the case that there is not enough - * space in the mft record to do the conversion. 
This can happen when the mft - * record is already very full. The caller is responsible for trying to make - * space in the mft record and trying again. FIXME: Do we need a separate - * error return code for this kind of -ENOSPC or is it always worth trying - * again in case the attribute may then fit in a resident state so no need to - * make it non-resident at all? Ho-hum... (AIA) - * - * NOTE to self: No changes in the attribute list are required to move from - * a resident to a non-resident attribute. - * - * Locking: - The caller must hold i_mutex on the inode. - */ -int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size) -{ - s64 new_size; - struct inode *vi = VFS_I(ni); - ntfs_volume *vol = ni->vol; - ntfs_inode *base_ni; - MFT_RECORD *m; - ATTR_RECORD *a; - ntfs_attr_search_ctx *ctx; - struct page *page; - runlist_element *rl; - u8 *kaddr; - unsigned long flags; - int mp_size, mp_ofs, name_ofs, arec_size, err, err2; - u32 attr_size; - u8 old_res_attr_flags; - - /* Check that the attribute is allowed to be non-resident. */ - err = ntfs_attr_can_be_non_resident(vol, ni->type); - if (unlikely(err)) { - if (err == -EPERM) - ntfs_debug("Attribute is not allowed to be " - "non-resident."); - else - ntfs_debug("Attribute not defined on the NTFS " - "volume!"); - return err; - } - /* - * FIXME: Compressed and encrypted attributes are not supported when - * writing and we should never have gotten here for them. - */ - BUG_ON(NInoCompressed(ni)); - BUG_ON(NInoEncrypted(ni)); - /* - * The size needs to be aligned to a cluster boundary for allocation - * purposes. - */ - new_size = (data_size + vol->cluster_size - 1) & - ~(vol->cluster_size - 1); - if (new_size > 0) { - /* - * Will need the page later and since the page lock nests - * outside all ntfs locks, we need to get the page now. - */ - page = find_or_create_page(vi->i_mapping, 0, - mapping_gfp_mask(vi->i_mapping)); - if (unlikely(!page)) - return -ENOMEM; - /* Start by allocating clusters to hold the attribute value. */ - rl = ntfs_cluster_alloc(vol, 0, new_size >> - vol->cluster_size_bits, -1, DATA_ZONE, true); - if (IS_ERR(rl)) { - err = PTR_ERR(rl); - ntfs_debug("Failed to allocate cluster%s, error code " - "%i.", (new_size >> - vol->cluster_size_bits) > 1 ? "s" : "", - err); - goto page_err_out; - } - } else { - rl = NULL; - page = NULL; - } - /* Determine the size of the mapping pairs array. */ - mp_size = ntfs_get_size_for_mapping_pairs(vol, rl, 0, -1); - if (unlikely(mp_size < 0)) { - err = mp_size; - ntfs_debug("Failed to get size for mapping pairs array, error " - "code %i.", err); - goto rl_err_out; - } - down_write(&ni->runlist.lock); - if (!NInoAttr(ni)) - base_ni = ni; - else - base_ni = ni->ext.base_ntfs_ino; - m = map_mft_record(base_ni); - if (IS_ERR(m)) { - err = PTR_ERR(m); - m = NULL; - ctx = NULL; - goto err_out; - } - ctx = ntfs_attr_get_search_ctx(base_ni, m); - if (unlikely(!ctx)) { - err = -ENOMEM; - goto err_out; - } - err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, - CASE_SENSITIVE, 0, NULL, 0, ctx); - if (unlikely(err)) { - if (err == -ENOENT) - err = -EIO; - goto err_out; - } - m = ctx->mrec; - a = ctx->attr; - BUG_ON(NInoNonResident(ni)); - BUG_ON(a->non_resident); - /* - * Calculate new offsets for the name and the mapping pairs array. 
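Earlier in this function the value size was rounded to a cluster boundary and converted into a cluster count for the allocator; with assumed 4 KiB clusters the arithmetic works out as follows:

  #include <stdint.h>
  #include <stdio.h>

  int main(void)
  {
          const unsigned cluster_size_bits = 12;  /* assumed 4 KiB clusters */
          const uint64_t cluster_size = 1ULL << cluster_size_bits;
          uint64_t data_size = 5000;

          /* Round up to whole clusters, then convert to a count. */
          uint64_t new_size = (data_size + cluster_size - 1) &
                          ~(cluster_size - 1);
          uint64_t clusters = new_size >> cluster_size_bits;

          /* 5000 bytes -> 8192 bytes -> 2 clusters */
          printf("%llu -> %llu bytes, %llu clusters\n",
                 (unsigned long long)data_size,
                 (unsigned long long)new_size,
                 (unsigned long long)clusters);
          return 0;
  }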
- */ - if (NInoSparse(ni) || NInoCompressed(ni)) - name_ofs = (offsetof(ATTR_REC, - data.non_resident.compressed_size) + - sizeof(a->data.non_resident.compressed_size) + - 7) & ~7; - else - name_ofs = (offsetof(ATTR_REC, - data.non_resident.compressed_size) + 7) & ~7; - mp_ofs = (name_ofs + a->name_length * sizeof(ntfschar) + 7) & ~7; - /* - * Determine the size of the resident part of the now non-resident - * attribute record. - */ - arec_size = (mp_ofs + mp_size + 7) & ~7; - /* - * If the page is not uptodate bring it uptodate by copying from the - * attribute value. - */ - attr_size = le32_to_cpu(a->data.resident.value_length); - BUG_ON(attr_size != data_size); - if (page && !PageUptodate(page)) { - kaddr = kmap_atomic(page); - memcpy(kaddr, (u8*)a + - le16_to_cpu(a->data.resident.value_offset), - attr_size); - memset(kaddr + attr_size, 0, PAGE_SIZE - attr_size); - kunmap_atomic(kaddr); - flush_dcache_page(page); - SetPageUptodate(page); - } - /* Backup the attribute flag. */ - old_res_attr_flags = a->data.resident.flags; - /* Resize the resident part of the attribute record. */ - err = ntfs_attr_record_resize(m, a, arec_size); - if (unlikely(err)) - goto err_out; - /* - * Convert the resident part of the attribute record to describe a - * non-resident attribute. - */ - a->non_resident = 1; - /* Move the attribute name if it exists and update the offset. */ - if (a->name_length) - memmove((u8*)a + name_ofs, (u8*)a + le16_to_cpu(a->name_offset), - a->name_length * sizeof(ntfschar)); - a->name_offset = cpu_to_le16(name_ofs); - /* Setup the fields specific to non-resident attributes. */ - a->data.non_resident.lowest_vcn = 0; - a->data.non_resident.highest_vcn = cpu_to_sle64((new_size - 1) >> - vol->cluster_size_bits); - a->data.non_resident.mapping_pairs_offset = cpu_to_le16(mp_ofs); - memset(&a->data.non_resident.reserved, 0, - sizeof(a->data.non_resident.reserved)); - a->data.non_resident.allocated_size = cpu_to_sle64(new_size); - a->data.non_resident.data_size = - a->data.non_resident.initialized_size = - cpu_to_sle64(attr_size); - if (NInoSparse(ni) || NInoCompressed(ni)) { - a->data.non_resident.compression_unit = 0; - if (NInoCompressed(ni) || vol->major_ver < 3) - a->data.non_resident.compression_unit = 4; - a->data.non_resident.compressed_size = - a->data.non_resident.allocated_size; - } else - a->data.non_resident.compression_unit = 0; - /* Generate the mapping pairs array into the attribute record. */ - err = ntfs_mapping_pairs_build(vol, (u8*)a + mp_ofs, - arec_size - mp_ofs, rl, 0, -1, NULL); - if (unlikely(err)) { - ntfs_debug("Failed to build mapping pairs, error code %i.", - err); - goto undo_err_out; - } - /* Setup the in-memory attribute structure to be non-resident. */ - ni->runlist.rl = rl; - write_lock_irqsave(&ni->size_lock, flags); - ni->allocated_size = new_size; - if (NInoSparse(ni) || NInoCompressed(ni)) { - ni->itype.compressed.size = ni->allocated_size; - if (a->data.non_resident.compression_unit) { - ni->itype.compressed.block_size = 1U << (a->data. 
- non_resident.compression_unit + - vol->cluster_size_bits); - ni->itype.compressed.block_size_bits = - ffs(ni->itype.compressed.block_size) - - 1; - ni->itype.compressed.block_clusters = 1U << - a->data.non_resident.compression_unit; - } else { - ni->itype.compressed.block_size = 0; - ni->itype.compressed.block_size_bits = 0; - ni->itype.compressed.block_clusters = 0; - } - vi->i_blocks = ni->itype.compressed.size >> 9; - } else - vi->i_blocks = ni->allocated_size >> 9; - write_unlock_irqrestore(&ni->size_lock, flags); - /* - * This needs to be last since the address space operations ->read_folio - * and ->writepage can run concurrently with us as they are not - * serialized on i_mutex. Note, we are not allowed to fail once we flip - * this switch, which is another reason to do this last. - */ - NInoSetNonResident(ni); - /* Mark the mft record dirty, so it gets written back. */ - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(base_ni); - up_write(&ni->runlist.lock); - if (page) { - set_page_dirty(page); - unlock_page(page); - put_page(page); - } - ntfs_debug("Done."); - return 0; -undo_err_out: - /* Convert the attribute back into a resident attribute. */ - a->non_resident = 0; - /* Move the attribute name if it exists and update the offset. */ - name_ofs = (offsetof(ATTR_RECORD, data.resident.reserved) + - sizeof(a->data.resident.reserved) + 7) & ~7; - if (a->name_length) - memmove((u8*)a + name_ofs, (u8*)a + le16_to_cpu(a->name_offset), - a->name_length * sizeof(ntfschar)); - mp_ofs = (name_ofs + a->name_length * sizeof(ntfschar) + 7) & ~7; - a->name_offset = cpu_to_le16(name_ofs); - arec_size = (mp_ofs + attr_size + 7) & ~7; - /* Resize the resident part of the attribute record. */ - err2 = ntfs_attr_record_resize(m, a, arec_size); - if (unlikely(err2)) { - /* - * This cannot happen (well if memory corruption is at work it - * could happen in theory), but deal with it as well as we can. - * If the old size is too small, truncate the attribute, - * otherwise simply give it a larger allocated size. - * FIXME: Should check whether chkdsk complains when the - * allocated size is much bigger than the resident value size. - */ - arec_size = le32_to_cpu(a->length); - if ((mp_ofs + attr_size) > arec_size) { - err2 = attr_size; - attr_size = arec_size - mp_ofs; - ntfs_error(vol->sb, "Failed to undo partial resident " - "to non-resident attribute " - "conversion. Truncating inode 0x%lx, " - "attribute type 0x%x from %i bytes to " - "%i bytes to maintain metadata " - "consistency. THIS MEANS YOU ARE " - "LOSING %i BYTES DATA FROM THIS %s.", - vi->i_ino, - (unsigned)le32_to_cpu(ni->type), - err2, attr_size, err2 - attr_size, - ((ni->type == AT_DATA) && - !ni->name_len) ? "FILE": "ATTRIBUTE"); - write_lock_irqsave(&ni->size_lock, flags); - ni->initialized_size = attr_size; - i_size_write(vi, attr_size); - write_unlock_irqrestore(&ni->size_lock, flags); - } - } - /* Setup the fields specific to resident attributes. */ - a->data.resident.value_length = cpu_to_le32(attr_size); - a->data.resident.value_offset = cpu_to_le16(mp_ofs); - a->data.resident.flags = old_res_attr_flags; - memset(&a->data.resident.reserved, 0, - sizeof(a->data.resident.reserved)); - /* Copy the data from the page back to the attribute value. */ - if (page) { - kaddr = kmap_atomic(page); - memcpy((u8*)a + mp_ofs, kaddr, attr_size); - kunmap_atomic(kaddr); - } - /* Setup the allocated size in the ntfs inode in case it changed. 
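Everything in the compressed/sparse bookkeeping above derives from the compression_unit exponent; a worked check of that arithmetic, assuming 4 KiB clusters:

  #include <stdint.h>
  #include <stdio.h>

  int main(void)
  {
          unsigned compression_unit = 4;          /* value the code sets */
          unsigned cluster_size_bits = 12;        /* assumed 4 KiB clusters */

          uint32_t block_size = 1U << (compression_unit + cluster_size_bits);
          /* ffs(block_size) - 1 in the code recovers the same exponent: */
          unsigned block_size_bits = compression_unit + cluster_size_bits;
          uint32_t block_clusters = 1U << compression_unit;

          /* 16 clusters of 4096 bytes = one 65536-byte compression block */
          printf("%u bytes (2^%u), %u clusters\n",
                 block_size, block_size_bits, block_clusters);
          return 0;
  }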
*/ - write_lock_irqsave(&ni->size_lock, flags); - ni->allocated_size = arec_size - mp_ofs; - write_unlock_irqrestore(&ni->size_lock, flags); - /* Mark the mft record dirty, so it gets written back. */ - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); -err_out: - if (ctx) - ntfs_attr_put_search_ctx(ctx); - if (m) - unmap_mft_record(base_ni); - ni->runlist.rl = NULL; - up_write(&ni->runlist.lock); -rl_err_out: - if (rl) { - if (ntfs_cluster_free_from_rl(vol, rl) < 0) { - ntfs_error(vol->sb, "Failed to release allocated " - "cluster(s) in error code path. Run " - "chkdsk to recover the lost " - "cluster(s)."); - NVolSetErrors(vol); - } - ntfs_free(rl); -page_err_out: - unlock_page(page); - put_page(page); - } - if (err == -EINVAL) - err = -EIO; - return err; -} - -/** - * ntfs_attr_extend_allocation - extend the allocated space of an attribute - * @ni: ntfs inode of the attribute whose allocation to extend - * @new_alloc_size: new size in bytes to which to extend the allocation to - * @new_data_size: new size in bytes to which to extend the data to - * @data_start: beginning of region which is required to be non-sparse - * - * Extend the allocated space of an attribute described by the ntfs inode @ni - * to @new_alloc_size bytes. If @data_start is -1, the whole extension may be - * implemented as a hole in the file (as long as both the volume and the ntfs - * inode @ni have sparse support enabled). If @data_start is >= 0, then the - * region between the old allocated size and @data_start - 1 may be made sparse - * but the regions between @data_start and @new_alloc_size must be backed by - * actual clusters. - * - * If @new_data_size is -1, it is ignored. If it is >= 0, then the data size - * of the attribute is extended to @new_data_size. Note that the i_size of the - * vfs inode is not updated. Only the data size in the base attribute record - * is updated. The caller has to update i_size separately if this is required. - * WARNING: It is a BUG() for @new_data_size to be smaller than the old data - * size as well as for @new_data_size to be greater than @new_alloc_size. - * - * For resident attributes this involves resizing the attribute record and if - * necessary moving it and/or other attributes into extent mft records and/or - * converting the attribute to a non-resident attribute which in turn involves - * extending the allocation of a non-resident attribute as described below. - * - * For non-resident attributes this involves allocating clusters in the data - * zone on the volume (except for regions that are being made sparse) and - * extending the run list to describe the allocated clusters as well as - * updating the mapping pairs array of the attribute. This in turn involves - * resizing the attribute record and if necessary moving it and/or other - * attributes into extent mft records and/or splitting the attribute record - * into multiple extent attribute records. - * - * Also, the attribute list attribute is updated if present and in some of the - * above cases (the ones where extent mft records/attributes come into play), - * an attribute list attribute is created if not already present. - * - * Return the new allocated size on success and -errno on error. In the case - * that an error is encountered but a partial extension at least up to - * @data_start (if present) is possible, the allocation is partially extended - * and this is returned. This means the caller must check the returned size to - * determine if the extension was partial. 
If @data_start is -1 then partial - * allocations are not performed. - * - * WARNING: Do not call ntfs_attr_extend_allocation() for $MFT/$DATA. - * - * Locking: This function takes the runlist lock of @ni for writing as well as - * locking the mft record of the base ntfs inode. These locks are maintained - * throughout execution of the function. These locks are required so that the - * attribute can be resized safely and so that it can for example be converted - * from resident to non-resident safely. - * - * TODO: At present attribute list attribute handling is not implemented. - * - * TODO: At present it is not safe to call this function for anything other - * than the $DATA attribute(s) of an uncompressed and unencrypted file. - */ -s64 ntfs_attr_extend_allocation(ntfs_inode *ni, s64 new_alloc_size, - const s64 new_data_size, const s64 data_start) -{ - VCN vcn; - s64 ll, allocated_size, start = data_start; - struct inode *vi = VFS_I(ni); - ntfs_volume *vol = ni->vol; - ntfs_inode *base_ni; - MFT_RECORD *m; - ATTR_RECORD *a; - ntfs_attr_search_ctx *ctx; - runlist_element *rl, *rl2; - unsigned long flags; - int err, mp_size; - u32 attr_len = 0; /* Silence stupid gcc warning. */ - bool mp_rebuilt; - -#ifdef DEBUG - read_lock_irqsave(&ni->size_lock, flags); - allocated_size = ni->allocated_size; - read_unlock_irqrestore(&ni->size_lock, flags); - ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, " - "old_allocated_size 0x%llx, " - "new_allocated_size 0x%llx, new_data_size 0x%llx, " - "data_start 0x%llx.", vi->i_ino, - (unsigned)le32_to_cpu(ni->type), - (unsigned long long)allocated_size, - (unsigned long long)new_alloc_size, - (unsigned long long)new_data_size, - (unsigned long long)start); -#endif -retry_extend: - /* - * For non-resident attributes, @start and @new_size need to be aligned - * to cluster boundaries for allocation purposes. - */ - if (NInoNonResident(ni)) { - if (start > 0) - start &= ~(s64)vol->cluster_size_mask; - new_alloc_size = (new_alloc_size + vol->cluster_size - 1) & - ~(s64)vol->cluster_size_mask; - } - BUG_ON(new_data_size >= 0 && new_data_size > new_alloc_size); - /* Check if new size is allowed in $AttrDef. */ - err = ntfs_attr_size_bounds_check(vol, ni->type, new_alloc_size); - if (unlikely(err)) { - /* Only emit errors when the write will fail completely. */ - read_lock_irqsave(&ni->size_lock, flags); - allocated_size = ni->allocated_size; - read_unlock_irqrestore(&ni->size_lock, flags); - if (start < 0 || start >= allocated_size) { - if (err == -ERANGE) { - ntfs_error(vol->sb, "Cannot extend allocation " - "of inode 0x%lx, attribute " - "type 0x%x, because the new " - "allocation would exceed the " - "maximum allowed size for " - "this attribute type.", - vi->i_ino, (unsigned) - le32_to_cpu(ni->type)); - } else { - ntfs_error(vol->sb, "Cannot extend allocation " - "of inode 0x%lx, attribute " - "type 0x%x, because this " - "attribute type is not " - "defined on the NTFS volume. " - "Possible corruption! You " - "should run chkdsk!", - vi->i_ino, (unsigned) - le32_to_cpu(ni->type)); - } - } - /* Translate error code to be POSIX conformant for write(2). */ - if (err == -ERANGE) - err = -EFBIG; - else - err = -EIO; - return err; - } - if (!NInoAttr(ni)) - base_ni = ni; - else - base_ni = ni->ext.base_ntfs_ino; - /* - * We will be modifying both the runlist (if non-resident) and the mft - * record so lock them both down. 
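The $AttrDef bounds check above is the one place where internal error codes are rewritten so that write(2) sees POSIX-conformant values; the mapping, condensed into a stand-alone sketch (kernel convention: errors are negative errno values, and this is only ever applied to a nonzero err):

  #include <errno.h>
  #include <stdio.h>

  static int to_write_errno(int err)
  {
          switch (err) {
          case -ERANGE:   /* new size exceeds the $AttrDef maximum */
                  return -EFBIG;
          default:        /* e.g. -ENOENT: type not listed in $AttrDef */
                  return -EIO;
          }
  }

  int main(void)
  {
          printf("%d %d\n", to_write_errno(-ERANGE), to_write_errno(-ENOENT));
          return 0;
  }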
- */ - down_write(&ni->runlist.lock); - m = map_mft_record(base_ni); - if (IS_ERR(m)) { - err = PTR_ERR(m); - m = NULL; - ctx = NULL; - goto err_out; - } - ctx = ntfs_attr_get_search_ctx(base_ni, m); - if (unlikely(!ctx)) { - err = -ENOMEM; - goto err_out; - } - read_lock_irqsave(&ni->size_lock, flags); - allocated_size = ni->allocated_size; - read_unlock_irqrestore(&ni->size_lock, flags); - /* - * If non-resident, seek to the last extent. If resident, there is - * only one extent, so seek to that. - */ - vcn = NInoNonResident(ni) ? allocated_size >> vol->cluster_size_bits : - 0; - /* - * Abort if someone did the work whilst we waited for the locks. If we - * just converted the attribute from resident to non-resident it is - * likely that exactly this has happened already. We cannot quite - * abort if we need to update the data size. - */ - if (unlikely(new_alloc_size <= allocated_size)) { - ntfs_debug("Allocated size already exceeds requested size."); - new_alloc_size = allocated_size; - if (new_data_size < 0) - goto done; - /* - * We want the first attribute extent so that we can update the - * data size. - */ - vcn = 0; - } - err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, - CASE_SENSITIVE, vcn, NULL, 0, ctx); - if (unlikely(err)) { - if (err == -ENOENT) - err = -EIO; - goto err_out; - } - m = ctx->mrec; - a = ctx->attr; - /* Use goto to reduce indentation. */ - if (a->non_resident) - goto do_non_resident_extend; - BUG_ON(NInoNonResident(ni)); - /* The total length of the attribute value. */ - attr_len = le32_to_cpu(a->data.resident.value_length); - /* - * Extend the attribute record to be able to store the new attribute - * size. ntfs_attr_record_resize() will not do anything if the size is - * not changing. - */ - if (new_alloc_size < vol->mft_record_size && - !ntfs_attr_record_resize(m, a, - le16_to_cpu(a->data.resident.value_offset) + - new_alloc_size)) { - /* The resize succeeded! */ - write_lock_irqsave(&ni->size_lock, flags); - ni->allocated_size = le32_to_cpu(a->length) - - le16_to_cpu(a->data.resident.value_offset); - write_unlock_irqrestore(&ni->size_lock, flags); - if (new_data_size >= 0) { - BUG_ON(new_data_size < attr_len); - a->data.resident.value_length = - cpu_to_le32((u32)new_data_size); - } - goto flush_done; - } - /* - * We have to drop all the locks so we can call - * ntfs_attr_make_non_resident(). This could be optimised by try- - * locking the first page cache page and only if that fails dropping - * the locks, locking the page, and redoing all the locking and - * lookups. While this would be a huge optimisation, it is not worth - * it as this is definitely a slow code path. - */ - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(base_ni); - up_write(&ni->runlist.lock); - /* - * Not enough space in the mft record, try to make the attribute - * non-resident and if successful restart the extension process. - */ - err = ntfs_attr_make_non_resident(ni, attr_len); - if (likely(!err)) - goto retry_extend; - /* - * Could not make non-resident. If this is due to this not being - * permitted for this attribute type or there not being enough space, - * try to make other attributes non-resident. Otherwise fail. - */ - if (unlikely(err != -EPERM && err != -ENOSPC)) { - /* Only emit errors when the write will fail completely. 
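The flow above is an optimistic fast path with a restart: try the cheap in-place resize, and only if it cannot fit drop every lock, convert the attribute, and re-enter from the top. A condensed stand-alone sketch of that shape (all helpers and thresholds are stand-ins, not driver functions):

  #include <stdbool.h>

  static bool fits_in_mft_record(long need) { return need < 700; }
  static int resize_in_place(long need) { (void)need; return 0; }
  static int make_non_resident(void) { return 0; }
  static int extend_non_resident(long need) { (void)need; return 0; }

  static int extend(long new_size)
  {
          bool converted = false;

          for (;;) {
                  /* locks are (re)taken here in the real code */
                  if (!converted && fits_in_mft_record(new_size))
                          return resize_in_place(new_size);  /* fast path */
                  if (converted)
                          return extend_non_resident(new_size);
                  /* locks are dropped before the conversion */
                  int err = make_non_resident();
                  if (err)
                          return err;
                  converted = true;       /* restart on the other path */
          }
  }

  int main(void)
  {
          return extend(200) | extend(2000);  /* fast path, then conversion */
  }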
*/ - read_lock_irqsave(&ni->size_lock, flags); - allocated_size = ni->allocated_size; - read_unlock_irqrestore(&ni->size_lock, flags); - if (start < 0 || start >= allocated_size) - ntfs_error(vol->sb, "Cannot extend allocation of " - "inode 0x%lx, attribute type 0x%x, " - "because the conversion from resident " - "to non-resident attribute failed " - "with error code %i.", vi->i_ino, - (unsigned)le32_to_cpu(ni->type), err); - if (err != -ENOMEM) - err = -EIO; - goto conv_err_out; - } - /* TODO: Not implemented from here, abort. */ - read_lock_irqsave(&ni->size_lock, flags); - allocated_size = ni->allocated_size; - read_unlock_irqrestore(&ni->size_lock, flags); - if (start < 0 || start >= allocated_size) { - if (err == -ENOSPC) - ntfs_error(vol->sb, "Not enough space in the mft " - "record/on disk for the non-resident " - "attribute value. This case is not " - "implemented yet."); - else /* if (err == -EPERM) */ - ntfs_error(vol->sb, "This attribute type may not be " - "non-resident. This case is not " - "implemented yet."); - } - err = -EOPNOTSUPP; - goto conv_err_out; -#if 0 - // TODO: Attempt to make other attributes non-resident. - if (!err) - goto do_resident_extend; - /* - * Both the attribute list attribute and the standard information - * attribute must remain in the base inode. Thus, if this is one of - * these attributes, we have to try to move other attributes out into - * extent mft records instead. - */ - if (ni->type == AT_ATTRIBUTE_LIST || - ni->type == AT_STANDARD_INFORMATION) { - // TODO: Attempt to move other attributes into extent mft - // records. - err = -EOPNOTSUPP; - if (!err) - goto do_resident_extend; - goto err_out; - } - // TODO: Attempt to move this attribute to an extent mft record, but - // only if it is not already the only attribute in an mft record in - // which case there would be nothing to gain. - err = -EOPNOTSUPP; - if (!err) - goto do_resident_extend; - /* There is nothing we can do to make enough space. )-: */ - goto err_out; -#endif -do_non_resident_extend: - BUG_ON(!NInoNonResident(ni)); - if (new_alloc_size == allocated_size) { - BUG_ON(vcn); - goto alloc_done; - } - /* - * If the data starts after the end of the old allocation, this is a - * $DATA attribute and sparse attributes are enabled on the volume and - * for this inode, then create a sparse region between the old - * allocated size and the start of the data. Otherwise simply proceed - * with filling the whole space between the old allocated size and the - * new allocated size with clusters. - */ - if ((start >= 0 && start <= allocated_size) || ni->type != AT_DATA || - !NVolSparseEnabled(vol) || NInoSparseDisabled(ni)) - goto skip_sparse; - // TODO: This is not implemented yet. We just fill in with real - // clusters for now... - ntfs_debug("Inserting holes is not-implemented yet. Falling back to " - "allocating real clusters instead."); -skip_sparse: - rl = ni->runlist.rl; - if (likely(rl)) { - /* Seek to the end of the runlist. */ - while (rl->length) - rl++; - } - /* If this attribute extent is not mapped, map it now. 
*/ - if (unlikely(!rl || rl->lcn == LCN_RL_NOT_MAPPED || - (rl->lcn == LCN_ENOENT && rl > ni->runlist.rl && - (rl-1)->lcn == LCN_RL_NOT_MAPPED))) { - if (!rl && !allocated_size) - goto first_alloc; - rl = ntfs_mapping_pairs_decompress(vol, a, ni->runlist.rl); - if (IS_ERR(rl)) { - err = PTR_ERR(rl); - if (start < 0 || start >= allocated_size) - ntfs_error(vol->sb, "Cannot extend allocation " - "of inode 0x%lx, attribute " - "type 0x%x, because the " - "mapping of a runlist " - "fragment failed with error " - "code %i.", vi->i_ino, - (unsigned)le32_to_cpu(ni->type), - err); - if (err != -ENOMEM) - err = -EIO; - goto err_out; - } - ni->runlist.rl = rl; - /* Seek to the end of the runlist. */ - while (rl->length) - rl++; - } - /* - * We now know the runlist of the last extent is mapped and @rl is at - * the end of the runlist. We want to begin allocating clusters - * starting at the last allocated cluster to reduce fragmentation. If - * there are no valid LCNs in the attribute we let the cluster - * allocator choose the starting cluster. - */ - /* If the last LCN is a hole or simillar seek back to last real LCN. */ - while (rl->lcn < 0 && rl > ni->runlist.rl) - rl--; -first_alloc: - // FIXME: Need to implement partial allocations so at least part of the - // write can be performed when start >= 0. (Needed for POSIX write(2) - // conformance.) - rl2 = ntfs_cluster_alloc(vol, allocated_size >> vol->cluster_size_bits, - (new_alloc_size - allocated_size) >> - vol->cluster_size_bits, (rl && (rl->lcn >= 0)) ? - rl->lcn + rl->length : -1, DATA_ZONE, true); - if (IS_ERR(rl2)) { - err = PTR_ERR(rl2); - if (start < 0 || start >= allocated_size) - ntfs_error(vol->sb, "Cannot extend allocation of " - "inode 0x%lx, attribute type 0x%x, " - "because the allocation of clusters " - "failed with error code %i.", vi->i_ino, - (unsigned)le32_to_cpu(ni->type), err); - if (err != -ENOMEM && err != -ENOSPC) - err = -EIO; - goto err_out; - } - rl = ntfs_runlists_merge(ni->runlist.rl, rl2); - if (IS_ERR(rl)) { - err = PTR_ERR(rl); - if (start < 0 || start >= allocated_size) - ntfs_error(vol->sb, "Cannot extend allocation of " - "inode 0x%lx, attribute type 0x%x, " - "because the runlist merge failed " - "with error code %i.", vi->i_ino, - (unsigned)le32_to_cpu(ni->type), err); - if (err != -ENOMEM) - err = -EIO; - if (ntfs_cluster_free_from_rl(vol, rl2)) { - ntfs_error(vol->sb, "Failed to release allocated " - "cluster(s) in error code path. Run " - "chkdsk to recover the lost " - "cluster(s)."); - NVolSetErrors(vol); - } - ntfs_free(rl2); - goto err_out; - } - ni->runlist.rl = rl; - ntfs_debug("Allocated 0x%llx clusters.", (long long)(new_alloc_size - - allocated_size) >> vol->cluster_size_bits); - /* Find the runlist element with which the attribute extent starts. */ - ll = sle64_to_cpu(a->data.non_resident.lowest_vcn); - rl2 = ntfs_rl_find_vcn_nolock(rl, ll); - BUG_ON(!rl2); - BUG_ON(!rl2->length); - BUG_ON(rl2->lcn < LCN_HOLE); - mp_rebuilt = false; - /* Get the size for the new mapping pairs array for this extent. */ - mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll, -1); - if (unlikely(mp_size <= 0)) { - err = mp_size; - if (start < 0 || start >= allocated_size) - ntfs_error(vol->sb, "Cannot extend allocation of " - "inode 0x%lx, attribute type 0x%x, " - "because determining the size for the " - "mapping pairs failed with error code " - "%i.", vi->i_ino, - (unsigned)le32_to_cpu(ni->type), err); - err = -EIO; - goto undo_alloc; - } - /* Extend the attribute record to fit the bigger mapping pairs array. 
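The allocator hint above comes from walking to the end of the runlist and stepping back over holes, so allocation continues right after the last real cluster. The same walk in a self-contained form (element layout is illustrative; negative lcn values mark holes and the terminator, as in the driver):

  #include <stdint.h>
  #include <stdio.h>

  struct rle { int64_t vcn, lcn, length; };       /* illustrative element */

  /* Return the LCN to allocate after, or -1 to let the allocator pick. */
  static int64_t alloc_hint(const struct rle *rl)
  {
          const struct rle *p = rl;

          while (p->length)               /* seek to the terminator */
                  p++;
          while (p->lcn < 0 && p > rl)    /* step back over holes */
                  p--;
          return p->lcn >= 0 ? p->lcn + p->length : -1;
  }

  int main(void)
  {
          struct rle rl[] = {
                  { 0, 100, 4 },  /* 4 clusters at LCN 100 */
                  { 4, -1, 8 },   /* an 8-cluster hole */
                  { 12, -1, 0 },  /* terminator */
          };
          printf("hint: %lld\n", (long long)alloc_hint(rl));      /* 104 */
          return 0;
  }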
*/ - attr_len = le32_to_cpu(a->length); - err = ntfs_attr_record_resize(m, a, mp_size + - le16_to_cpu(a->data.non_resident.mapping_pairs_offset)); - if (unlikely(err)) { - BUG_ON(err != -ENOSPC); - // TODO: Deal with this by moving this extent to a new mft - // record or by starting a new extent in a new mft record, - // possibly by extending this extent partially and filling it - // and creating a new extent for the remainder, or by making - // other attributes non-resident and/or by moving other - // attributes out of this mft record. - if (start < 0 || start >= allocated_size) - ntfs_error(vol->sb, "Not enough space in the mft " - "record for the extended attribute " - "record. This case is not " - "implemented yet."); - err = -EOPNOTSUPP; - goto undo_alloc; - } - mp_rebuilt = true; - /* Generate the mapping pairs array directly into the attr record. */ - err = ntfs_mapping_pairs_build(vol, (u8*)a + - le16_to_cpu(a->data.non_resident.mapping_pairs_offset), - mp_size, rl2, ll, -1, NULL); - if (unlikely(err)) { - if (start < 0 || start >= allocated_size) - ntfs_error(vol->sb, "Cannot extend allocation of " - "inode 0x%lx, attribute type 0x%x, " - "because building the mapping pairs " - "failed with error code %i.", vi->i_ino, - (unsigned)le32_to_cpu(ni->type), err); - err = -EIO; - goto undo_alloc; - } - /* Update the highest_vcn. */ - a->data.non_resident.highest_vcn = cpu_to_sle64((new_alloc_size >> - vol->cluster_size_bits) - 1); - /* - * We now have extended the allocated size of the attribute. Reflect - * this in the ntfs_inode structure and the attribute record. - */ - if (a->data.non_resident.lowest_vcn) { - /* - * We are not in the first attribute extent, switch to it, but - * first ensure the changes will make it to disk later. - */ - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - ntfs_attr_reinit_search_ctx(ctx); - err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, - CASE_SENSITIVE, 0, NULL, 0, ctx); - if (unlikely(err)) - goto restore_undo_alloc; - /* @m is not used any more so no need to set it. */ - a = ctx->attr; - } - write_lock_irqsave(&ni->size_lock, flags); - ni->allocated_size = new_alloc_size; - a->data.non_resident.allocated_size = cpu_to_sle64(new_alloc_size); - /* - * FIXME: This would fail if @ni is a directory, $MFT, or an index, - * since those can have sparse/compressed set. For example can be - * set compressed even though it is not compressed itself and in that - * case the bit means that files are to be created compressed in the - * directory... At present this is ok as this code is only called for - * regular files, and only for their $DATA attribute(s). - * FIXME: The calculation is wrong if we created a hole above. For now - * it does not matter as we never create holes. - */ - if (NInoSparse(ni) || NInoCompressed(ni)) { - ni->itype.compressed.size += new_alloc_size - allocated_size; - a->data.non_resident.compressed_size = - cpu_to_sle64(ni->itype.compressed.size); - vi->i_blocks = ni->itype.compressed.size >> 9; - } else - vi->i_blocks = new_alloc_size >> 9; - write_unlock_irqrestore(&ni->size_lock, flags); -alloc_done: - if (new_data_size >= 0) { - BUG_ON(new_data_size < - sle64_to_cpu(a->data.non_resident.data_size)); - a->data.non_resident.data_size = cpu_to_sle64(new_data_size); - } -flush_done: - /* Ensure the changes make it to disk. 
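As a concrete check of the highest_vcn update above: with assumed 4 KiB clusters (cluster_size_bits = 12), a new_alloc_size of 8192 bytes spans VCNs 0 and 1, and indeed (8192 >> 12) - 1 = 1.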
*/ - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); -done: - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(base_ni); - up_write(&ni->runlist.lock); - ntfs_debug("Done, new_allocated_size 0x%llx.", - (unsigned long long)new_alloc_size); - return new_alloc_size; -restore_undo_alloc: - if (start < 0 || start >= allocated_size) - ntfs_error(vol->sb, "Cannot complete extension of allocation " - "of inode 0x%lx, attribute type 0x%x, because " - "lookup of first attribute extent failed with " - "error code %i.", vi->i_ino, - (unsigned)le32_to_cpu(ni->type), err); - if (err == -ENOENT) - err = -EIO; - ntfs_attr_reinit_search_ctx(ctx); - if (ntfs_attr_lookup(ni->type, ni->name, ni->name_len, CASE_SENSITIVE, - allocated_size >> vol->cluster_size_bits, NULL, 0, - ctx)) { - ntfs_error(vol->sb, "Failed to find last attribute extent of " - "attribute in error code path. Run chkdsk to " - "recover."); - write_lock_irqsave(&ni->size_lock, flags); - ni->allocated_size = new_alloc_size; - /* - * FIXME: This would fail if @ni is a directory... See above. - * FIXME: The calculation is wrong if we created a hole above. - * For now it does not matter as we never create holes. - */ - if (NInoSparse(ni) || NInoCompressed(ni)) { - ni->itype.compressed.size += new_alloc_size - - allocated_size; - vi->i_blocks = ni->itype.compressed.size >> 9; - } else - vi->i_blocks = new_alloc_size >> 9; - write_unlock_irqrestore(&ni->size_lock, flags); - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(base_ni); - up_write(&ni->runlist.lock); - /* - * The only thing that is now wrong is the allocated size of the - * base attribute extent which chkdsk should be able to fix. - */ - NVolSetErrors(vol); - return err; - } - ctx->attr->data.non_resident.highest_vcn = cpu_to_sle64( - (allocated_size >> vol->cluster_size_bits) - 1); -undo_alloc: - ll = allocated_size >> vol->cluster_size_bits; - if (ntfs_cluster_free(ni, ll, -1, ctx) < 0) { - ntfs_error(vol->sb, "Failed to release allocated cluster(s) " - "in error code path. Run chkdsk to recover " - "the lost cluster(s)."); - NVolSetErrors(vol); - } - m = ctx->mrec; - a = ctx->attr; - /* - * If the runlist truncation fails and/or the search context is no - * longer valid, we cannot resize the attribute record or build the - * mapping pairs array thus we mark the inode bad so that no access to - * the freed clusters can happen. - */ - if (ntfs_rl_truncate_nolock(vol, &ni->runlist, ll) || IS_ERR(m)) { - ntfs_error(vol->sb, "Failed to %s in error code path. Run " - "chkdsk to recover.", IS_ERR(m) ? - "restore attribute search context" : - "truncate attribute runlist"); - NVolSetErrors(vol); - } else if (mp_rebuilt) { - if (ntfs_attr_record_resize(m, a, attr_len)) { - ntfs_error(vol->sb, "Failed to restore attribute " - "record in error code path. Run " - "chkdsk to recover."); - NVolSetErrors(vol); - } else /* if (success) */ { - if (ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu( - a->data.non_resident. - mapping_pairs_offset), attr_len - - le16_to_cpu(a->data.non_resident. - mapping_pairs_offset), rl2, ll, -1, - NULL)) { - ntfs_error(vol->sb, "Failed to restore " - "mapping pairs array in error " - "code path. Run chkdsk to " - "recover."); - NVolSetErrors(vol); - } - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - } - } -err_out: - if (ctx) - ntfs_attr_put_search_ctx(ctx); - if (m) - unmap_mft_record(base_ni); - up_write(&ni->runlist.lock); -conv_err_out: - ntfs_debug("Failed. 
Returning error code %i.", err); - return err; -} - -/** - * ntfs_attr_set - fill (a part of) an attribute with a byte - * @ni: ntfs inode describing the attribute to fill - * @ofs: offset inside the attribute at which to start to fill - * @cnt: number of bytes to fill - * @val: the unsigned 8-bit value with which to fill the attribute - * - * Fill @cnt bytes of the attribute described by the ntfs inode @ni starting at - * byte offset @ofs inside the attribute with the constant byte @val. - * - * This function is effectively like memset() applied to an ntfs attribute. - * Note this function actually only operates on the page cache pages belonging - * to the ntfs attribute and it marks them dirty after doing the memset(). - * Thus it relies on the vm dirty page write code paths to cause the modified - * pages to be written to the mft record/disk. - * - * Return 0 on success and -errno on error. An error code of -ESPIPE means - * that @ofs + @cnt were outside the end of the attribute and no write was - * performed. - */ -int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val) -{ - ntfs_volume *vol = ni->vol; - struct address_space *mapping; - struct page *page; - u8 *kaddr; - pgoff_t idx, end; - unsigned start_ofs, end_ofs, size; - - ntfs_debug("Entering for ofs 0x%llx, cnt 0x%llx, val 0x%hx.", - (long long)ofs, (long long)cnt, val); - BUG_ON(ofs < 0); - BUG_ON(cnt < 0); - if (!cnt) - goto done; - /* - * FIXME: Compressed and encrypted attributes are not supported when - * writing and we should never have gotten here for them. - */ - BUG_ON(NInoCompressed(ni)); - BUG_ON(NInoEncrypted(ni)); - mapping = VFS_I(ni)->i_mapping; - /* Work out the starting index and page offset. */ - idx = ofs >> PAGE_SHIFT; - start_ofs = ofs & ~PAGE_MASK; - /* Work out the ending index and page offset. */ - end = ofs + cnt; - end_ofs = end & ~PAGE_MASK; - /* If the end is outside the inode size return -ESPIPE. */ - if (unlikely(end > i_size_read(VFS_I(ni)))) { - ntfs_error(vol->sb, "Request exceeds end of attribute."); - return -ESPIPE; - } - end >>= PAGE_SHIFT; - /* If there is a first partial page, need to do it the slow way. */ - if (start_ofs) { - page = read_mapping_page(mapping, idx, NULL); - if (IS_ERR(page)) { - ntfs_error(vol->sb, "Failed to read first partial " - "page (error, index 0x%lx).", idx); - return PTR_ERR(page); - } - /* - * If the last page is the same as the first page, need to - * limit the write to the end offset. - */ - size = PAGE_SIZE; - if (idx == end) - size = end_ofs; - kaddr = kmap_atomic(page); - memset(kaddr + start_ofs, val, size - start_ofs); - flush_dcache_page(page); - kunmap_atomic(kaddr); - set_page_dirty(page); - put_page(page); - balance_dirty_pages_ratelimited(mapping); - cond_resched(); - if (idx == end) - goto done; - idx++; - } - /* Do the whole pages the fast way. */ - for (; idx < end; idx++) { - /* Find or create the current page. (The page is locked.) */ - page = grab_cache_page(mapping, idx); - if (unlikely(!page)) { - ntfs_error(vol->sb, "Insufficient memory to grab " - "page (index 0x%lx).", idx); - return -ENOMEM; - } - kaddr = kmap_atomic(page); - memset(kaddr, val, PAGE_SIZE); - flush_dcache_page(page); - kunmap_atomic(kaddr); - /* - * If the page has buffers, mark them uptodate since buffer - * state and not page state is definitive in 2.6 kernels. 
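The index arithmetic driving this loop splits the byte range into a first partial page, whole pages, and a last partial page; checked numerically with assumed 4 KiB pages:

  #include <stdint.h>
  #include <stdio.h>

  #define PAGE_SHIFT 12                   /* assumed 4 KiB pages */
  #define PAGE_SIZE  (1UL << PAGE_SHIFT)
  #define PAGE_MASK  (~(PAGE_SIZE - 1))   /* as in the kernel */

  int main(void)
  {
          uint64_t ofs = 5000, cnt = 10000, end = ofs + cnt;

          uint64_t idx = ofs >> PAGE_SHIFT;         /* first page: 1 */
          unsigned start_ofs = ofs & ~PAGE_MASK;    /* 904 bytes into it */
          unsigned end_ofs = end & ~PAGE_MASK;      /* 2712 bytes in last */

          printf("pages %llu..%llu, start_ofs %u, end_ofs %u\n",
                 (unsigned long long)idx,
                 (unsigned long long)(end >> PAGE_SHIFT),
                 start_ofs, end_ofs);
          return 0;
  }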
- */ - if (page_has_buffers(page)) { - struct buffer_head *bh, *head; - - bh = head = page_buffers(page); - do { - set_buffer_uptodate(bh); - } while ((bh = bh->b_this_page) != head); - } - /* Now that buffers are uptodate, set the page uptodate, too. */ - SetPageUptodate(page); - /* - * Set the page and all its buffers dirty and mark the inode - * dirty, too. The VM will write the page later on. - */ - set_page_dirty(page); - /* Finally unlock and release the page. */ - unlock_page(page); - put_page(page); - balance_dirty_pages_ratelimited(mapping); - cond_resched(); - } - /* If there is a last partial page, need to do it the slow way. */ - if (end_ofs) { - page = read_mapping_page(mapping, idx, NULL); - if (IS_ERR(page)) { - ntfs_error(vol->sb, "Failed to read last partial page " - "(error, index 0x%lx).", idx); - return PTR_ERR(page); - } - kaddr = kmap_atomic(page); - memset(kaddr, val, end_ofs); - flush_dcache_page(page); - kunmap_atomic(kaddr); - set_page_dirty(page); - put_page(page); - balance_dirty_pages_ratelimited(mapping); - cond_resched(); - } -done: - ntfs_debug("Done."); - return 0; -} - -#endif /* NTFS_RW */ diff --git a/fs/ntfs/attrib.h b/fs/ntfs/attrib.h deleted file mode 100644 index fe0890d3d0721..0000000000000 --- a/fs/ntfs/attrib.h +++ /dev/null @@ -1,102 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * attrib.h - Defines for attribute handling in NTFS Linux kernel driver. - * Part of the Linux-NTFS project. - * - * Copyright (c) 2001-2005 Anton Altaparmakov - * Copyright (c) 2002 Richard Russon - */ - -#ifndef _LINUX_NTFS_ATTRIB_H -#define _LINUX_NTFS_ATTRIB_H - -#include "endian.h" -#include "types.h" -#include "layout.h" -#include "inode.h" -#include "runlist.h" -#include "volume.h" - -/** - * ntfs_attr_search_ctx - used in attribute search functions - * @mrec: buffer containing mft record to search - * @attr: attribute record in @mrec where to begin/continue search - * @is_first: if true ntfs_attr_lookup() begins search with @attr, else after - * - * Structure must be initialized to zero before the first call to one of the - * attribute search functions. Initialize @mrec to point to the mft record to - * search, and @attr to point to the first attribute within @mrec (not necessary - * if calling the _first() functions), and set @is_first to 'true' (not necessary - * if calling the _first() functions). - * - * If @is_first is 'true', the search begins with @attr. If @is_first is 'false', - * the search begins after @attr. This is so that, after the first call to one - * of the search attribute functions, we can call the function again, without - * any modification of the search context, to automagically get the next - * matching attribute. 
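In practice that means enumeration is just repeated lookups with the same context until -ENOENT; schematically, following the callers in attrib.c rather than inventing new API (m, ni, type, name and name_len as in those callers; not compilable on its own):

  ntfs_attr_search_ctx *ctx;
  int err;

  ctx = ntfs_attr_get_search_ctx(ni, m);
  if (!ctx)
          return -ENOMEM;
  while (!(err = ntfs_attr_lookup(type, name, name_len, CASE_SENSITIVE,
                  0, NULL, 0, ctx))) {
          /* ctx->attr is the next matching attribute */
  }
  /* err == -ENOENT means end of enumeration; anything else is real */
  ntfs_attr_put_search_ctx(ctx);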
- */ -typedef struct { - MFT_RECORD *mrec; - ATTR_RECORD *attr; - bool is_first; - ntfs_inode *ntfs_ino; - ATTR_LIST_ENTRY *al_entry; - ntfs_inode *base_ntfs_ino; - MFT_RECORD *base_mrec; - ATTR_RECORD *base_attr; -} ntfs_attr_search_ctx; - -extern int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn, - ntfs_attr_search_ctx *ctx); -extern int ntfs_map_runlist(ntfs_inode *ni, VCN vcn); - -extern LCN ntfs_attr_vcn_to_lcn_nolock(ntfs_inode *ni, const VCN vcn, - const bool write_locked); - -extern runlist_element *ntfs_attr_find_vcn_nolock(ntfs_inode *ni, - const VCN vcn, ntfs_attr_search_ctx *ctx); - -int ntfs_attr_lookup(const ATTR_TYPE type, const ntfschar *name, - const u32 name_len, const IGNORE_CASE_BOOL ic, - const VCN lowest_vcn, const u8 *val, const u32 val_len, - ntfs_attr_search_ctx *ctx); - -extern int load_attribute_list(ntfs_volume *vol, runlist *rl, u8 *al_start, - const s64 size, const s64 initialized_size); - -static inline s64 ntfs_attr_size(const ATTR_RECORD *a) -{ - if (!a->non_resident) - return (s64)le32_to_cpu(a->data.resident.value_length); - return sle64_to_cpu(a->data.non_resident.data_size); -} - -extern void ntfs_attr_reinit_search_ctx(ntfs_attr_search_ctx *ctx); -extern ntfs_attr_search_ctx *ntfs_attr_get_search_ctx(ntfs_inode *ni, - MFT_RECORD *mrec); -extern void ntfs_attr_put_search_ctx(ntfs_attr_search_ctx *ctx); - -#ifdef NTFS_RW - -extern int ntfs_attr_size_bounds_check(const ntfs_volume *vol, - const ATTR_TYPE type, const s64 size); -extern int ntfs_attr_can_be_non_resident(const ntfs_volume *vol, - const ATTR_TYPE type); -extern int ntfs_attr_can_be_resident(const ntfs_volume *vol, - const ATTR_TYPE type); - -extern int ntfs_attr_record_resize(MFT_RECORD *m, ATTR_RECORD *a, u32 new_size); -extern int ntfs_resident_attr_value_resize(MFT_RECORD *m, ATTR_RECORD *a, - const u32 new_size); - -extern int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size); - -extern s64 ntfs_attr_extend_allocation(ntfs_inode *ni, s64 new_alloc_size, - const s64 new_data_size, const s64 data_start); - -extern int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, - const u8 val); - -#endif /* NTFS_RW */ - -#endif /* _LINUX_NTFS_ATTRIB_H */ diff --git a/fs/ntfs/bitmap.c b/fs/ntfs/bitmap.c deleted file mode 100644 index 0675b2400873a..0000000000000 --- a/fs/ntfs/bitmap.c +++ /dev/null @@ -1,179 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * bitmap.c - NTFS kernel bitmap handling. Part of the Linux-NTFS project. - * - * Copyright (c) 2004-2005 Anton Altaparmakov - */ - -#ifdef NTFS_RW - -#include - -#include "bitmap.h" -#include "debug.h" -#include "aops.h" -#include "ntfs.h" - -/** - * __ntfs_bitmap_set_bits_in_run - set a run of bits in a bitmap to a value - * @vi: vfs inode describing the bitmap - * @start_bit: first bit to set - * @count: number of bits to set - * @value: value to set the bits to (i.e. 0 or 1) - * @is_rollback: if 'true' this is a rollback operation - * - * Set @count bits starting at bit @start_bit in the bitmap described by the - * vfs inode @vi to @value, where @value is either 0 or 1. - * - * @is_rollback should always be 'false', it is for internal use to rollback - * errors. You probably want to use ntfs_bitmap_set_bits_in_run() instead. - * - * Return 0 on success and -errno on error. 
- */ -int __ntfs_bitmap_set_bits_in_run(struct inode *vi, const s64 start_bit, - const s64 count, const u8 value, const bool is_rollback) -{ - s64 cnt = count; - pgoff_t index, end_index; - struct address_space *mapping; - struct page *page; - u8 *kaddr; - int pos, len; - u8 bit; - - BUG_ON(!vi); - ntfs_debug("Entering for i_ino 0x%lx, start_bit 0x%llx, count 0x%llx, " - "value %u.%s", vi->i_ino, (unsigned long long)start_bit, - (unsigned long long)cnt, (unsigned int)value, - is_rollback ? " (rollback)" : ""); - BUG_ON(start_bit < 0); - BUG_ON(cnt < 0); - BUG_ON(value > 1); - /* - * Calculate the indices for the pages containing the first and last - * bits, i.e. @start_bit and @start_bit + @cnt - 1, respectively. - */ - index = start_bit >> (3 + PAGE_SHIFT); - end_index = (start_bit + cnt - 1) >> (3 + PAGE_SHIFT); - - /* Get the page containing the first bit (@start_bit). */ - mapping = vi->i_mapping; - page = ntfs_map_page(mapping, index); - if (IS_ERR(page)) { - if (!is_rollback) - ntfs_error(vi->i_sb, "Failed to map first page (error " - "%li), aborting.", PTR_ERR(page)); - return PTR_ERR(page); - } - kaddr = page_address(page); - - /* Set @pos to the position of the byte containing @start_bit. */ - pos = (start_bit >> 3) & ~PAGE_MASK; - - /* Calculate the position of @start_bit in the first byte. */ - bit = start_bit & 7; - - /* If the first byte is partial, modify the appropriate bits in it. */ - if (bit) { - u8 *byte = kaddr + pos; - while ((bit & 7) && cnt) { - cnt--; - if (value) - *byte |= 1 << bit++; - else - *byte &= ~(1 << bit++); - } - /* If we are done, unmap the page and return success. */ - if (!cnt) - goto done; - - /* Update @pos to the new position. */ - pos++; - } - /* - * Depending on @value, modify all remaining whole bytes in the page up - * to @cnt. - */ - len = min_t(s64, cnt >> 3, PAGE_SIZE - pos); - memset(kaddr + pos, value ? 0xff : 0, len); - cnt -= len << 3; - - /* Update @len to point to the first not-done byte in the page. */ - if (cnt < 8) - len += pos; - - /* If we are not in the last page, deal with all subsequent pages. */ - while (index < end_index) { - BUG_ON(cnt <= 0); - - /* Update @index and get the next page. */ - flush_dcache_page(page); - set_page_dirty(page); - ntfs_unmap_page(page); - page = ntfs_map_page(mapping, ++index); - if (IS_ERR(page)) - goto rollback; - kaddr = page_address(page); - /* - * Depending on @value, modify all remaining whole bytes in the - * page up to @cnt. - */ - len = min_t(s64, cnt >> 3, PAGE_SIZE); - memset(kaddr, value ? 0xff : 0, len); - cnt -= len << 3; - } - /* - * The currently mapped page is the last one. If the last byte is - * partial, modify the appropriate bits in it. Note, @len is the - * position of the last byte inside the page. - */ - if (cnt) { - u8 *byte; - - BUG_ON(cnt > 7); - - bit = cnt; - byte = kaddr + len; - while (bit--) { - if (value) - *byte |= 1 << bit; - else - *byte &= ~(1 << bit); - } - } -done: - /* We are done. Unmap the page and return success. */ - flush_dcache_page(page); - set_page_dirty(page); - ntfs_unmap_page(page); - ntfs_debug("Done."); - return 0; -rollback: - /* - * Current state: - * - no pages are mapped - * - @count - @cnt is the number of bits that have been modified - */ - if (is_rollback) - return PTR_ERR(page); - if (count != cnt) - pos = __ntfs_bitmap_set_bits_in_run(vi, start_bit, count - cnt, - value ? 0 : 1, true); - else - pos = 0; - if (!pos) { - /* Rollback was successful. 
*/ - ntfs_error(vi->i_sb, "Failed to map subsequent page (error " - "%li), aborting.", PTR_ERR(page)); - } else { - /* Rollback failed. */ - ntfs_error(vi->i_sb, "Failed to map subsequent page (error " - "%li) and rollback failed (error %i). " - "Aborting and leaving inconsistent metadata. " - "Unmount and run chkdsk.", PTR_ERR(page), pos); - NVolSetErrors(NTFS_SB(vi->i_sb)); - } - return PTR_ERR(page); -} - -#endif /* NTFS_RW */ diff --git a/fs/ntfs/bitmap.h b/fs/ntfs/bitmap.h deleted file mode 100644 index 9dd2224ca9c40..0000000000000 --- a/fs/ntfs/bitmap.h +++ /dev/null @@ -1,104 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * bitmap.h - Defines for NTFS kernel bitmap handling. Part of the Linux-NTFS - * project. - * - * Copyright (c) 2004 Anton Altaparmakov - */ - -#ifndef _LINUX_NTFS_BITMAP_H -#define _LINUX_NTFS_BITMAP_H - -#ifdef NTFS_RW - -#include - -#include "types.h" - -extern int __ntfs_bitmap_set_bits_in_run(struct inode *vi, const s64 start_bit, - const s64 count, const u8 value, const bool is_rollback); - -/** - * ntfs_bitmap_set_bits_in_run - set a run of bits in a bitmap to a value - * @vi: vfs inode describing the bitmap - * @start_bit: first bit to set - * @count: number of bits to set - * @value: value to set the bits to (i.e. 0 or 1) - * - * Set @count bits starting at bit @start_bit in the bitmap described by the - * vfs inode @vi to @value, where @value is either 0 or 1. - * - * Return 0 on success and -errno on error. - */ -static inline int ntfs_bitmap_set_bits_in_run(struct inode *vi, - const s64 start_bit, const s64 count, const u8 value) -{ - return __ntfs_bitmap_set_bits_in_run(vi, start_bit, count, value, - false); -} - -/** - * ntfs_bitmap_set_run - set a run of bits in a bitmap - * @vi: vfs inode describing the bitmap - * @start_bit: first bit to set - * @count: number of bits to set - * - * Set @count bits starting at bit @start_bit in the bitmap described by the - * vfs inode @vi. - * - * Return 0 on success and -errno on error. - */ -static inline int ntfs_bitmap_set_run(struct inode *vi, const s64 start_bit, - const s64 count) -{ - return ntfs_bitmap_set_bits_in_run(vi, start_bit, count, 1); -} - -/** - * ntfs_bitmap_clear_run - clear a run of bits in a bitmap - * @vi: vfs inode describing the bitmap - * @start_bit: first bit to clear - * @count: number of bits to clear - * - * Clear @count bits starting at bit @start_bit in the bitmap described by the - * vfs inode @vi. - * - * Return 0 on success and -errno on error. - */ -static inline int ntfs_bitmap_clear_run(struct inode *vi, const s64 start_bit, - const s64 count) -{ - return ntfs_bitmap_set_bits_in_run(vi, start_bit, count, 0); -} - -/** - * ntfs_bitmap_set_bit - set a bit in a bitmap - * @vi: vfs inode describing the bitmap - * @bit: bit to set - * - * Set bit @bit in the bitmap described by the vfs inode @vi. - * - * Return 0 on success and -errno on error. - */ -static inline int ntfs_bitmap_set_bit(struct inode *vi, const s64 bit) -{ - return ntfs_bitmap_set_run(vi, bit, 1); -} - -/** - * ntfs_bitmap_clear_bit - clear a bit in a bitmap - * @vi: vfs inode describing the bitmap - * @bit: bit to clear - * - * Clear bit @bit in the bitmap described by the vfs inode @vi. - * - * Return 0 on success and -errno on error. 
- */
-static inline int ntfs_bitmap_clear_bit(struct inode *vi, const s64 bit)
-{
-	return ntfs_bitmap_clear_run(vi, bit, 1);
-}
-
-#endif /* NTFS_RW */
-
-#endif /* defined _LINUX_NTFS_BITMAP_H */
diff --git a/fs/ntfs/collate.c b/fs/ntfs/collate.c
deleted file mode 100644
index 3ab6ec96abfe2..0000000000000
--- a/fs/ntfs/collate.c
+++ /dev/null
@@ -1,110 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * collate.c - NTFS kernel collation handling. Part of the Linux-NTFS project.
- *
- * Copyright (c) 2004 Anton Altaparmakov
- */
-
-#include "collate.h"
-#include "debug.h"
-#include "ntfs.h"
-
-static int ntfs_collate_binary(ntfs_volume *vol,
-		const void *data1, const int data1_len,
-		const void *data2, const int data2_len)
-{
-	int rc;
-
-	ntfs_debug("Entering.");
-	rc = memcmp(data1, data2, min(data1_len, data2_len));
-	if (!rc && (data1_len != data2_len)) {
-		if (data1_len < data2_len)
-			rc = -1;
-		else
-			rc = 1;
-	}
-	ntfs_debug("Done, returning %i", rc);
-	return rc;
-}
-
-static int ntfs_collate_ntofs_ulong(ntfs_volume *vol,
-		const void *data1, const int data1_len,
-		const void *data2, const int data2_len)
-{
-	int rc;
-	u32 d1, d2;
-
-	ntfs_debug("Entering.");
-	// FIXME: We don't really want to bug here.
-	BUG_ON(data1_len != data2_len);
-	BUG_ON(data1_len != 4);
-	d1 = le32_to_cpup(data1);
-	d2 = le32_to_cpup(data2);
-	if (d1 < d2)
-		rc = -1;
-	else {
-		if (d1 == d2)
-			rc = 0;
-		else
-			rc = 1;
-	}
-	ntfs_debug("Done, returning %i", rc);
-	return rc;
-}
-
-typedef int (*ntfs_collate_func_t)(ntfs_volume *, const void *, const int,
-		const void *, const int);
-
-static ntfs_collate_func_t ntfs_do_collate0x0[3] = {
-	ntfs_collate_binary,
-	NULL/*ntfs_collate_file_name*/,
-	NULL/*ntfs_collate_unicode_string*/,
-};
-
-static ntfs_collate_func_t ntfs_do_collate0x1[4] = {
-	ntfs_collate_ntofs_ulong,
-	NULL/*ntfs_collate_ntofs_sid*/,
-	NULL/*ntfs_collate_ntofs_security_hash*/,
-	NULL/*ntfs_collate_ntofs_ulongs*/,
-};
-
-/**
- * ntfs_collate - collate two data items using a specified collation rule
- * @vol:	ntfs volume to which the data items belong
- * @cr:		collation rule to use when comparing the items
- * @data1:	first data item to collate
- * @data1_len:	length in bytes of @data1
- * @data2:	second data item to collate
- * @data2_len:	length in bytes of @data2
- *
- * Collate the two data items @data1 and @data2 using the collation rule @cr
- * and return -1, 0, or 1 if @data1 is found, respectively, to collate before,
- * to match, or to collate after @data2.
- *
- * For speed we use the collation rule @cr as an index into two tables of
- * function pointers to call the appropriate collation function.
- */
-int ntfs_collate(ntfs_volume *vol, COLLATION_RULE cr,
-		const void *data1, const int data1_len,
-		const void *data2, const int data2_len) {
-	int i;
-
-	ntfs_debug("Entering.");
-	/*
-	 * FIXME: At the moment we only support COLLATION_BINARY and
-	 * COLLATION_NTOFS_ULONG, so we BUG() for everything else for now.
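The dispatch below relies on the on-disk numbering of the collation rules. Assuming the NTFS specification values COLLATION_BINARY = 0x00 and COLLATION_NTOFS_ULONG = 0x10 (the constants are defined elsewhere, not in this hunk), the table-index arithmetic can be exercised standalone:

	#include <stdio.h>

	int main(void)
	{
		unsigned int cr = 0x10;	/* COLLATION_NTOFS_ULONG (assumed value). */

		if (cr <= 0x02)
			printf("ntfs_do_collate0x0[%u]\n", cr);
		else if (cr >= 0x10 && cr <= 0x13)
			printf("ntfs_do_collate0x1[%u]\n", cr - 0x10);
		else
			printf("unsupported collation rule\n");
		return 0;
	}

For cr = 0x10 this prints ntfs_do_collate0x1[0], i.e. the slot holding ntfs_collate_ntofs_ulong.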
- */ - BUG_ON(cr != COLLATION_BINARY && cr != COLLATION_NTOFS_ULONG); - i = le32_to_cpu(cr); - BUG_ON(i < 0); - if (i <= 0x02) - return ntfs_do_collate0x0[i](vol, data1, data1_len, - data2, data2_len); - BUG_ON(i < 0x10); - i -= 0x10; - if (likely(i <= 3)) - return ntfs_do_collate0x1[i](vol, data1, data1_len, - data2, data2_len); - BUG(); - return 0; -} diff --git a/fs/ntfs/collate.h b/fs/ntfs/collate.h deleted file mode 100644 index f2255619b4f4a..0000000000000 --- a/fs/ntfs/collate.h +++ /dev/null @@ -1,36 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * collate.h - Defines for NTFS kernel collation handling. Part of the - * Linux-NTFS project. - * - * Copyright (c) 2004 Anton Altaparmakov - */ - -#ifndef _LINUX_NTFS_COLLATE_H -#define _LINUX_NTFS_COLLATE_H - -#include "types.h" -#include "volume.h" - -static inline bool ntfs_is_collation_rule_supported(COLLATION_RULE cr) { - int i; - - /* - * FIXME: At the moment we only support COLLATION_BINARY and - * COLLATION_NTOFS_ULONG, so we return false for everything else for - * now. - */ - if (unlikely(cr != COLLATION_BINARY && cr != COLLATION_NTOFS_ULONG)) - return false; - i = le32_to_cpu(cr); - if (likely(((i >= 0) && (i <= 0x02)) || - ((i >= 0x10) && (i <= 0x13)))) - return true; - return false; -} - -extern int ntfs_collate(ntfs_volume *vol, COLLATION_RULE cr, - const void *data1, const int data1_len, - const void *data2, const int data2_len); - -#endif /* _LINUX_NTFS_COLLATE_H */ diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c deleted file mode 100644 index 761aaa0195d66..0000000000000 --- a/fs/ntfs/compress.c +++ /dev/null @@ -1,950 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * compress.c - NTFS kernel compressed attributes handling. - * Part of the Linux-NTFS project. - * - * Copyright (c) 2001-2004 Anton Altaparmakov - * Copyright (c) 2002 Richard Russon - */ - -#include -#include -#include -#include -#include - -#include "attrib.h" -#include "inode.h" -#include "debug.h" -#include "ntfs.h" - -/** - * ntfs_compression_constants - enum of constants used in the compression code - */ -typedef enum { - /* Token types and access mask. */ - NTFS_SYMBOL_TOKEN = 0, - NTFS_PHRASE_TOKEN = 1, - NTFS_TOKEN_MASK = 1, - - /* Compression sub-block constants. */ - NTFS_SB_SIZE_MASK = 0x0fff, - NTFS_SB_SIZE = 0x1000, - NTFS_SB_IS_COMPRESSED = 0x8000, - - /* - * The maximum compression block size is by definition 16 * the cluster - * size, with the maximum supported cluster size being 4kiB. Thus the - * maximum compression buffer size is 64kiB, so we use this when - * initializing the compression buffer. - */ - NTFS_MAX_CB_SIZE = 64 * 1024, -} ntfs_compression_constants; - -/* - * ntfs_compression_buffer - one buffer for the decompression engine - */ -static u8 *ntfs_compression_buffer; - -/* - * ntfs_cb_lock - spinlock which protects ntfs_compression_buffer - */ -static DEFINE_SPINLOCK(ntfs_cb_lock); - -/** - * allocate_compression_buffers - allocate the decompression buffers - * - * Caller has to hold the ntfs_lock mutex. - * - * Return 0 on success or -ENOMEM if the allocations failed. - */ -int allocate_compression_buffers(void) -{ - BUG_ON(ntfs_compression_buffer); - - ntfs_compression_buffer = vmalloc(NTFS_MAX_CB_SIZE); - if (!ntfs_compression_buffer) - return -ENOMEM; - return 0; -} - -/** - * free_compression_buffers - free the decompression buffers - * - * Caller has to hold the ntfs_lock mutex. 
- */
-void free_compression_buffers(void)
-{
-	BUG_ON(!ntfs_compression_buffer);
-	vfree(ntfs_compression_buffer);
-	ntfs_compression_buffer = NULL;
-}
-
-/**
- * zero_partial_compressed_page - zero out of bounds compressed page region
- */
-static void zero_partial_compressed_page(struct page *page,
-		const s64 initialized_size)
-{
-	u8 *kp = page_address(page);
-	unsigned int kp_ofs;
-
-	ntfs_debug("Zeroing page region outside initialized size.");
-	if (((s64)page->index << PAGE_SHIFT) >= initialized_size) {
-		clear_page(kp);
-		return;
-	}
-	kp_ofs = initialized_size & ~PAGE_MASK;
-	memset(kp + kp_ofs, 0, PAGE_SIZE - kp_ofs);
-	return;
-}
-
-/**
- * handle_bounds_compressed_page - test for & handle out of bounds compressed page
- */
-static inline void handle_bounds_compressed_page(struct page *page,
-		const loff_t i_size, const s64 initialized_size)
-{
-	if ((page->index >= (initialized_size >> PAGE_SHIFT)) &&
-			(initialized_size < i_size))
-		zero_partial_compressed_page(page, initialized_size);
-	return;
-}
-
-/**
- * ntfs_decompress - decompress a compression block into an array of pages
- * @dest_pages:		destination array of pages
- * @completed_pages:	scratch space to track completed pages
- * @dest_index:		current index into @dest_pages (IN/OUT)
- * @dest_ofs:		current offset within @dest_pages[@dest_index] (IN/OUT)
- * @dest_max_index:	maximum index into @dest_pages (IN)
- * @dest_max_ofs:	maximum offset within @dest_pages[@dest_max_index] (IN)
- * @xpage:		the target page (-1 if none) (IN)
- * @xpage_done:		set to 1 if xpage was completed successfully (IN/OUT)
- * @cb_start:		compression block to decompress (IN)
- * @cb_size:		size of compression block @cb_start in bytes (IN)
- * @i_size:		file size when we started the read (IN)
- * @initialized_size:	initialized file size when we started the read (IN)
- *
- * The caller must have disabled preemption. ntfs_decompress() reenables it when
- * the critical section is finished.
- *
- * This decompresses the compression block @cb_start into the array of
- * destination pages @dest_pages starting at index @dest_index into @dest_pages
- * and at offset @dest_ofs into the page @dest_pages[@dest_index].
- *
- * When the page @dest_pages[@xpage] is completed, @xpage_done is set to 1.
- * If xpage is -1 or @xpage has not been completed, @xpage_done is not modified.
- *
- * @cb_start is a pointer to the compression block which needs decompressing
- * and @cb_size is the size of @cb_start in bytes (8-64kiB).
- *
- * Return 0 on success or -EOVERFLOW on error in the compressed stream.
- * @xpage_done indicates whether the target page (@dest_pages[@xpage]) was
- * completed during the decompression of the compression block (@cb_start).
- *
- * Warning: This function *REQUIRES* PAGE_SIZE >= 4096 or it will blow up
- * unpredictably! You have been warned!
- *
- * Note to hackers: This function may not sleep until it has finished accessing
- * the compression block @cb_start as it is a per-CPU buffer.
- */
-static int ntfs_decompress(struct page *dest_pages[], int completed_pages[],
-		int *dest_index, int *dest_ofs, const int dest_max_index,
-		const int dest_max_ofs, const int xpage, char *xpage_done,
-		u8 *const cb_start, const u32 cb_size, const loff_t i_size,
-		const s64 initialized_size)
-{
-	/*
-	 * Pointers into the compressed data, i.e. the compression block (cb),
-	 * and the therein contained sub-blocks (sb).
-	 */
-	u8 *cb_end = cb_start + cb_size; /* End of cb. */
-	u8 *cb = cb_start;	/* Current position in cb.
*/ - u8 *cb_sb_start; /* Beginning of the current sb in the cb. */ - u8 *cb_sb_end; /* End of current sb / beginning of next sb. */ - - /* Variables for uncompressed data / destination. */ - struct page *dp; /* Current destination page being worked on. */ - u8 *dp_addr; /* Current pointer into dp. */ - u8 *dp_sb_start; /* Start of current sub-block in dp. */ - u8 *dp_sb_end; /* End of current sb in dp (dp_sb_start + - NTFS_SB_SIZE). */ - u16 do_sb_start; /* @dest_ofs when starting this sub-block. */ - u16 do_sb_end; /* @dest_ofs of end of this sb (do_sb_start + - NTFS_SB_SIZE). */ - - /* Variables for tag and token parsing. */ - u8 tag; /* Current tag. */ - int token; /* Loop counter for the eight tokens in tag. */ - int nr_completed_pages = 0; - - /* Default error code. */ - int err = -EOVERFLOW; - - ntfs_debug("Entering, cb_size = 0x%x.", cb_size); -do_next_sb: - ntfs_debug("Beginning sub-block at offset = 0x%zx in the cb.", - cb - cb_start); - /* - * Have we reached the end of the compression block or the end of the - * decompressed data? The latter can happen for example if the current - * position in the compression block is one byte before its end so the - * first two checks do not detect it. - */ - if (cb == cb_end || !le16_to_cpup((le16*)cb) || - (*dest_index == dest_max_index && - *dest_ofs == dest_max_ofs)) { - int i; - - ntfs_debug("Completed. Returning success (0)."); - err = 0; -return_error: - /* We can sleep from now on, so we drop lock. */ - spin_unlock(&ntfs_cb_lock); - /* Second stage: finalize completed pages. */ - if (nr_completed_pages > 0) { - for (i = 0; i < nr_completed_pages; i++) { - int di = completed_pages[i]; - - dp = dest_pages[di]; - /* - * If we are outside the initialized size, zero - * the out of bounds page range. - */ - handle_bounds_compressed_page(dp, i_size, - initialized_size); - flush_dcache_page(dp); - kunmap(dp); - SetPageUptodate(dp); - unlock_page(dp); - if (di == xpage) - *xpage_done = 1; - else - put_page(dp); - dest_pages[di] = NULL; - } - } - return err; - } - - /* Setup offsets for the current sub-block destination. */ - do_sb_start = *dest_ofs; - do_sb_end = do_sb_start + NTFS_SB_SIZE; - - /* Check that we are still within allowed boundaries. */ - if (*dest_index == dest_max_index && do_sb_end > dest_max_ofs) - goto return_overflow; - - /* Does the minimum size of a compressed sb overflow valid range? */ - if (cb + 6 > cb_end) - goto return_overflow; - - /* Setup the current sub-block source pointers and validate range. */ - cb_sb_start = cb; - cb_sb_end = cb_sb_start + (le16_to_cpup((le16*)cb) & NTFS_SB_SIZE_MASK) - + 3; - if (cb_sb_end > cb_end) - goto return_overflow; - - /* Get the current destination page. */ - dp = dest_pages[*dest_index]; - if (!dp) { - /* No page present. Skip decompression of this sub-block. */ - cb = cb_sb_end; - - /* Advance destination position to next sub-block. */ - *dest_ofs = (*dest_ofs + NTFS_SB_SIZE) & ~PAGE_MASK; - if (!*dest_ofs && (++*dest_index > dest_max_index)) - goto return_overflow; - goto do_next_sb; - } - - /* We have a valid destination page. Setup the destination pointers. */ - dp_addr = (u8*)page_address(dp) + do_sb_start; - - /* Now, we are ready to process the current sub-block (sb). */ - if (!(le16_to_cpup((le16*)cb) & NTFS_SB_IS_COMPRESSED)) { - ntfs_debug("Found uncompressed sub-block."); - /* This sb is not compressed, just copy it into destination. */ - - /* Advance source position to first data byte. */ - cb += 2; - - /* An uncompressed sb must be full size. 
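Each sub-block starts with the 16-bit little-endian header consumed above; it packs the sub-block length into the low 12 bits and the compressed flag into the top bit. A standalone userspace parser using the NTFS_SB_SIZE_MASK (0x0fff) and NTFS_SB_IS_COMPRESSED (0x8000) values from the enum earlier in this file:

	#include <stdbool.h>
	#include <stdint.h>

	struct sb_header {
		unsigned int size;	/* Total sub-block bytes, header included. */
		bool compressed;
	};

	static struct sb_header parse_sb_header(const uint8_t *cb)
	{
		uint16_t h = (uint16_t)cb[0] | ((uint16_t)cb[1] << 8);
		struct sb_header sb = {
			.size		= (h & 0x0fff) + 3,	/* NTFS_SB_SIZE_MASK */
			.compressed	= (h & 0x8000) != 0,	/* NTFS_SB_IS_COMPRESSED */
		};
		return sb;
	}

The "+ 3" makes (h & 0x0fff) + 3 the full span of the sub-block including its own two header bytes, which is exactly how cb_sb_end is derived from cb_sb_start above.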
*/ - if (cb_sb_end - cb != NTFS_SB_SIZE) - goto return_overflow; - - /* Copy the block and advance the source position. */ - memcpy(dp_addr, cb, NTFS_SB_SIZE); - cb += NTFS_SB_SIZE; - - /* Advance destination position to next sub-block. */ - *dest_ofs += NTFS_SB_SIZE; - if (!(*dest_ofs &= ~PAGE_MASK)) { -finalize_page: - /* - * First stage: add current page index to array of - * completed pages. - */ - completed_pages[nr_completed_pages++] = *dest_index; - if (++*dest_index > dest_max_index) - goto return_overflow; - } - goto do_next_sb; - } - ntfs_debug("Found compressed sub-block."); - /* This sb is compressed, decompress it into destination. */ - - /* Setup destination pointers. */ - dp_sb_start = dp_addr; - dp_sb_end = dp_sb_start + NTFS_SB_SIZE; - - /* Forward to the first tag in the sub-block. */ - cb += 2; -do_next_tag: - if (cb == cb_sb_end) { - /* Check if the decompressed sub-block was not full-length. */ - if (dp_addr < dp_sb_end) { - int nr_bytes = do_sb_end - *dest_ofs; - - ntfs_debug("Filling incomplete sub-block with " - "zeroes."); - /* Zero remainder and update destination position. */ - memset(dp_addr, 0, nr_bytes); - *dest_ofs += nr_bytes; - } - /* We have finished the current sub-block. */ - if (!(*dest_ofs &= ~PAGE_MASK)) - goto finalize_page; - goto do_next_sb; - } - - /* Check we are still in range. */ - if (cb > cb_sb_end || dp_addr > dp_sb_end) - goto return_overflow; - - /* Get the next tag and advance to first token. */ - tag = *cb++; - - /* Parse the eight tokens described by the tag. */ - for (token = 0; token < 8; token++, tag >>= 1) { - u16 lg, pt, length, max_non_overlap; - register u16 i; - u8 *dp_back_addr; - - /* Check if we are done / still in range. */ - if (cb >= cb_sb_end || dp_addr > dp_sb_end) - break; - - /* Determine token type and parse appropriately.*/ - if ((tag & NTFS_TOKEN_MASK) == NTFS_SYMBOL_TOKEN) { - /* - * We have a symbol token, copy the symbol across, and - * advance the source and destination positions. - */ - *dp_addr++ = *cb++; - ++*dest_ofs; - - /* Continue with the next token. */ - continue; - } - - /* - * We have a phrase token. Make sure it is not the first tag in - * the sb as this is illegal and would confuse the code below. - */ - if (dp_addr == dp_sb_start) - goto return_overflow; - - /* - * Determine the number of bytes to go back (p) and the number - * of bytes to copy (l). We use an optimized algorithm in which - * we first calculate log2(current destination position in sb), - * which allows determination of l and p in O(1) rather than - * O(n). We just need an arch-optimized log2() function now. - */ - lg = 0; - for (i = *dest_ofs - do_sb_start - 1; i >= 0x10; i >>= 1) - lg++; - - /* Get the phrase token into i. */ - pt = le16_to_cpup((le16*)cb); - - /* - * Calculate starting position of the byte sequence in - * the destination using the fact that p = (pt >> (12 - lg)) + 1 - * and make sure we don't go too far back. - */ - dp_back_addr = dp_addr - (pt >> (12 - lg)) - 1; - if (dp_back_addr < dp_sb_start) - goto return_overflow; - - /* Now calculate the length of the byte sequence. */ - length = (pt & (0xfff >> lg)) + 3; - - /* Advance destination position and verify it is in range. */ - *dest_ofs += length; - if (*dest_ofs > do_sb_end) - goto return_overflow; - - /* The number of non-overlapping bytes. */ - max_non_overlap = dp_addr - dp_back_addr; - - if (length <= max_non_overlap) { - /* The byte sequence doesn't overlap, just copy it. */ - memcpy(dp_addr, dp_back_addr, length); - - /* Advance destination pointer. 
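The displacement/length split computed just above is the one genuinely subtle piece of arithmetic in this function, so here it is isolated as a runnable userspace helper; dest_ofs_in_sb is the current output offset within the 4096-byte sub-block and must be at least 1:

	#include <stdint.h>

	static void decode_phrase_token(unsigned int dest_ofs_in_sb, uint16_t pt,
					unsigned int *p, unsigned int *l)
	{
		unsigned int lg = 0, i;

		/* lg grows by one each time the reachable history doubles. */
		for (i = dest_ofs_in_sb - 1; i >= 0x10; i >>= 1)
			lg++;
		*p = (pt >> (12 - lg)) + 1;	/* Bytes to go back. */
		*l = (pt & (0xfff >> lg)) + 3;	/* Bytes to copy. */
	}

At offset 0x20 into a sub-block, for example, lg comes out as 1, so the token splits into 5 displacement bits and 11 length bits; as the offset grows, length bits are progressively traded for displacement bits.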
*/ - dp_addr += length; - } else { - /* - * The byte sequence does overlap, copy non-overlapping - * part and then do a slow byte by byte copy for the - * overlapping part. Also, advance the destination - * pointer. - */ - memcpy(dp_addr, dp_back_addr, max_non_overlap); - dp_addr += max_non_overlap; - dp_back_addr += max_non_overlap; - length -= max_non_overlap; - while (length--) - *dp_addr++ = *dp_back_addr++; - } - - /* Advance source position and continue with the next token. */ - cb += 2; - } - - /* No tokens left in the current tag. Continue with the next tag. */ - goto do_next_tag; - -return_overflow: - ntfs_error(NULL, "Failed. Returning -EOVERFLOW."); - goto return_error; -} - -/** - * ntfs_read_compressed_block - read a compressed block into the page cache - * @page: locked page in the compression block(s) we need to read - * - * When we are called the page has already been verified to be locked and the - * attribute is known to be non-resident, not encrypted, but compressed. - * - * 1. Determine which compression block(s) @page is in. - * 2. Get hold of all pages corresponding to this/these compression block(s). - * 3. Read the (first) compression block. - * 4. Decompress it into the corresponding pages. - * 5. Throw the compressed data away and proceed to 3. for the next compression - * block or return success if no more compression blocks left. - * - * Warning: We have to be careful what we do about existing pages. They might - * have been written to so that we would lose data if we were to just overwrite - * them with the out-of-date uncompressed data. - * - * FIXME: For PAGE_SIZE > cb_size we are not doing the Right Thing(TM) at - * the end of the file I think. We need to detect this case and zero the out - * of bounds remainder of the page in question and mark it as handled. At the - * moment we would just return -EIO on such a page. This bug will only become - * apparent if pages are above 8kiB and the NTFS volume only uses 512 byte - * clusters so is probably not going to be seen by anyone. Still this should - * be fixed. (AIA) - * - * FIXME: Again for PAGE_SIZE > cb_size we are screwing up both in - * handling sparse and compressed cbs. (AIA) - * - * FIXME: At the moment we don't do any zeroing out in the case that - * initialized_size is less than data_size. This should be safe because of the - * nature of the compression algorithm used. Just in case we check and output - * an error message in read inode if the two sizes are not equal for a - * compressed file. (AIA) - */ -int ntfs_read_compressed_block(struct page *page) -{ - loff_t i_size; - s64 initialized_size; - struct address_space *mapping = page->mapping; - ntfs_inode *ni = NTFS_I(mapping->host); - ntfs_volume *vol = ni->vol; - struct super_block *sb = vol->sb; - runlist_element *rl; - unsigned long flags, block_size = sb->s_blocksize; - unsigned char block_size_bits = sb->s_blocksize_bits; - u8 *cb, *cb_pos, *cb_end; - struct buffer_head **bhs; - unsigned long offset, index = page->index; - u32 cb_size = ni->itype.compressed.block_size; - u64 cb_size_mask = cb_size - 1UL; - VCN vcn; - LCN lcn; - /* The first wanted vcn (minimum alignment is PAGE_SIZE). */ - VCN start_vcn = (((s64)index << PAGE_SHIFT) & ~cb_size_mask) >> - vol->cluster_size_bits; - /* - * The first vcn after the last wanted vcn (minimum alignment is again - * PAGE_SIZE. 
- */ - VCN end_vcn = ((((s64)(index + 1UL) << PAGE_SHIFT) + cb_size - 1) - & ~cb_size_mask) >> vol->cluster_size_bits; - /* Number of compression blocks (cbs) in the wanted vcn range. */ - unsigned int nr_cbs = (end_vcn - start_vcn) << vol->cluster_size_bits - >> ni->itype.compressed.block_size_bits; - /* - * Number of pages required to store the uncompressed data from all - * compression blocks (cbs) overlapping @page. Due to alignment - * guarantees of start_vcn and end_vcn, no need to round up here. - */ - unsigned int nr_pages = (end_vcn - start_vcn) << - vol->cluster_size_bits >> PAGE_SHIFT; - unsigned int xpage, max_page, cur_page, cur_ofs, i; - unsigned int cb_clusters, cb_max_ofs; - int block, max_block, cb_max_page, bhs_size, nr_bhs, err = 0; - struct page **pages; - int *completed_pages; - unsigned char xpage_done = 0; - - ntfs_debug("Entering, page->index = 0x%lx, cb_size = 0x%x, nr_pages = " - "%i.", index, cb_size, nr_pages); - /* - * Bad things happen if we get here for anything that is not an - * unnamed $DATA attribute. - */ - BUG_ON(ni->type != AT_DATA); - BUG_ON(ni->name_len); - - pages = kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS); - completed_pages = kmalloc_array(nr_pages + 1, sizeof(int), GFP_NOFS); - - /* Allocate memory to store the buffer heads we need. */ - bhs_size = cb_size / block_size * sizeof(struct buffer_head *); - bhs = kmalloc(bhs_size, GFP_NOFS); - - if (unlikely(!pages || !bhs || !completed_pages)) { - kfree(bhs); - kfree(pages); - kfree(completed_pages); - unlock_page(page); - ntfs_error(vol->sb, "Failed to allocate internal buffers."); - return -ENOMEM; - } - - /* - * We have already been given one page, this is the one we must do. - * Once again, the alignment guarantees keep it simple. - */ - offset = start_vcn << vol->cluster_size_bits >> PAGE_SHIFT; - xpage = index - offset; - pages[xpage] = page; - /* - * The remaining pages need to be allocated and inserted into the page - * cache, alignment guarantees keep all the below much simpler. (-8 - */ - read_lock_irqsave(&ni->size_lock, flags); - i_size = i_size_read(VFS_I(ni)); - initialized_size = ni->initialized_size; - read_unlock_irqrestore(&ni->size_lock, flags); - max_page = ((i_size + PAGE_SIZE - 1) >> PAGE_SHIFT) - - offset; - /* Is the page fully outside i_size? (truncate in progress) */ - if (xpage >= max_page) { - kfree(bhs); - kfree(pages); - kfree(completed_pages); - zero_user(page, 0, PAGE_SIZE); - ntfs_debug("Compressed read outside i_size - truncated?"); - SetPageUptodate(page); - unlock_page(page); - return 0; - } - if (nr_pages < max_page) - max_page = nr_pages; - for (i = 0; i < max_page; i++, offset++) { - if (i != xpage) - pages[i] = grab_cache_page_nowait(mapping, offset); - page = pages[i]; - if (page) { - /* - * We only (re)read the page if it isn't already read - * in and/or dirty or we would be losing data or at - * least wasting our time. - */ - if (!PageDirty(page) && (!PageUptodate(page) || - PageError(page))) { - ClearPageError(page); - kmap(page); - continue; - } - unlock_page(page); - put_page(page); - pages[i] = NULL; - } - } - - /* - * We have the runlist, and all the destination pages we need to fill. - * Now read the first compression block. - */ - cur_page = 0; - cur_ofs = 0; - cb_clusters = ni->itype.compressed.block_clusters; -do_next_cb: - nr_cbs--; - nr_bhs = 0; - - /* Read all cb buffer heads one cluster at a time. 
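The start_vcn/end_vcn alignment arithmetic above is easier to trust with concrete numbers. A throwaway userspace program, assuming 4KiB pages, 4KiB clusters and the maximum 64KiB compression block described in the enum at the top of this file, reproduces it:

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		/* Assumed geometry: 4KiB pages and clusters, 64KiB cb. */
		const unsigned int page_shift = 12, cluster_size_bits = 12;
		const uint64_t cb_size = 64 * 1024, cb_size_mask = cb_size - 1;
		unsigned long index = 37;	/* Page being read. */

		uint64_t start_vcn = (((uint64_t)index << page_shift)
				& ~cb_size_mask) >> cluster_size_bits;
		uint64_t end_vcn = ((((uint64_t)(index + 1) << page_shift)
				+ cb_size - 1) & ~cb_size_mask) >> cluster_size_bits;

		printf("vcn range [0x%llx, 0x%llx), %llu pages\n",
		       (unsigned long long)start_vcn,
		       (unsigned long long)end_vcn,
		       (unsigned long long)((end_vcn - start_vcn)
				<< cluster_size_bits >> page_shift));
		return 0;
	}

For page index 37 this prints [0x20, 0x30) and 16 pages: exactly the one 64KiB compression block overlapping the page, which is why no rounding up is needed when nr_pages is computed.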
*/ - rl = NULL; - for (vcn = start_vcn, start_vcn += cb_clusters; vcn < start_vcn; - vcn++) { - bool is_retry = false; - - if (!rl) { -lock_retry_remap: - down_read(&ni->runlist.lock); - rl = ni->runlist.rl; - } - if (likely(rl != NULL)) { - /* Seek to element containing target vcn. */ - while (rl->length && rl[1].vcn <= vcn) - rl++; - lcn = ntfs_rl_vcn_to_lcn(rl, vcn); - } else - lcn = LCN_RL_NOT_MAPPED; - ntfs_debug("Reading vcn = 0x%llx, lcn = 0x%llx.", - (unsigned long long)vcn, - (unsigned long long)lcn); - if (lcn < 0) { - /* - * When we reach the first sparse cluster we have - * finished with the cb. - */ - if (lcn == LCN_HOLE) - break; - if (is_retry || lcn != LCN_RL_NOT_MAPPED) - goto rl_err; - is_retry = true; - /* - * Attempt to map runlist, dropping lock for the - * duration. - */ - up_read(&ni->runlist.lock); - if (!ntfs_map_runlist(ni, vcn)) - goto lock_retry_remap; - goto map_rl_err; - } - block = lcn << vol->cluster_size_bits >> block_size_bits; - /* Read the lcn from device in chunks of block_size bytes. */ - max_block = block + (vol->cluster_size >> block_size_bits); - do { - ntfs_debug("block = 0x%x.", block); - if (unlikely(!(bhs[nr_bhs] = sb_getblk(sb, block)))) - goto getblk_err; - nr_bhs++; - } while (++block < max_block); - } - - /* Release the lock if we took it. */ - if (rl) - up_read(&ni->runlist.lock); - - /* Setup and initiate io on all buffer heads. */ - for (i = 0; i < nr_bhs; i++) { - struct buffer_head *tbh = bhs[i]; - - if (!trylock_buffer(tbh)) - continue; - if (unlikely(buffer_uptodate(tbh))) { - unlock_buffer(tbh); - continue; - } - get_bh(tbh); - tbh->b_end_io = end_buffer_read_sync; - submit_bh(REQ_OP_READ, tbh); - } - - /* Wait for io completion on all buffer heads. */ - for (i = 0; i < nr_bhs; i++) { - struct buffer_head *tbh = bhs[i]; - - if (buffer_uptodate(tbh)) - continue; - wait_on_buffer(tbh); - /* - * We need an optimization barrier here, otherwise we start - * hitting the below fixup code when accessing a loopback - * mounted ntfs partition. This indicates either there is a - * race condition in the loop driver or, more likely, gcc - * overoptimises the code without the barrier and it doesn't - * do the Right Thing(TM). - */ - barrier(); - if (unlikely(!buffer_uptodate(tbh))) { - ntfs_warning(vol->sb, "Buffer is unlocked but not " - "uptodate! Unplugging the disk queue " - "and rescheduling."); - get_bh(tbh); - io_schedule(); - put_bh(tbh); - if (unlikely(!buffer_uptodate(tbh))) - goto read_err; - ntfs_warning(vol->sb, "Buffer is now uptodate. Good."); - } - } - - /* - * Get the compression buffer. We must not sleep any more - * until we are finished with it. - */ - spin_lock(&ntfs_cb_lock); - cb = ntfs_compression_buffer; - - BUG_ON(!cb); - - cb_pos = cb; - cb_end = cb + cb_size; - - /* Copy the buffer heads into the contiguous buffer. */ - for (i = 0; i < nr_bhs; i++) { - memcpy(cb_pos, bhs[i]->b_data, block_size); - cb_pos += block_size; - } - - /* Just a precaution. */ - if (cb_pos + 2 <= cb + cb_size) - *(u16*)cb_pos = 0; - - /* Reset cb_pos back to the beginning. */ - cb_pos = cb; - - /* We now have both source (if present) and destination. */ - ntfs_debug("Successfully read the compression block."); - - /* The last page and maximum offset within it for the current cb. */ - cb_max_page = (cur_page << PAGE_SHIFT) + cur_ofs + cb_size; - cb_max_ofs = cb_max_page & ~PAGE_MASK; - cb_max_page >>= PAGE_SHIFT; - - /* Catch end of file inside a compression block. 
*/ - if (cb_max_page > max_page) - cb_max_page = max_page; - - if (vcn == start_vcn - cb_clusters) { - /* Sparse cb, zero out page range overlapping the cb. */ - ntfs_debug("Found sparse compression block."); - /* We can sleep from now on, so we drop lock. */ - spin_unlock(&ntfs_cb_lock); - if (cb_max_ofs) - cb_max_page--; - for (; cur_page < cb_max_page; cur_page++) { - page = pages[cur_page]; - if (page) { - if (likely(!cur_ofs)) - clear_page(page_address(page)); - else - memset(page_address(page) + cur_ofs, 0, - PAGE_SIZE - - cur_ofs); - flush_dcache_page(page); - kunmap(page); - SetPageUptodate(page); - unlock_page(page); - if (cur_page == xpage) - xpage_done = 1; - else - put_page(page); - pages[cur_page] = NULL; - } - cb_pos += PAGE_SIZE - cur_ofs; - cur_ofs = 0; - if (cb_pos >= cb_end) - break; - } - /* If we have a partial final page, deal with it now. */ - if (cb_max_ofs && cb_pos < cb_end) { - page = pages[cur_page]; - if (page) - memset(page_address(page) + cur_ofs, 0, - cb_max_ofs - cur_ofs); - /* - * No need to update cb_pos at this stage: - * cb_pos += cb_max_ofs - cur_ofs; - */ - cur_ofs = cb_max_ofs; - } - } else if (vcn == start_vcn) { - /* We can't sleep so we need two stages. */ - unsigned int cur2_page = cur_page; - unsigned int cur_ofs2 = cur_ofs; - u8 *cb_pos2 = cb_pos; - - ntfs_debug("Found uncompressed compression block."); - /* Uncompressed cb, copy it to the destination pages. */ - /* - * TODO: As a big optimization, we could detect this case - * before we read all the pages and use block_read_full_folio() - * on all full pages instead (we still have to treat partial - * pages especially but at least we are getting rid of the - * synchronous io for the majority of pages. - * Or if we choose not to do the read-ahead/-behind stuff, we - * could just return block_read_full_folio(pages[xpage]) as long - * as PAGE_SIZE <= cb_size. - */ - if (cb_max_ofs) - cb_max_page--; - /* First stage: copy data into destination pages. */ - for (; cur_page < cb_max_page; cur_page++) { - page = pages[cur_page]; - if (page) - memcpy(page_address(page) + cur_ofs, cb_pos, - PAGE_SIZE - cur_ofs); - cb_pos += PAGE_SIZE - cur_ofs; - cur_ofs = 0; - if (cb_pos >= cb_end) - break; - } - /* If we have a partial final page, deal with it now. */ - if (cb_max_ofs && cb_pos < cb_end) { - page = pages[cur_page]; - if (page) - memcpy(page_address(page) + cur_ofs, cb_pos, - cb_max_ofs - cur_ofs); - cb_pos += cb_max_ofs - cur_ofs; - cur_ofs = cb_max_ofs; - } - /* We can sleep from now on, so drop lock. */ - spin_unlock(&ntfs_cb_lock); - /* Second stage: finalize pages. */ - for (; cur2_page < cb_max_page; cur2_page++) { - page = pages[cur2_page]; - if (page) { - /* - * If we are outside the initialized size, zero - * the out of bounds page range. - */ - handle_bounds_compressed_page(page, i_size, - initialized_size); - flush_dcache_page(page); - kunmap(page); - SetPageUptodate(page); - unlock_page(page); - if (cur2_page == xpage) - xpage_done = 1; - else - put_page(page); - pages[cur2_page] = NULL; - } - cb_pos2 += PAGE_SIZE - cur_ofs2; - cur_ofs2 = 0; - if (cb_pos2 >= cb_end) - break; - } - } else { - /* Compressed cb, decompress it into the destination page(s). 
*/ - unsigned int prev_cur_page = cur_page; - - ntfs_debug("Found compressed compression block."); - err = ntfs_decompress(pages, completed_pages, &cur_page, - &cur_ofs, cb_max_page, cb_max_ofs, xpage, - &xpage_done, cb_pos, cb_size - (cb_pos - cb), - i_size, initialized_size); - /* - * We can sleep from now on, lock already dropped by - * ntfs_decompress(). - */ - if (err) { - ntfs_error(vol->sb, "ntfs_decompress() failed in inode " - "0x%lx with error code %i. Skipping " - "this compression block.", - ni->mft_no, -err); - /* Release the unfinished pages. */ - for (; prev_cur_page < cur_page; prev_cur_page++) { - page = pages[prev_cur_page]; - if (page) { - flush_dcache_page(page); - kunmap(page); - unlock_page(page); - if (prev_cur_page != xpage) - put_page(page); - pages[prev_cur_page] = NULL; - } - } - } - } - - /* Release the buffer heads. */ - for (i = 0; i < nr_bhs; i++) - brelse(bhs[i]); - - /* Do we have more work to do? */ - if (nr_cbs) - goto do_next_cb; - - /* We no longer need the list of buffer heads. */ - kfree(bhs); - - /* Clean up if we have any pages left. Should never happen. */ - for (cur_page = 0; cur_page < max_page; cur_page++) { - page = pages[cur_page]; - if (page) { - ntfs_error(vol->sb, "Still have pages left! " - "Terminating them with extreme " - "prejudice. Inode 0x%lx, page index " - "0x%lx.", ni->mft_no, page->index); - flush_dcache_page(page); - kunmap(page); - unlock_page(page); - if (cur_page != xpage) - put_page(page); - pages[cur_page] = NULL; - } - } - - /* We no longer need the list of pages. */ - kfree(pages); - kfree(completed_pages); - - /* If we have completed the requested page, we return success. */ - if (likely(xpage_done)) - return 0; - - ntfs_debug("Failed. Returning error code %s.", err == -EOVERFLOW ? - "EOVERFLOW" : (!err ? "EIO" : "unknown error")); - return err < 0 ? err : -EIO; - -read_err: - ntfs_error(vol->sb, "IO error while reading compressed data."); - /* Release the buffer heads. */ - for (i = 0; i < nr_bhs; i++) - brelse(bhs[i]); - goto err_out; - -map_rl_err: - ntfs_error(vol->sb, "ntfs_map_runlist() failed. Cannot read " - "compression block."); - goto err_out; - -rl_err: - up_read(&ni->runlist.lock); - ntfs_error(vol->sb, "ntfs_rl_vcn_to_lcn() failed. Cannot read " - "compression block."); - goto err_out; - -getblk_err: - up_read(&ni->runlist.lock); - ntfs_error(vol->sb, "getblk() failed. Cannot read compression block."); - -err_out: - kfree(bhs); - for (i = cur_page; i < max_page; i++) { - page = pages[i]; - if (page) { - flush_dcache_page(page); - kunmap(page); - unlock_page(page); - if (i != xpage) - put_page(page); - } - } - kfree(pages); - kfree(completed_pages); - return -EIO; -} diff --git a/fs/ntfs/debug.c b/fs/ntfs/debug.c deleted file mode 100644 index a3c1c5656f8f6..0000000000000 --- a/fs/ntfs/debug.c +++ /dev/null @@ -1,159 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * debug.c - NTFS kernel debug support. Part of the Linux-NTFS project. - * - * Copyright (c) 2001-2004 Anton Altaparmakov - */ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include "debug.h" - -/** - * __ntfs_warning - output a warning to the syslog - * @function: name of function outputting the warning - * @sb: super block of mounted ntfs filesystem - * @fmt: warning string containing format specifications - * @...: a variable number of arguments specified in @fmt - * - * Outputs a warning to the syslog for the mounted ntfs filesystem described - * by @sb. - * - * @fmt and the corresponding @... 
is printf style format string containing - * the warning string and the corresponding format arguments, respectively. - * - * @function is the name of the function from which __ntfs_warning is being - * called. - * - * Note, you should be using debug.h::ntfs_warning(@sb, @fmt, @...) instead - * as this provides the @function parameter automatically. - */ -void __ntfs_warning(const char *function, const struct super_block *sb, - const char *fmt, ...) -{ - struct va_format vaf; - va_list args; - int flen = 0; - -#ifndef DEBUG - if (!printk_ratelimit()) - return; -#endif - if (function) - flen = strlen(function); - va_start(args, fmt); - vaf.fmt = fmt; - vaf.va = &args; - if (sb) - pr_warn("(device %s): %s(): %pV\n", - sb->s_id, flen ? function : "", &vaf); - else - pr_warn("%s(): %pV\n", flen ? function : "", &vaf); - va_end(args); -} - -/** - * __ntfs_error - output an error to the syslog - * @function: name of function outputting the error - * @sb: super block of mounted ntfs filesystem - * @fmt: error string containing format specifications - * @...: a variable number of arguments specified in @fmt - * - * Outputs an error to the syslog for the mounted ntfs filesystem described - * by @sb. - * - * @fmt and the corresponding @... is printf style format string containing - * the error string and the corresponding format arguments, respectively. - * - * @function is the name of the function from which __ntfs_error is being - * called. - * - * Note, you should be using debug.h::ntfs_error(@sb, @fmt, @...) instead - * as this provides the @function parameter automatically. - */ -void __ntfs_error(const char *function, const struct super_block *sb, - const char *fmt, ...) -{ - struct va_format vaf; - va_list args; - int flen = 0; - -#ifndef DEBUG - if (!printk_ratelimit()) - return; -#endif - if (function) - flen = strlen(function); - va_start(args, fmt); - vaf.fmt = fmt; - vaf.va = &args; - if (sb) - pr_err("(device %s): %s(): %pV\n", - sb->s_id, flen ? function : "", &vaf); - else - pr_err("%s(): %pV\n", flen ? function : "", &vaf); - va_end(args); -} - -#ifdef DEBUG - -/* If 1, output debug messages, and if 0, don't. */ -int debug_msgs = 0; - -void __ntfs_debug(const char *file, int line, const char *function, - const char *fmt, ...) -{ - struct va_format vaf; - va_list args; - int flen = 0; - - if (!debug_msgs) - return; - if (function) - flen = strlen(function); - va_start(args, fmt); - vaf.fmt = fmt; - vaf.va = &args; - pr_debug("(%s, %d): %s(): %pV", file, line, flen ? function : "", &vaf); - va_end(args); -} - -/* Dump a runlist. Caller has to provide synchronisation for @rl. */ -void ntfs_debug_dump_runlist(const runlist_element *rl) -{ - int i; - const char *lcn_str[5] = { "LCN_HOLE ", "LCN_RL_NOT_MAPPED", - "LCN_ENOENT ", "LCN_unknown " }; - - if (!debug_msgs) - return; - pr_debug("Dumping runlist (values in hex):\n"); - if (!rl) { - pr_debug("Run list not present.\n"); - return; - } - pr_debug("VCN LCN Run length\n"); - for (i = 0; ; i++) { - LCN lcn = (rl + i)->lcn; - - if (lcn < (LCN)0) { - int index = -lcn - 1; - - if (index > -LCN_ENOENT - 1) - index = 3; - pr_debug("%-16Lx %s %-16Lx%s\n", - (long long)(rl + i)->vcn, lcn_str[index], - (long long)(rl + i)->length, - (rl + i)->length ? "" : - " (runlist end)"); - } else - pr_debug("%-16Lx %-16Lx %-16Lx%s\n", - (long long)(rl + i)->vcn, - (long long)(rl + i)->lcn, - (long long)(rl + i)->length, - (rl + i)->length ? 
"" : - " (runlist end)"); - if (!(rl + i)->length) - break; - } -} - -#endif diff --git a/fs/ntfs/debug.h b/fs/ntfs/debug.h deleted file mode 100644 index 6fdef388f1294..0000000000000 --- a/fs/ntfs/debug.h +++ /dev/null @@ -1,57 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * debug.h - NTFS kernel debug support. Part of the Linux-NTFS project. - * - * Copyright (c) 2001-2004 Anton Altaparmakov - */ - -#ifndef _LINUX_NTFS_DEBUG_H -#define _LINUX_NTFS_DEBUG_H - -#include - -#include "runlist.h" - -#ifdef DEBUG - -extern int debug_msgs; - -extern __printf(4, 5) -void __ntfs_debug(const char *file, int line, const char *function, - const char *format, ...); -/** - * ntfs_debug - write a debug level message to syslog - * @f: a printf format string containing the message - * @...: the variables to substitute into @f - * - * ntfs_debug() writes a DEBUG level message to the syslog but only if the - * driver was compiled with -DDEBUG. Otherwise, the call turns into a NOP. - */ -#define ntfs_debug(f, a...) \ - __ntfs_debug(__FILE__, __LINE__, __func__, f, ##a) - -extern void ntfs_debug_dump_runlist(const runlist_element *rl); - -#else /* !DEBUG */ - -#define ntfs_debug(fmt, ...) \ -do { \ - if (0) \ - no_printk(fmt, ##__VA_ARGS__); \ -} while (0) - -#define ntfs_debug_dump_runlist(rl) do {} while (0) - -#endif /* !DEBUG */ - -extern __printf(3, 4) -void __ntfs_warning(const char *function, const struct super_block *sb, - const char *fmt, ...); -#define ntfs_warning(sb, f, a...) __ntfs_warning(__func__, sb, f, ##a) - -extern __printf(3, 4) -void __ntfs_error(const char *function, const struct super_block *sb, - const char *fmt, ...); -#define ntfs_error(sb, f, a...) __ntfs_error(__func__, sb, f, ##a) - -#endif /* _LINUX_NTFS_DEBUG_H */ diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c deleted file mode 100644 index 629723a8d7125..0000000000000 --- a/fs/ntfs/dir.c +++ /dev/null @@ -1,1540 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * dir.c - NTFS kernel directory operations. Part of the Linux-NTFS project. - * - * Copyright (c) 2001-2007 Anton Altaparmakov - * Copyright (c) 2002 Richard Russon - */ - -#include -#include -#include - -#include "dir.h" -#include "aops.h" -#include "attrib.h" -#include "mft.h" -#include "debug.h" -#include "ntfs.h" - -/* - * The little endian Unicode string $I30 as a global constant. - */ -ntfschar I30[5] = { cpu_to_le16('$'), cpu_to_le16('I'), - cpu_to_le16('3'), cpu_to_le16('0'), 0 }; - -/** - * ntfs_lookup_inode_by_name - find an inode in a directory given its name - * @dir_ni: ntfs inode of the directory in which to search for the name - * @uname: Unicode name for which to search in the directory - * @uname_len: length of the name @uname in Unicode characters - * @res: return the found file name if necessary (see below) - * - * Look for an inode with name @uname in the directory with inode @dir_ni. - * ntfs_lookup_inode_by_name() walks the contents of the directory looking for - * the Unicode name. If the name is found in the directory, the corresponding - * inode number (>= 0) is returned as a mft reference in cpu format, i.e. it - * is a 64-bit number containing the sequence number. - * - * On error, a negative value is returned corresponding to the error code. In - * particular if the inode is not found -ENOENT is returned. Note that you - * can't just check the return value for being negative, you have to check the - * inode number for being negative which you can extract using MREC(return - * value). 
- * - * Note, @uname_len does not include the (optional) terminating NULL character. - * - * Note, we look for a case sensitive match first but we also look for a case - * insensitive match at the same time. If we find a case insensitive match, we - * save that for the case that we don't find an exact match, where we return - * the case insensitive match and setup @res (which we allocate!) with the mft - * reference, the file name type, length and with a copy of the little endian - * Unicode file name itself. If we match a file name which is in the DOS name - * space, we only return the mft reference and file name type in @res. - * ntfs_lookup() then uses this to find the long file name in the inode itself. - * This is to avoid polluting the dcache with short file names. We want them to - * work but we don't care for how quickly one can access them. This also fixes - * the dcache aliasing issues. - * - * Locking: - Caller must hold i_mutex on the directory. - * - Each page cache page in the index allocation mapping must be - * locked whilst being accessed otherwise we may find a corrupt - * page due to it being under ->writepage at the moment which - * applies the mst protection fixups before writing out and then - * removes them again after the write is complete after which it - * unlocks the page. - */ -MFT_REF ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, const ntfschar *uname, - const int uname_len, ntfs_name **res) -{ - ntfs_volume *vol = dir_ni->vol; - struct super_block *sb = vol->sb; - MFT_RECORD *m; - INDEX_ROOT *ir; - INDEX_ENTRY *ie; - INDEX_ALLOCATION *ia; - u8 *index_end; - u64 mref; - ntfs_attr_search_ctx *ctx; - int err, rc; - VCN vcn, old_vcn; - struct address_space *ia_mapping; - struct page *page; - u8 *kaddr; - ntfs_name *name = NULL; - - BUG_ON(!S_ISDIR(VFS_I(dir_ni)->i_mode)); - BUG_ON(NInoAttr(dir_ni)); - /* Get hold of the mft record for the directory. */ - m = map_mft_record(dir_ni); - if (IS_ERR(m)) { - ntfs_error(sb, "map_mft_record() failed with error code %ld.", - -PTR_ERR(m)); - return ERR_MREF(PTR_ERR(m)); - } - ctx = ntfs_attr_get_search_ctx(dir_ni, m); - if (unlikely(!ctx)) { - err = -ENOMEM; - goto err_out; - } - /* Find the index root attribute in the mft record. */ - err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL, - 0, ctx); - if (unlikely(err)) { - if (err == -ENOENT) { - ntfs_error(sb, "Index root attribute missing in " - "directory inode 0x%lx.", - dir_ni->mft_no); - err = -EIO; - } - goto err_out; - } - /* Get to the index root value (it's been verified in read_inode). */ - ir = (INDEX_ROOT*)((u8*)ctx->attr + - le16_to_cpu(ctx->attr->data.resident.value_offset)); - index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length); - /* The first index entry. */ - ie = (INDEX_ENTRY*)((u8*)&ir->index + - le32_to_cpu(ir->index.entries_offset)); - /* - * Loop until we exceed valid memory (corruption case) or until we - * reach the last entry. - */ - for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { - /* Bounds checks. */ - if ((u8*)ie < (u8*)ctx->mrec || (u8*)ie + - sizeof(INDEX_ENTRY_HEADER) > index_end || - (u8*)ie + le16_to_cpu(ie->key_length) > - index_end) - goto dir_err_out; - /* - * The last entry cannot contain a name. It can however contain - * a pointer to a child node in the B+tree so we just break out. - */ - if (ie->flags & INDEX_ENTRY_END) - break; - /* - * We perform a case sensitive comparison and if that matches - * we are done and return the mft reference of the inode (i.e. 
- * the inode number together with the sequence number for - * consistency checking). We convert it to cpu format before - * returning. - */ - if (ntfs_are_names_equal(uname, uname_len, - (ntfschar*)&ie->key.file_name.file_name, - ie->key.file_name.file_name_length, - CASE_SENSITIVE, vol->upcase, vol->upcase_len)) { -found_it: - /* - * We have a perfect match, so we don't need to care - * about having matched imperfectly before, so we can - * free name and set *res to NULL. - * However, if the perfect match is a short file name, - * we need to signal this through *res, so that - * ntfs_lookup() can fix dcache aliasing issues. - * As an optimization we just reuse an existing - * allocation of *res. - */ - if (ie->key.file_name.file_name_type == FILE_NAME_DOS) { - if (!name) { - name = kmalloc(sizeof(ntfs_name), - GFP_NOFS); - if (!name) { - err = -ENOMEM; - goto err_out; - } - } - name->mref = le64_to_cpu( - ie->data.dir.indexed_file); - name->type = FILE_NAME_DOS; - name->len = 0; - *res = name; - } else { - kfree(name); - *res = NULL; - } - mref = le64_to_cpu(ie->data.dir.indexed_file); - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(dir_ni); - return mref; - } - /* - * For a case insensitive mount, we also perform a case - * insensitive comparison (provided the file name is not in the - * POSIX namespace). If the comparison matches, and the name is - * in the WIN32 namespace, we cache the filename in *res so - * that the caller, ntfs_lookup(), can work on it. If the - * comparison matches, and the name is in the DOS namespace, we - * only cache the mft reference and the file name type (we set - * the name length to zero for simplicity). - */ - if (!NVolCaseSensitive(vol) && - ie->key.file_name.file_name_type && - ntfs_are_names_equal(uname, uname_len, - (ntfschar*)&ie->key.file_name.file_name, - ie->key.file_name.file_name_length, - IGNORE_CASE, vol->upcase, vol->upcase_len)) { - int name_size = sizeof(ntfs_name); - u8 type = ie->key.file_name.file_name_type; - u8 len = ie->key.file_name.file_name_length; - - /* Only one case insensitive matching name allowed. */ - if (name) { - ntfs_error(sb, "Found already allocated name " - "in phase 1. Please run chkdsk " - "and if that doesn't find any " - "errors please report you saw " - "this message to " - "linux-ntfs-dev@lists." - "sourceforge.net."); - goto dir_err_out; - } - - if (type != FILE_NAME_DOS) - name_size += len * sizeof(ntfschar); - name = kmalloc(name_size, GFP_NOFS); - if (!name) { - err = -ENOMEM; - goto err_out; - } - name->mref = le64_to_cpu(ie->data.dir.indexed_file); - name->type = type; - if (type != FILE_NAME_DOS) { - name->len = len; - memcpy(name->name, ie->key.file_name.file_name, - len * sizeof(ntfschar)); - } else - name->len = 0; - *res = name; - } - /* - * Not a perfect match, need to do full blown collation so we - * know which way in the B+tree we have to go. - */ - rc = ntfs_collate_names(uname, uname_len, - (ntfschar*)&ie->key.file_name.file_name, - ie->key.file_name.file_name_length, 1, - IGNORE_CASE, vol->upcase, vol->upcase_len); - /* - * If uname collates before the name of the current entry, there - * is definitely no such name in this index but we might need to - * descend into the B+tree so we just break out of the loop. - */ - if (rc == -1) - break; - /* The names are not equal, continue the search. */ - if (rc) - continue; - /* - * Names match with case insensitive comparison, now try the - * case sensitive comparison, which is required for proper - * collation. 
- */ - rc = ntfs_collate_names(uname, uname_len, - (ntfschar*)&ie->key.file_name.file_name, - ie->key.file_name.file_name_length, 1, - CASE_SENSITIVE, vol->upcase, vol->upcase_len); - if (rc == -1) - break; - if (rc) - continue; - /* - * Perfect match, this will never happen as the - * ntfs_are_names_equal() call will have gotten a match but we - * still treat it correctly. - */ - goto found_it; - } - /* - * We have finished with this index without success. Check for the - * presence of a child node and if not present return -ENOENT, unless - * we have got a matching name cached in name in which case return the - * mft reference associated with it. - */ - if (!(ie->flags & INDEX_ENTRY_NODE)) { - if (name) { - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(dir_ni); - return name->mref; - } - ntfs_debug("Entry not found."); - err = -ENOENT; - goto err_out; - } /* Child node present, descend into it. */ - /* Consistency check: Verify that an index allocation exists. */ - if (!NInoIndexAllocPresent(dir_ni)) { - ntfs_error(sb, "No index allocation attribute but index entry " - "requires one. Directory inode 0x%lx is " - "corrupt or driver bug.", dir_ni->mft_no); - goto err_out; - } - /* Get the starting vcn of the index_block holding the child node. */ - vcn = sle64_to_cpup((sle64*)((u8*)ie + le16_to_cpu(ie->length) - 8)); - ia_mapping = VFS_I(dir_ni)->i_mapping; - /* - * We are done with the index root and the mft record. Release them, - * otherwise we deadlock with ntfs_map_page(). - */ - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(dir_ni); - m = NULL; - ctx = NULL; -descend_into_child_node: - /* - * Convert vcn to index into the index allocation attribute in units - * of PAGE_SIZE and map the page cache page, reading it from - * disk if necessary. - */ - page = ntfs_map_page(ia_mapping, vcn << - dir_ni->itype.index.vcn_size_bits >> PAGE_SHIFT); - if (IS_ERR(page)) { - ntfs_error(sb, "Failed to map directory index page, error %ld.", - -PTR_ERR(page)); - err = PTR_ERR(page); - goto err_out; - } - lock_page(page); - kaddr = (u8*)page_address(page); -fast_descend_into_child_node: - /* Get to the index allocation block. */ - ia = (INDEX_ALLOCATION*)(kaddr + ((vcn << - dir_ni->itype.index.vcn_size_bits) & ~PAGE_MASK)); - /* Bounds checks. */ - if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_SIZE) { - ntfs_error(sb, "Out of bounds check failed. Corrupt directory " - "inode 0x%lx or driver bug.", dir_ni->mft_no); - goto unm_err_out; - } - /* Catch multi sector transfer fixup errors. */ - if (unlikely(!ntfs_is_indx_record(ia->magic))) { - ntfs_error(sb, "Directory index record with vcn 0x%llx is " - "corrupt. Corrupt inode 0x%lx. Run chkdsk.", - (unsigned long long)vcn, dir_ni->mft_no); - goto unm_err_out; - } - if (sle64_to_cpu(ia->index_block_vcn) != vcn) { - ntfs_error(sb, "Actual VCN (0x%llx) of index buffer is " - "different from expected VCN (0x%llx). " - "Directory inode 0x%lx is corrupt or driver " - "bug.", (unsigned long long) - sle64_to_cpu(ia->index_block_vcn), - (unsigned long long)vcn, dir_ni->mft_no); - goto unm_err_out; - } - if (le32_to_cpu(ia->index.allocated_size) + 0x18 != - dir_ni->itype.index.block_size) { - ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode " - "0x%lx has a size (%u) differing from the " - "directory specified size (%u). 
Directory " - "inode is corrupt or driver bug.", - (unsigned long long)vcn, dir_ni->mft_no, - le32_to_cpu(ia->index.allocated_size) + 0x18, - dir_ni->itype.index.block_size); - goto unm_err_out; - } - index_end = (u8*)ia + dir_ni->itype.index.block_size; - if (index_end > kaddr + PAGE_SIZE) { - ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode " - "0x%lx crosses page boundary. Impossible! " - "Cannot access! This is probably a bug in the " - "driver.", (unsigned long long)vcn, - dir_ni->mft_no); - goto unm_err_out; - } - index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length); - if (index_end > (u8*)ia + dir_ni->itype.index.block_size) { - ntfs_error(sb, "Size of index buffer (VCN 0x%llx) of directory " - "inode 0x%lx exceeds maximum size.", - (unsigned long long)vcn, dir_ni->mft_no); - goto unm_err_out; - } - /* The first index entry. */ - ie = (INDEX_ENTRY*)((u8*)&ia->index + - le32_to_cpu(ia->index.entries_offset)); - /* - * Iterate similar to above big loop but applied to index buffer, thus - * loop until we exceed valid memory (corruption case) or until we - * reach the last entry. - */ - for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { - /* Bounds check. */ - if ((u8*)ie < (u8*)ia || (u8*)ie + - sizeof(INDEX_ENTRY_HEADER) > index_end || - (u8*)ie + le16_to_cpu(ie->key_length) > - index_end) { - ntfs_error(sb, "Index entry out of bounds in " - "directory inode 0x%lx.", - dir_ni->mft_no); - goto unm_err_out; - } - /* - * The last entry cannot contain a name. It can however contain - * a pointer to a child node in the B+tree so we just break out. - */ - if (ie->flags & INDEX_ENTRY_END) - break; - /* - * We perform a case sensitive comparison and if that matches - * we are done and return the mft reference of the inode (i.e. - * the inode number together with the sequence number for - * consistency checking). We convert it to cpu format before - * returning. - */ - if (ntfs_are_names_equal(uname, uname_len, - (ntfschar*)&ie->key.file_name.file_name, - ie->key.file_name.file_name_length, - CASE_SENSITIVE, vol->upcase, vol->upcase_len)) { -found_it2: - /* - * We have a perfect match, so we don't need to care - * about having matched imperfectly before, so we can - * free name and set *res to NULL. - * However, if the perfect match is a short file name, - * we need to signal this through *res, so that - * ntfs_lookup() can fix dcache aliasing issues. - * As an optimization we just reuse an existing - * allocation of *res. - */ - if (ie->key.file_name.file_name_type == FILE_NAME_DOS) { - if (!name) { - name = kmalloc(sizeof(ntfs_name), - GFP_NOFS); - if (!name) { - err = -ENOMEM; - goto unm_err_out; - } - } - name->mref = le64_to_cpu( - ie->data.dir.indexed_file); - name->type = FILE_NAME_DOS; - name->len = 0; - *res = name; - } else { - kfree(name); - *res = NULL; - } - mref = le64_to_cpu(ie->data.dir.indexed_file); - unlock_page(page); - ntfs_unmap_page(page); - return mref; - } - /* - * For a case insensitive mount, we also perform a case - * insensitive comparison (provided the file name is not in the - * POSIX namespace). If the comparison matches, and the name is - * in the WIN32 namespace, we cache the filename in *res so - * that the caller, ntfs_lookup(), can work on it. If the - * comparison matches, and the name is in the DOS namespace, we - * only cache the mft reference and the file name type (we set - * the name length to zero for simplicity). 
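
A user-space sketch of the *res caching rule spelled out in the quoted comment, modelled on the ntfs_name structure from dir.h (a flexible trailing name array); the types and the helper name are stand-ins, not the driver's API.

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

typedef uint16_t ntfschar_sketch;	/* little endian 2-byte Unicode */

struct ntfs_name_sketch {
	uint64_t mref;
	uint8_t type;			/* e.g. FILE_NAME_WIN32 or FILE_NAME_DOS */
	uint8_t len;			/* 0 for DOS name space matches */
	ntfschar_sketch name[];
};

static struct ntfs_name_sketch *cache_ci_match(uint64_t mref, uint8_t type,
		const ntfschar_sketch *uname, uint8_t len, int is_dos)
{
	size_t size = sizeof(struct ntfs_name_sketch);
	struct ntfs_name_sketch *n;

	if (!is_dos)
		size += len * sizeof(ntfschar_sketch);	/* room for the name */
	n = malloc(size);
	if (!n)
		return NULL;
	n->mref = mref;
	n->type = type;
	if (is_dos) {
		n->len = 0;	/* caller rereads the long name from the inode */
	} else {
		n->len = len;
		memcpy(n->name, uname, len * sizeof(ntfschar_sketch));
	}
	return n;
}
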
- */ - if (!NVolCaseSensitive(vol) && - ie->key.file_name.file_name_type && - ntfs_are_names_equal(uname, uname_len, - (ntfschar*)&ie->key.file_name.file_name, - ie->key.file_name.file_name_length, - IGNORE_CASE, vol->upcase, vol->upcase_len)) { - int name_size = sizeof(ntfs_name); - u8 type = ie->key.file_name.file_name_type; - u8 len = ie->key.file_name.file_name_length; - - /* Only one case insensitive matching name allowed. */ - if (name) { - ntfs_error(sb, "Found already allocated name " - "in phase 2. Please run chkdsk " - "and if that doesn't find any " - "errors please report you saw " - "this message to " - "linux-ntfs-dev@lists." - "sourceforge.net."); - unlock_page(page); - ntfs_unmap_page(page); - goto dir_err_out; - } - - if (type != FILE_NAME_DOS) - name_size += len * sizeof(ntfschar); - name = kmalloc(name_size, GFP_NOFS); - if (!name) { - err = -ENOMEM; - goto unm_err_out; - } - name->mref = le64_to_cpu(ie->data.dir.indexed_file); - name->type = type; - if (type != FILE_NAME_DOS) { - name->len = len; - memcpy(name->name, ie->key.file_name.file_name, - len * sizeof(ntfschar)); - } else - name->len = 0; - *res = name; - } - /* - * Not a perfect match, need to do full blown collation so we - * know which way in the B+tree we have to go. - */ - rc = ntfs_collate_names(uname, uname_len, - (ntfschar*)&ie->key.file_name.file_name, - ie->key.file_name.file_name_length, 1, - IGNORE_CASE, vol->upcase, vol->upcase_len); - /* - * If uname collates before the name of the current entry, there - * is definitely no such name in this index but we might need to - * descend into the B+tree so we just break out of the loop. - */ - if (rc == -1) - break; - /* The names are not equal, continue the search. */ - if (rc) - continue; - /* - * Names match with case insensitive comparison, now try the - * case sensitive comparison, which is required for proper - * collation. - */ - rc = ntfs_collate_names(uname, uname_len, - (ntfschar*)&ie->key.file_name.file_name, - ie->key.file_name.file_name_length, 1, - CASE_SENSITIVE, vol->upcase, vol->upcase_len); - if (rc == -1) - break; - if (rc) - continue; - /* - * Perfect match, this will never happen as the - * ntfs_are_names_equal() call will have gotten a match but we - * still treat it correctly. - */ - goto found_it2; - } - /* - * We have finished with this index buffer without success. Check for - * the presence of a child node. - */ - if (ie->flags & INDEX_ENTRY_NODE) { - if ((ia->index.flags & NODE_MASK) == LEAF_NODE) { - ntfs_error(sb, "Index entry with child node found in " - "a leaf node in directory inode 0x%lx.", - dir_ni->mft_no); - goto unm_err_out; - } - /* Child node present, descend into it. */ - old_vcn = vcn; - vcn = sle64_to_cpup((sle64*)((u8*)ie + - le16_to_cpu(ie->length) - 8)); - if (vcn >= 0) { - /* If vcn is in the same page cache page as old_vcn we - * recycle the mapped page. */ - if (old_vcn << vol->cluster_size_bits >> - PAGE_SHIFT == vcn << - vol->cluster_size_bits >> - PAGE_SHIFT) - goto fast_descend_into_child_node; - unlock_page(page); - ntfs_unmap_page(page); - goto descend_into_child_node; - } - ntfs_error(sb, "Negative child node vcn in directory inode " - "0x%lx.", dir_ni->mft_no); - goto unm_err_out; - } - /* - * No child node present, return -ENOENT, unless we have got a matching - * name cached in name in which case return the mft reference - * associated with it. 
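
A sketch of the descent arithmetic used above, with illustrative constants: an entry with INDEX_ENTRY_NODE set keeps the child block's VCN in its last eight bytes, and that VCN is converted into a page cache index plus an offset inside the page.

#include <stdint.h>
#include <string.h>

#define PAGE_SHIFT_SK	12
#define PAGE_SIZE_SK	(1UL << PAGE_SHIFT_SK)

static int64_t child_vcn(const uint8_t *ie, uint16_t ie_length)
{
	int64_t vcn;

	memcpy(&vcn, ie + ie_length - 8, sizeof(vcn));	/* sle64 on disk */
	return vcn;			/* assumes a little endian host here */
}

static unsigned long vcn_to_page_index(int64_t vcn, unsigned vcn_size_bits)
{
	return (unsigned long)(vcn << vcn_size_bits >> PAGE_SHIFT_SK);
}

static unsigned long vcn_to_page_offset(int64_t vcn, unsigned vcn_size_bits)
{
	return (unsigned long)(vcn << vcn_size_bits) & (PAGE_SIZE_SK - 1);
}
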
- */ - if (name) { - unlock_page(page); - ntfs_unmap_page(page); - return name->mref; - } - ntfs_debug("Entry not found."); - err = -ENOENT; -unm_err_out: - unlock_page(page); - ntfs_unmap_page(page); -err_out: - if (!err) - err = -EIO; - if (ctx) - ntfs_attr_put_search_ctx(ctx); - if (m) - unmap_mft_record(dir_ni); - if (name) { - kfree(name); - *res = NULL; - } - return ERR_MREF(err); -dir_err_out: - ntfs_error(sb, "Corrupt directory. Aborting lookup."); - goto err_out; -} - -#if 0 - -// TODO: (AIA) -// The algorithm embedded in this code will be required for the time when we -// want to support adding of entries to directories, where we require correct -// collation of file names in order not to cause corruption of the filesystem. - -/** - * ntfs_lookup_inode_by_name - find an inode in a directory given its name - * @dir_ni: ntfs inode of the directory in which to search for the name - * @uname: Unicode name for which to search in the directory - * @uname_len: length of the name @uname in Unicode characters - * - * Look for an inode with name @uname in the directory with inode @dir_ni. - * ntfs_lookup_inode_by_name() walks the contents of the directory looking for - * the Unicode name. If the name is found in the directory, the corresponding - * inode number (>= 0) is returned as a mft reference in cpu format, i.e. it - * is a 64-bit number containing the sequence number. - * - * On error, a negative value is returned corresponding to the error code. In - * particular if the inode is not found -ENOENT is returned. Note that you - * can't just check the return value for being negative, you have to check the - * inode number for being negative which you can extract using MREC(return - * value). - * - * Note, @uname_len does not include the (optional) terminating NULL character. - */ -u64 ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, const ntfschar *uname, - const int uname_len) -{ - ntfs_volume *vol = dir_ni->vol; - struct super_block *sb = vol->sb; - MFT_RECORD *m; - INDEX_ROOT *ir; - INDEX_ENTRY *ie; - INDEX_ALLOCATION *ia; - u8 *index_end; - u64 mref; - ntfs_attr_search_ctx *ctx; - int err, rc; - IGNORE_CASE_BOOL ic; - VCN vcn, old_vcn; - struct address_space *ia_mapping; - struct page *page; - u8 *kaddr; - - /* Get hold of the mft record for the directory. */ - m = map_mft_record(dir_ni); - if (IS_ERR(m)) { - ntfs_error(sb, "map_mft_record() failed with error code %ld.", - -PTR_ERR(m)); - return ERR_MREF(PTR_ERR(m)); - } - ctx = ntfs_attr_get_search_ctx(dir_ni, m); - if (!ctx) { - err = -ENOMEM; - goto err_out; - } - /* Find the index root attribute in the mft record. */ - err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL, - 0, ctx); - if (unlikely(err)) { - if (err == -ENOENT) { - ntfs_error(sb, "Index root attribute missing in " - "directory inode 0x%lx.", - dir_ni->mft_no); - err = -EIO; - } - goto err_out; - } - /* Get to the index root value (it's been verified in read_inode). */ - ir = (INDEX_ROOT*)((u8*)ctx->attr + - le16_to_cpu(ctx->attr->data.resident.value_offset)); - index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length); - /* The first index entry. */ - ie = (INDEX_ENTRY*)((u8*)&ir->index + - le32_to_cpu(ir->index.entries_offset)); - /* - * Loop until we exceed valid memory (corruption case) or until we - * reach the last entry. - */ - for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { - /* Bounds checks. 
*/ - if ((u8*)ie < (u8*)ctx->mrec || (u8*)ie + - sizeof(INDEX_ENTRY_HEADER) > index_end || - (u8*)ie + le16_to_cpu(ie->key_length) > - index_end) - goto dir_err_out; - /* - * The last entry cannot contain a name. It can however contain - * a pointer to a child node in the B+tree so we just break out. - */ - if (ie->flags & INDEX_ENTRY_END) - break; - /* - * If the current entry has a name type of POSIX, the name is - * case sensitive and not otherwise. This has the effect of us - * not being able to access any POSIX file names which collate - * after the non-POSIX one when they only differ in case, but - * anyone doing screwy stuff like that deserves to burn in - * hell... Doing that kind of stuff on NT4 actually causes - * corruption on the partition even when using SP6a and Linux - * is not involved at all. - */ - ic = ie->key.file_name.file_name_type ? IGNORE_CASE : - CASE_SENSITIVE; - /* - * If the names match perfectly, we are done and return the - * mft reference of the inode (i.e. the inode number together - * with the sequence number for consistency checking. We - * convert it to cpu format before returning. - */ - if (ntfs_are_names_equal(uname, uname_len, - (ntfschar*)&ie->key.file_name.file_name, - ie->key.file_name.file_name_length, ic, - vol->upcase, vol->upcase_len)) { -found_it: - mref = le64_to_cpu(ie->data.dir.indexed_file); - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(dir_ni); - return mref; - } - /* - * Not a perfect match, need to do full blown collation so we - * know which way in the B+tree we have to go. - */ - rc = ntfs_collate_names(uname, uname_len, - (ntfschar*)&ie->key.file_name.file_name, - ie->key.file_name.file_name_length, 1, - IGNORE_CASE, vol->upcase, vol->upcase_len); - /* - * If uname collates before the name of the current entry, there - * is definitely no such name in this index but we might need to - * descend into the B+tree so we just break out of the loop. - */ - if (rc == -1) - break; - /* The names are not equal, continue the search. */ - if (rc) - continue; - /* - * Names match with case insensitive comparison, now try the - * case sensitive comparison, which is required for proper - * collation. - */ - rc = ntfs_collate_names(uname, uname_len, - (ntfschar*)&ie->key.file_name.file_name, - ie->key.file_name.file_name_length, 1, - CASE_SENSITIVE, vol->upcase, vol->upcase_len); - if (rc == -1) - break; - if (rc) - continue; - /* - * Perfect match, this will never happen as the - * ntfs_are_names_equal() call will have gotten a match but we - * still treat it correctly. - */ - goto found_it; - } - /* - * We have finished with this index without success. Check for the - * presence of a child node. - */ - if (!(ie->flags & INDEX_ENTRY_NODE)) { - /* No child node, return -ENOENT. */ - err = -ENOENT; - goto err_out; - } /* Child node present, descend into it. */ - /* Consistency check: Verify that an index allocation exists. */ - if (!NInoIndexAllocPresent(dir_ni)) { - ntfs_error(sb, "No index allocation attribute but index entry " - "requires one. Directory inode 0x%lx is " - "corrupt or driver bug.", dir_ni->mft_no); - goto err_out; - } - /* Get the starting vcn of the index_block holding the child node. */ - vcn = sle64_to_cpup((u8*)ie + le16_to_cpu(ie->length) - 8); - ia_mapping = VFS_I(dir_ni)->i_mapping; - /* - * We are done with the index root and the mft record. Release them, - * otherwise we deadlock with ntfs_map_page(). 
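
A sketch of the mft reference convention described in the function comment above; the 48/16 bit split matches the on-disk MFT_REF layout, while the macro and helper names below are stand-ins for the driver's MREF()/MSEQNO()/IS_ERR_MREF() family.

#include <stdint.h>

#define MREF_SK(m)	((unsigned long)((m) & 0xffffffffffffULL))	/* record number */
#define MSEQNO_SK(m)	((uint16_t)((m) >> 48))				/* sequence number */

/* -errno is returned sign-extended, so bit 47 of the record field is set. */
static int is_err_mref_sk(uint64_t m)
{
	return (m & 0x800000000000ULL) != 0;
}

static int mref_err_sk(uint64_t m)
{
	return (int)(int64_t)m;		/* recover the negative errno */
}
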
- */ - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(dir_ni); - m = NULL; - ctx = NULL; -descend_into_child_node: - /* - * Convert vcn to index into the index allocation attribute in units - * of PAGE_SIZE and map the page cache page, reading it from - * disk if necessary. - */ - page = ntfs_map_page(ia_mapping, vcn << - dir_ni->itype.index.vcn_size_bits >> PAGE_SHIFT); - if (IS_ERR(page)) { - ntfs_error(sb, "Failed to map directory index page, error %ld.", - -PTR_ERR(page)); - err = PTR_ERR(page); - goto err_out; - } - lock_page(page); - kaddr = (u8*)page_address(page); -fast_descend_into_child_node: - /* Get to the index allocation block. */ - ia = (INDEX_ALLOCATION*)(kaddr + ((vcn << - dir_ni->itype.index.vcn_size_bits) & ~PAGE_MASK)); - /* Bounds checks. */ - if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_SIZE) { - ntfs_error(sb, "Out of bounds check failed. Corrupt directory " - "inode 0x%lx or driver bug.", dir_ni->mft_no); - goto unm_err_out; - } - /* Catch multi sector transfer fixup errors. */ - if (unlikely(!ntfs_is_indx_record(ia->magic))) { - ntfs_error(sb, "Directory index record with vcn 0x%llx is " - "corrupt. Corrupt inode 0x%lx. Run chkdsk.", - (unsigned long long)vcn, dir_ni->mft_no); - goto unm_err_out; - } - if (sle64_to_cpu(ia->index_block_vcn) != vcn) { - ntfs_error(sb, "Actual VCN (0x%llx) of index buffer is " - "different from expected VCN (0x%llx). " - "Directory inode 0x%lx is corrupt or driver " - "bug.", (unsigned long long) - sle64_to_cpu(ia->index_block_vcn), - (unsigned long long)vcn, dir_ni->mft_no); - goto unm_err_out; - } - if (le32_to_cpu(ia->index.allocated_size) + 0x18 != - dir_ni->itype.index.block_size) { - ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode " - "0x%lx has a size (%u) differing from the " - "directory specified size (%u). Directory " - "inode is corrupt or driver bug.", - (unsigned long long)vcn, dir_ni->mft_no, - le32_to_cpu(ia->index.allocated_size) + 0x18, - dir_ni->itype.index.block_size); - goto unm_err_out; - } - index_end = (u8*)ia + dir_ni->itype.index.block_size; - if (index_end > kaddr + PAGE_SIZE) { - ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode " - "0x%lx crosses page boundary. Impossible! " - "Cannot access! This is probably a bug in the " - "driver.", (unsigned long long)vcn, - dir_ni->mft_no); - goto unm_err_out; - } - index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length); - if (index_end > (u8*)ia + dir_ni->itype.index.block_size) { - ntfs_error(sb, "Size of index buffer (VCN 0x%llx) of directory " - "inode 0x%lx exceeds maximum size.", - (unsigned long long)vcn, dir_ni->mft_no); - goto unm_err_out; - } - /* The first index entry. */ - ie = (INDEX_ENTRY*)((u8*)&ia->index + - le32_to_cpu(ia->index.entries_offset)); - /* - * Iterate similar to above big loop but applied to index buffer, thus - * loop until we exceed valid memory (corruption case) or until we - * reach the last entry. - */ - for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { - /* Bounds check. */ - if ((u8*)ie < (u8*)ia || (u8*)ie + - sizeof(INDEX_ENTRY_HEADER) > index_end || - (u8*)ie + le16_to_cpu(ie->key_length) > - index_end) { - ntfs_error(sb, "Index entry out of bounds in " - "directory inode 0x%lx.", - dir_ni->mft_no); - goto unm_err_out; - } - /* - * The last entry cannot contain a name. It can however contain - * a pointer to a child node in the B+tree so we just break out. 
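
A condensed sketch of the INDX block sanity checks performed above, with stand-in field names: the record magic must survive the multi sector transfer fixups, the block must carry the VCN that was requested, and its recorded size (with the 0x18 header bytes added back) must equal the directory's index block size.

#include <stdint.h>
#include <string.h>

struct indx_sketch {
	char magic[4];			/* "INDX" when the record is sound */
	int64_t index_block_vcn;	/* assumed already byte-swapped */
	uint32_t allocated_size;	/* index data size, 0x18 header excluded */
};

static int indx_block_ok(const struct indx_sketch *ia, int64_t expected_vcn,
		uint32_t dir_block_size)
{
	if (memcmp(ia->magic, "INDX", 4))
		return 0;		/* mst fixup error, run chkdsk */
	if (ia->index_block_vcn != expected_vcn)
		return 0;		/* buffer landed at the wrong VCN */
	if (ia->allocated_size + 0x18 != dir_block_size)
		return 0;		/* size disagrees with the directory */
	return 1;
}
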
- */ - if (ie->flags & INDEX_ENTRY_END) - break; - /* - * If the current entry has a name type of POSIX, the name is - * case sensitive and not otherwise. This has the effect of us - * not being able to access any POSIX file names which collate - * after the non-POSIX one when they only differ in case, but - * anyone doing screwy stuff like that deserves to burn in - * hell... Doing that kind of stuff on NT4 actually causes - * corruption on the partition even when using SP6a and Linux - * is not involved at all. - */ - ic = ie->key.file_name.file_name_type ? IGNORE_CASE : - CASE_SENSITIVE; - /* - * If the names match perfectly, we are done and return the - * mft reference of the inode (i.e. the inode number together - * with the sequence number for consistency checking. We - * convert it to cpu format before returning. - */ - if (ntfs_are_names_equal(uname, uname_len, - (ntfschar*)&ie->key.file_name.file_name, - ie->key.file_name.file_name_length, ic, - vol->upcase, vol->upcase_len)) { -found_it2: - mref = le64_to_cpu(ie->data.dir.indexed_file); - unlock_page(page); - ntfs_unmap_page(page); - return mref; - } - /* - * Not a perfect match, need to do full blown collation so we - * know which way in the B+tree we have to go. - */ - rc = ntfs_collate_names(uname, uname_len, - (ntfschar*)&ie->key.file_name.file_name, - ie->key.file_name.file_name_length, 1, - IGNORE_CASE, vol->upcase, vol->upcase_len); - /* - * If uname collates before the name of the current entry, there - * is definitely no such name in this index but we might need to - * descend into the B+tree so we just break out of the loop. - */ - if (rc == -1) - break; - /* The names are not equal, continue the search. */ - if (rc) - continue; - /* - * Names match with case insensitive comparison, now try the - * case sensitive comparison, which is required for proper - * collation. - */ - rc = ntfs_collate_names(uname, uname_len, - (ntfschar*)&ie->key.file_name.file_name, - ie->key.file_name.file_name_length, 1, - CASE_SENSITIVE, vol->upcase, vol->upcase_len); - if (rc == -1) - break; - if (rc) - continue; - /* - * Perfect match, this will never happen as the - * ntfs_are_names_equal() call will have gotten a match but we - * still treat it correctly. - */ - goto found_it2; - } - /* - * We have finished with this index buffer without success. Check for - * the presence of a child node. - */ - if (ie->flags & INDEX_ENTRY_NODE) { - if ((ia->index.flags & NODE_MASK) == LEAF_NODE) { - ntfs_error(sb, "Index entry with child node found in " - "a leaf node in directory inode 0x%lx.", - dir_ni->mft_no); - goto unm_err_out; - } - /* Child node present, descend into it. */ - old_vcn = vcn; - vcn = sle64_to_cpup((u8*)ie + le16_to_cpu(ie->length) - 8); - if (vcn >= 0) { - /* If vcn is in the same page cache page as old_vcn we - * recycle the mapped page. */ - if (old_vcn << vol->cluster_size_bits >> - PAGE_SHIFT == vcn << - vol->cluster_size_bits >> - PAGE_SHIFT) - goto fast_descend_into_child_node; - unlock_page(page); - ntfs_unmap_page(page); - goto descend_into_child_node; - } - ntfs_error(sb, "Negative child node vcn in directory inode " - "0x%lx.", dir_ni->mft_no); - goto unm_err_out; - } - /* No child node, return -ENOENT. */ - ntfs_debug("Entry not found."); - err = -ENOENT; -unm_err_out: - unlock_page(page); - ntfs_unmap_page(page); -err_out: - if (!err) - err = -EIO; - if (ctx) - ntfs_attr_put_search_ctx(ctx); - if (m) - unmap_mft_record(dir_ni); - return ERR_MREF(err); -dir_err_out: - ntfs_error(sb, "Corrupt directory. 
Aborting lookup."); - goto err_out; -} - -#endif - -/** - * ntfs_filldir - ntfs specific filldir method - * @vol: current ntfs volume - * @ndir: ntfs inode of current directory - * @ia_page: page in which the index allocation buffer @ie is in resides - * @ie: current index entry - * @name: buffer to use for the converted name - * @actor: what to feed the entries to - * - * Convert the Unicode @name to the loaded NLS and pass it to the @filldir - * callback. - * - * If @ia_page is not NULL it is the locked page containing the index - * allocation block containing the index entry @ie. - * - * Note, we drop (and then reacquire) the page lock on @ia_page across the - * @filldir() call otherwise we would deadlock with NFSd when it calls ->lookup - * since ntfs_lookup() will lock the same page. As an optimization, we do not - * retake the lock if we are returning a non-zero value as ntfs_readdir() - * would need to drop the lock immediately anyway. - */ -static inline int ntfs_filldir(ntfs_volume *vol, - ntfs_inode *ndir, struct page *ia_page, INDEX_ENTRY *ie, - u8 *name, struct dir_context *actor) -{ - unsigned long mref; - int name_len; - unsigned dt_type; - FILE_NAME_TYPE_FLAGS name_type; - - name_type = ie->key.file_name.file_name_type; - if (name_type == FILE_NAME_DOS) { - ntfs_debug("Skipping DOS name space entry."); - return 0; - } - if (MREF_LE(ie->data.dir.indexed_file) == FILE_root) { - ntfs_debug("Skipping root directory self reference entry."); - return 0; - } - if (MREF_LE(ie->data.dir.indexed_file) < FILE_first_user && - !NVolShowSystemFiles(vol)) { - ntfs_debug("Skipping system file."); - return 0; - } - name_len = ntfs_ucstonls(vol, (ntfschar*)&ie->key.file_name.file_name, - ie->key.file_name.file_name_length, &name, - NTFS_MAX_NAME_LEN * NLS_MAX_CHARSET_SIZE + 1); - if (name_len <= 0) { - ntfs_warning(vol->sb, "Skipping unrepresentable inode 0x%llx.", - (long long)MREF_LE(ie->data.dir.indexed_file)); - return 0; - } - if (ie->key.file_name.file_attributes & - FILE_ATTR_DUP_FILE_NAME_INDEX_PRESENT) - dt_type = DT_DIR; - else - dt_type = DT_REG; - mref = MREF_LE(ie->data.dir.indexed_file); - /* - * Drop the page lock otherwise we deadlock with NFS when it calls - * ->lookup since ntfs_lookup() will lock the same page. - */ - if (ia_page) - unlock_page(ia_page); - ntfs_debug("Calling filldir for %s with len %i, fpos 0x%llx, inode " - "0x%lx, DT_%s.", name, name_len, actor->pos, mref, - dt_type == DT_DIR ? "DIR" : "REG"); - if (!dir_emit(actor, name, name_len, mref, dt_type)) - return 1; - /* Relock the page but not if we are aborting ->readdir. */ - if (ia_page) - lock_page(ia_page); - return 0; -} - -/* - * We use the same basic approach as the old NTFS driver, i.e. we parse the - * index root entries and then the index allocation entries that are marked - * as in use in the index bitmap. - * - * While this will return the names in random order this doesn't matter for - * ->readdir but OTOH results in a faster ->readdir. - * - * VFS calls ->readdir without BKL but with i_mutex held. This protects the VFS - * parts (e.g. ->f_pos and ->i_size, and it also protects against directory - * modifications). - * - * Locking: - Caller must hold i_mutex on the directory. 
- * - Each page cache page in the index allocation mapping must be - * locked whilst being accessed otherwise we may find a corrupt - * page due to it being under ->writepage at the moment which - * applies the mst protection fixups before writing out and then - * removes them again after the write is complete after which it - * unlocks the page. - */ -static int ntfs_readdir(struct file *file, struct dir_context *actor) -{ - s64 ia_pos, ia_start, prev_ia_pos, bmp_pos; - loff_t i_size; - struct inode *bmp_vi, *vdir = file_inode(file); - struct super_block *sb = vdir->i_sb; - ntfs_inode *ndir = NTFS_I(vdir); - ntfs_volume *vol = NTFS_SB(sb); - MFT_RECORD *m; - INDEX_ROOT *ir = NULL; - INDEX_ENTRY *ie; - INDEX_ALLOCATION *ia; - u8 *name = NULL; - int rc, err, ir_pos, cur_bmp_pos; - struct address_space *ia_mapping, *bmp_mapping; - struct page *bmp_page = NULL, *ia_page = NULL; - u8 *kaddr, *bmp, *index_end; - ntfs_attr_search_ctx *ctx; - - ntfs_debug("Entering for inode 0x%lx, fpos 0x%llx.", - vdir->i_ino, actor->pos); - rc = err = 0; - /* Are we at end of dir yet? */ - i_size = i_size_read(vdir); - if (actor->pos >= i_size + vol->mft_record_size) - return 0; - /* Emulate . and .. for all directories. */ - if (!dir_emit_dots(file, actor)) - return 0; - m = NULL; - ctx = NULL; - /* - * Allocate a buffer to store the current name being processed - * converted to format determined by current NLS. - */ - name = kmalloc(NTFS_MAX_NAME_LEN * NLS_MAX_CHARSET_SIZE + 1, GFP_NOFS); - if (unlikely(!name)) { - err = -ENOMEM; - goto err_out; - } - /* Are we jumping straight into the index allocation attribute? */ - if (actor->pos >= vol->mft_record_size) - goto skip_index_root; - /* Get hold of the mft record for the directory. */ - m = map_mft_record(ndir); - if (IS_ERR(m)) { - err = PTR_ERR(m); - m = NULL; - goto err_out; - } - ctx = ntfs_attr_get_search_ctx(ndir, m); - if (unlikely(!ctx)) { - err = -ENOMEM; - goto err_out; - } - /* Get the offset into the index root attribute. */ - ir_pos = (s64)actor->pos; - /* Find the index root attribute in the mft record. */ - err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL, - 0, ctx); - if (unlikely(err)) { - ntfs_error(sb, "Index root attribute missing in directory " - "inode 0x%lx.", vdir->i_ino); - goto err_out; - } - /* - * Copy the index root attribute value to a buffer so that we can put - * the search context and unmap the mft record before calling the - * filldir() callback. We need to do this because of NFSd which calls - * ->lookup() from its filldir callback() and this causes NTFS to - * deadlock as ntfs_lookup() maps the mft record of the directory and - * we have got it mapped here already. The only solution is for us to - * unmap the mft record here so that a call to ntfs_lookup() is able to - * map the mft record without deadlocking. - */ - rc = le32_to_cpu(ctx->attr->data.resident.value_length); - ir = kmalloc(rc, GFP_NOFS); - if (unlikely(!ir)) { - err = -ENOMEM; - goto err_out; - } - /* Copy the index root value (it has been verified in read_inode). */ - memcpy(ir, (u8*)ctx->attr + - le16_to_cpu(ctx->attr->data.resident.value_offset), rc); - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(ndir); - ctx = NULL; - m = NULL; - index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length); - /* The first index entry. 
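
A sketch of the f_pos encoding that the checks above rely on, with a hypothetical classifier: positions 0 and 1 are the emulated dot entries, positions below the mft record size index into the copied INDEX_ROOT value, and everything past that is an index allocation byte offset shifted up by the mft record size.

#include <stdint.h>

enum dir_region { DIR_DOTS, DIR_INDEX_ROOT, DIR_INDEX_ALLOC, DIR_EOD };

static enum dir_region classify_fpos(int64_t fpos, uint32_t mft_record_size,
		int64_t dir_i_size)
{
	if (fpos < 2)
		return DIR_DOTS;		/* . and .. */
	if (fpos < mft_record_size)
		return DIR_INDEX_ROOT;		/* ir_pos = fpos */
	if (fpos < dir_i_size + mft_record_size)
		return DIR_INDEX_ALLOC;		/* ia_pos = fpos - mft_record_size */
	return DIR_EOD;
}
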
*/ - ie = (INDEX_ENTRY*)((u8*)&ir->index + - le32_to_cpu(ir->index.entries_offset)); - /* - * Loop until we exceed valid memory (corruption case) or until we - * reach the last entry or until filldir tells us it has had enough - * or signals an error (both covered by the rc test). - */ - for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { - ntfs_debug("In index root, offset 0x%zx.", (u8*)ie - (u8*)ir); - /* Bounds checks. */ - if (unlikely((u8*)ie < (u8*)ir || (u8*)ie + - sizeof(INDEX_ENTRY_HEADER) > index_end || - (u8*)ie + le16_to_cpu(ie->key_length) > - index_end)) - goto err_out; - /* The last entry cannot contain a name. */ - if (ie->flags & INDEX_ENTRY_END) - break; - /* Skip index root entry if continuing previous readdir. */ - if (ir_pos > (u8*)ie - (u8*)ir) - continue; - /* Advance the position even if going to skip the entry. */ - actor->pos = (u8*)ie - (u8*)ir; - /* Submit the name to the filldir callback. */ - rc = ntfs_filldir(vol, ndir, NULL, ie, name, actor); - if (rc) { - kfree(ir); - goto abort; - } - } - /* We are done with the index root and can free the buffer. */ - kfree(ir); - ir = NULL; - /* If there is no index allocation attribute we are finished. */ - if (!NInoIndexAllocPresent(ndir)) - goto EOD; - /* Advance fpos to the beginning of the index allocation. */ - actor->pos = vol->mft_record_size; -skip_index_root: - kaddr = NULL; - prev_ia_pos = -1LL; - /* Get the offset into the index allocation attribute. */ - ia_pos = (s64)actor->pos - vol->mft_record_size; - ia_mapping = vdir->i_mapping; - ntfs_debug("Inode 0x%lx, getting index bitmap.", vdir->i_ino); - bmp_vi = ntfs_attr_iget(vdir, AT_BITMAP, I30, 4); - if (IS_ERR(bmp_vi)) { - ntfs_error(sb, "Failed to get bitmap attribute."); - err = PTR_ERR(bmp_vi); - goto err_out; - } - bmp_mapping = bmp_vi->i_mapping; - /* Get the starting bitmap bit position and sanity check it. */ - bmp_pos = ia_pos >> ndir->itype.index.block_size_bits; - if (unlikely(bmp_pos >> 3 >= i_size_read(bmp_vi))) { - ntfs_error(sb, "Current index allocation position exceeds " - "index bitmap size."); - goto iput_err_out; - } - /* Get the starting bit position in the current bitmap page. */ - cur_bmp_pos = bmp_pos & ((PAGE_SIZE * 8) - 1); - bmp_pos &= ~(u64)((PAGE_SIZE * 8) - 1); -get_next_bmp_page: - ntfs_debug("Reading bitmap with page index 0x%llx, bit ofs 0x%llx", - (unsigned long long)bmp_pos >> (3 + PAGE_SHIFT), - (unsigned long long)bmp_pos & - (unsigned long long)((PAGE_SIZE * 8) - 1)); - bmp_page = ntfs_map_page(bmp_mapping, - bmp_pos >> (3 + PAGE_SHIFT)); - if (IS_ERR(bmp_page)) { - ntfs_error(sb, "Reading index bitmap failed."); - err = PTR_ERR(bmp_page); - bmp_page = NULL; - goto iput_err_out; - } - bmp = (u8*)page_address(bmp_page); - /* Find next index block in use. */ - while (!(bmp[cur_bmp_pos >> 3] & (1 << (cur_bmp_pos & 7)))) { -find_next_index_buffer: - cur_bmp_pos++; - /* - * If we have reached the end of the bitmap page, get the next - * page, and put away the old one. - */ - if (unlikely((cur_bmp_pos >> 3) >= PAGE_SIZE)) { - ntfs_unmap_page(bmp_page); - bmp_pos += PAGE_SIZE * 8; - cur_bmp_pos = 0; - goto get_next_bmp_page; - } - /* If we have reached the end of the bitmap, we are done. */ - if (unlikely(((bmp_pos + cur_bmp_pos) >> 3) >= i_size)) - goto unm_EOD; - ia_pos = (bmp_pos + cur_bmp_pos) << - ndir->itype.index.block_size_bits; - } - ntfs_debug("Handling index buffer 0x%llx.", - (unsigned long long)bmp_pos + cur_bmp_pos); - /* If the current index buffer is in the same page we reuse the page. 
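
A sketch of the bitmap arithmetic in the walk above: each bit of the directory's $BITMAP attribute stands for one index block, the absolute bit number splits into a bitmap page and a bit offset within that page, and a set bit marks the block as in use.

#include <stdint.h>

#define PAGE_SHIFT_SK	12
#define PAGE_SIZE_SK	(1UL << PAGE_SHIFT_SK)

static uint64_t bmp_page_index(uint64_t bit)
{
	return bit >> (3 + PAGE_SHIFT_SK);	/* 8 bits per bitmap byte */
}

static unsigned bmp_bit_in_page(uint64_t bit)
{
	return (unsigned)(bit & (PAGE_SIZE_SK * 8 - 1));
}

static int index_block_in_use(const uint8_t *bmp, unsigned bit_in_page)
{
	return bmp[bit_in_page >> 3] & (1 << (bit_in_page & 7));
}
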
*/ - if ((prev_ia_pos & (s64)PAGE_MASK) != - (ia_pos & (s64)PAGE_MASK)) { - prev_ia_pos = ia_pos; - if (likely(ia_page != NULL)) { - unlock_page(ia_page); - ntfs_unmap_page(ia_page); - } - /* - * Map the page cache page containing the current ia_pos, - * reading it from disk if necessary. - */ - ia_page = ntfs_map_page(ia_mapping, ia_pos >> PAGE_SHIFT); - if (IS_ERR(ia_page)) { - ntfs_error(sb, "Reading index allocation data failed."); - err = PTR_ERR(ia_page); - ia_page = NULL; - goto err_out; - } - lock_page(ia_page); - kaddr = (u8*)page_address(ia_page); - } - /* Get the current index buffer. */ - ia = (INDEX_ALLOCATION*)(kaddr + (ia_pos & ~PAGE_MASK & - ~(s64)(ndir->itype.index.block_size - 1))); - /* Bounds checks. */ - if (unlikely((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_SIZE)) { - ntfs_error(sb, "Out of bounds check failed. Corrupt directory " - "inode 0x%lx or driver bug.", vdir->i_ino); - goto err_out; - } - /* Catch multi sector transfer fixup errors. */ - if (unlikely(!ntfs_is_indx_record(ia->magic))) { - ntfs_error(sb, "Directory index record with vcn 0x%llx is " - "corrupt. Corrupt inode 0x%lx. Run chkdsk.", - (unsigned long long)ia_pos >> - ndir->itype.index.vcn_size_bits, vdir->i_ino); - goto err_out; - } - if (unlikely(sle64_to_cpu(ia->index_block_vcn) != (ia_pos & - ~(s64)(ndir->itype.index.block_size - 1)) >> - ndir->itype.index.vcn_size_bits)) { - ntfs_error(sb, "Actual VCN (0x%llx) of index buffer is " - "different from expected VCN (0x%llx). " - "Directory inode 0x%lx is corrupt or driver " - "bug. ", (unsigned long long) - sle64_to_cpu(ia->index_block_vcn), - (unsigned long long)ia_pos >> - ndir->itype.index.vcn_size_bits, vdir->i_ino); - goto err_out; - } - if (unlikely(le32_to_cpu(ia->index.allocated_size) + 0x18 != - ndir->itype.index.block_size)) { - ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode " - "0x%lx has a size (%u) differing from the " - "directory specified size (%u). Directory " - "inode is corrupt or driver bug.", - (unsigned long long)ia_pos >> - ndir->itype.index.vcn_size_bits, vdir->i_ino, - le32_to_cpu(ia->index.allocated_size) + 0x18, - ndir->itype.index.block_size); - goto err_out; - } - index_end = (u8*)ia + ndir->itype.index.block_size; - if (unlikely(index_end > kaddr + PAGE_SIZE)) { - ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode " - "0x%lx crosses page boundary. Impossible! " - "Cannot access! This is probably a bug in the " - "driver.", (unsigned long long)ia_pos >> - ndir->itype.index.vcn_size_bits, vdir->i_ino); - goto err_out; - } - ia_start = ia_pos & ~(s64)(ndir->itype.index.block_size - 1); - index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length); - if (unlikely(index_end > (u8*)ia + ndir->itype.index.block_size)) { - ntfs_error(sb, "Size of index buffer (VCN 0x%llx) of directory " - "inode 0x%lx exceeds maximum size.", - (unsigned long long)ia_pos >> - ndir->itype.index.vcn_size_bits, vdir->i_ino); - goto err_out; - } - /* The first index entry in this index buffer. */ - ie = (INDEX_ENTRY*)((u8*)&ia->index + - le32_to_cpu(ia->index.entries_offset)); - /* - * Loop until we exceed valid memory (corruption case) or until we - * reach the last entry or until filldir tells us it has had enough - * or signals an error (both covered by the rc test). - */ - for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { - ntfs_debug("In index allocation, offset 0x%llx.", - (unsigned long long)ia_start + - (unsigned long long)((u8*)ie - (u8*)ia)); - /* Bounds checks. 
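
A sketch of the address arithmetic just above, assuming a 4 KiB PAGE_SIZE and a power-of-two index block size: the block's location inside the mapped page, and the VCN that block is expected to carry.

#include <stdint.h>

#define PAGE_SIZE_SK	4096L

static uint8_t *ia_in_page(uint8_t *kaddr, int64_t ia_pos, uint32_t block_size)
{
	/* In-page offset, rounded down to the start of the index block. */
	return kaddr + (ia_pos & (PAGE_SIZE_SK - 1) & ~((int64_t)block_size - 1));
}

static int64_t ia_expected_vcn(int64_t ia_pos, uint32_t block_size,
		unsigned vcn_size_bits)
{
	return (ia_pos & ~((int64_t)block_size - 1)) >> vcn_size_bits;
}
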
*/ - if (unlikely((u8*)ie < (u8*)ia || (u8*)ie + - sizeof(INDEX_ENTRY_HEADER) > index_end || - (u8*)ie + le16_to_cpu(ie->key_length) > - index_end)) - goto err_out; - /* The last entry cannot contain a name. */ - if (ie->flags & INDEX_ENTRY_END) - break; - /* Skip index block entry if continuing previous readdir. */ - if (ia_pos - ia_start > (u8*)ie - (u8*)ia) - continue; - /* Advance the position even if going to skip the entry. */ - actor->pos = (u8*)ie - (u8*)ia + - (sle64_to_cpu(ia->index_block_vcn) << - ndir->itype.index.vcn_size_bits) + - vol->mft_record_size; - /* - * Submit the name to the @filldir callback. Note, - * ntfs_filldir() drops the lock on @ia_page but it retakes it - * before returning, unless a non-zero value is returned in - * which case the page is left unlocked. - */ - rc = ntfs_filldir(vol, ndir, ia_page, ie, name, actor); - if (rc) { - /* @ia_page is already unlocked in this case. */ - ntfs_unmap_page(ia_page); - ntfs_unmap_page(bmp_page); - iput(bmp_vi); - goto abort; - } - } - goto find_next_index_buffer; -unm_EOD: - if (ia_page) { - unlock_page(ia_page); - ntfs_unmap_page(ia_page); - } - ntfs_unmap_page(bmp_page); - iput(bmp_vi); -EOD: - /* We are finished, set fpos to EOD. */ - actor->pos = i_size + vol->mft_record_size; -abort: - kfree(name); - return 0; -err_out: - if (bmp_page) { - ntfs_unmap_page(bmp_page); -iput_err_out: - iput(bmp_vi); - } - if (ia_page) { - unlock_page(ia_page); - ntfs_unmap_page(ia_page); - } - kfree(ir); - kfree(name); - if (ctx) - ntfs_attr_put_search_ctx(ctx); - if (m) - unmap_mft_record(ndir); - if (!err) - err = -EIO; - ntfs_debug("Failed. Returning error code %i.", -err); - return err; -} - -/** - * ntfs_dir_open - called when an inode is about to be opened - * @vi: inode to be opened - * @filp: file structure describing the inode - * - * Limit directory size to the page cache limit on architectures where unsigned - * long is 32-bits. This is the most we can do for now without overflowing the - * page cache page index. Doing it this way means we don't run into problems - * because of existing too large directories. It would be better to allow the - * user to read the accessible part of the directory but I doubt very much - * anyone is going to hit this check on a 32-bit architecture, so there is no - * point in adding the extra complexity required to support this. - * - * On 64-bit architectures, the check is hopefully optimized away by the - * compiler. - */ -static int ntfs_dir_open(struct inode *vi, struct file *filp) -{ - if (sizeof(unsigned long) < 8) { - if (i_size_read(vi) > MAX_LFS_FILESIZE) - return -EFBIG; - } - return 0; -} - -#ifdef NTFS_RW - -/** - * ntfs_dir_fsync - sync a directory to disk - * @filp: directory to be synced - * @start: offset in bytes of the beginning of data range to sync - * @end: offset in bytes of the end of data range (inclusive) - * @datasync: if non-zero only flush user data and not metadata - * - * Data integrity sync of a directory to disk. Used for fsync, fdatasync, and - * msync system calls. This function is based on file.c::ntfs_file_fsync(). - * - * Write the mft record and all associated extent mft records as well as the - * $INDEX_ALLOCATION and $BITMAP attributes and then sync the block device. - * - * If @datasync is true, we do not wait on the inode(s) to be written out - * but we always wait on the page cache pages to be written out. - * - * Note: In the past @filp could be NULL so we ignore it as we don't need it - * anyway. 
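
A sketch of the @ia_page lock dance described above, shaped like the tail of ntfs_filldir(): the page lock is dropped across dir_emit() so that an NFSd re-entry into ->lookup cannot deadlock on the same page, and it is only retaken when iteration continues. The helper itself is hypothetical.

static int emit_one_sketch(struct page *ia_page, struct dir_context *actor,
		const char *name, int name_len, unsigned long ino,
		unsigned int dt_type)
{
	if (ia_page)
		unlock_page(ia_page);
	if (!dir_emit(actor, name, name_len, ino, dt_type))
		return 1;	/* aborting: leave the page unlocked */
	if (ia_page)
		lock_page(ia_page);	/* resume the locked walk */
	return 0;
}
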
- * - * Locking: Caller must hold i_mutex on the inode. - * - * TODO: We should probably also write all attribute/index inodes associated - * with this inode but since we have no simple way of getting to them we ignore - * this problem for now. We do write the $BITMAP attribute if it is present - * which is the important one for a directory so things are not too bad. - */ -static int ntfs_dir_fsync(struct file *filp, loff_t start, loff_t end, - int datasync) -{ - struct inode *bmp_vi, *vi = filp->f_mapping->host; - int err, ret; - ntfs_attr na; - - ntfs_debug("Entering for inode 0x%lx.", vi->i_ino); - - err = file_write_and_wait_range(filp, start, end); - if (err) - return err; - inode_lock(vi); - - BUG_ON(!S_ISDIR(vi->i_mode)); - /* If the bitmap attribute inode is in memory sync it, too. */ - na.mft_no = vi->i_ino; - na.type = AT_BITMAP; - na.name = I30; - na.name_len = 4; - bmp_vi = ilookup5(vi->i_sb, vi->i_ino, ntfs_test_inode, &na); - if (bmp_vi) { - write_inode_now(bmp_vi, !datasync); - iput(bmp_vi); - } - ret = __ntfs_write_inode(vi, 1); - write_inode_now(vi, !datasync); - err = sync_blockdev(vi->i_sb->s_bdev); - if (unlikely(err && !ret)) - ret = err; - if (likely(!ret)) - ntfs_debug("Done."); - else - ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx. Error " - "%u.", datasync ? "data" : "", vi->i_ino, -ret); - inode_unlock(vi); - return ret; -} - -#endif /* NTFS_RW */ - -WRAP_DIR_ITER(ntfs_readdir) // FIXME! -const struct file_operations ntfs_dir_ops = { - .llseek = generic_file_llseek, /* Seek inside directory. */ - .read = generic_read_dir, /* Return -EISDIR. */ - .iterate_shared = shared_ntfs_readdir, /* Read directory contents. */ -#ifdef NTFS_RW - .fsync = ntfs_dir_fsync, /* Sync a directory to disk. */ -#endif /* NTFS_RW */ - /*.ioctl = ,*/ /* Perform function on the - mounted filesystem. */ - .open = ntfs_dir_open, /* Open directory. */ -}; diff --git a/fs/ntfs/dir.h b/fs/ntfs/dir.h deleted file mode 100644 index 0e326753df401..0000000000000 --- a/fs/ntfs/dir.h +++ /dev/null @@ -1,34 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * dir.h - Defines for directory handling in NTFS Linux kernel driver. Part of - * the Linux-NTFS project. - * - * Copyright (c) 2002-2004 Anton Altaparmakov - */ - -#ifndef _LINUX_NTFS_DIR_H -#define _LINUX_NTFS_DIR_H - -#include "layout.h" -#include "inode.h" -#include "types.h" - -/* - * ntfs_name is used to return the file name to the caller of - * ntfs_lookup_inode_by_name() in order for the caller (namei.c::ntfs_lookup()) - * to be able to deal with dcache aliasing issues. - */ -typedef struct { - MFT_REF mref; - FILE_NAME_TYPE_FLAGS type; - u8 len; - ntfschar name[0]; -} __attribute__ ((__packed__)) ntfs_name; - -/* The little endian Unicode string $I30 as a global constant. */ -extern ntfschar I30[5]; - -extern MFT_REF ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, - const ntfschar *uname, const int uname_len, ntfs_name **res); - -#endif /* _LINUX_NTFS_FS_DIR_H */ diff --git a/fs/ntfs/endian.h b/fs/ntfs/endian.h deleted file mode 100644 index f30c139bf9ae8..0000000000000 --- a/fs/ntfs/endian.h +++ /dev/null @@ -1,79 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * endian.h - Defines for endianness handling in NTFS Linux kernel driver. - * Part of the Linux-NTFS project. - * - * Copyright (c) 2001-2004 Anton Altaparmakov - */ - -#ifndef _LINUX_NTFS_ENDIAN_H -#define _LINUX_NTFS_ENDIAN_H - -#include -#include "types.h" - -/* - * Signed endianness conversion functions. 
- */ - -static inline s16 sle16_to_cpu(sle16 x) -{ - return le16_to_cpu((__force le16)x); -} - -static inline s32 sle32_to_cpu(sle32 x) -{ - return le32_to_cpu((__force le32)x); -} - -static inline s64 sle64_to_cpu(sle64 x) -{ - return le64_to_cpu((__force le64)x); -} - -static inline s16 sle16_to_cpup(sle16 *x) -{ - return le16_to_cpu(*(__force le16*)x); -} - -static inline s32 sle32_to_cpup(sle32 *x) -{ - return le32_to_cpu(*(__force le32*)x); -} - -static inline s64 sle64_to_cpup(sle64 *x) -{ - return le64_to_cpu(*(__force le64*)x); -} - -static inline sle16 cpu_to_sle16(s16 x) -{ - return (__force sle16)cpu_to_le16(x); -} - -static inline sle32 cpu_to_sle32(s32 x) -{ - return (__force sle32)cpu_to_le32(x); -} - -static inline sle64 cpu_to_sle64(s64 x) -{ - return (__force sle64)cpu_to_le64(x); -} - -static inline sle16 cpu_to_sle16p(s16 *x) -{ - return (__force sle16)cpu_to_le16(*x); -} - -static inline sle32 cpu_to_sle32p(s32 *x) -{ - return (__force sle32)cpu_to_le32(*x); -} - -static inline sle64 cpu_to_sle64p(s64 *x) -{ - return (__force sle64)cpu_to_le64(*x); -} - -#endif /* _LINUX_NTFS_ENDIAN_H */ diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c deleted file mode 100644 index 297c0b9db6211..0000000000000 --- a/fs/ntfs/file.c +++ /dev/null @@ -1,1997 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * file.c - NTFS kernel file operations. Part of the Linux-NTFS project. - * - * Copyright (c) 2001-2015 Anton Altaparmakov and Tuxera Inc. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include "attrib.h" -#include "bitmap.h" -#include "inode.h" -#include "debug.h" -#include "lcnalloc.h" -#include "malloc.h" -#include "mft.h" -#include "ntfs.h" - -/** - * ntfs_file_open - called when an inode is about to be opened - * @vi: inode to be opened - * @filp: file structure describing the inode - * - * Limit file size to the page cache limit on architectures where unsigned long - * is 32-bits. This is the most we can do for now without overflowing the page - * cache page index. Doing it this way means we don't run into problems because - * of existing too large files. It would be better to allow the user to read - * the beginning of the file but I doubt very much anyone is going to hit this - * check on a 32-bit architecture, so there is no point in adding the extra - * complexity required to support this. - * - * On 64-bit architectures, the check is hopefully optimized away by the - * compiler. - * - * After the check passes, just call generic_file_open() to do its work. - */ -static int ntfs_file_open(struct inode *vi, struct file *filp) -{ - if (sizeof(unsigned long) < 8) { - if (i_size_read(vi) > MAX_LFS_FILESIZE) - return -EOVERFLOW; - } - return generic_file_open(vi, filp); -} - -#ifdef NTFS_RW - -/** - * ntfs_attr_extend_initialized - extend the initialized size of an attribute - * @ni: ntfs inode of the attribute to extend - * @new_init_size: requested new initialized size in bytes - * - * Extend the initialized size of an attribute described by the ntfs inode @ni - * to @new_init_size bytes. This involves zeroing any non-sparse space between - * the old initialized size and @new_init_size both in the page cache and on - * disk (if relevant complete pages are already uptodate in the page cache then - * these are simply marked dirty). 
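
A sketch of the 32-bit guard shown in ntfs_file_open() above (ntfs_dir_open() uses the same test with -EFBIG): with a 32-bit unsigned long the page cache index would overflow for inodes past MAX_LFS_FILESIZE, so such files are refused rather than partially supported.

#include <stdint.h>

static int open_size_ok(int64_t i_size, int64_t max_lfs_filesize)
{
	/* Constant-folds away on 64-bit hosts, just like the original. */
	if (sizeof(unsigned long) < 8 && i_size > max_lfs_filesize)
		return 0;	/* -EOVERFLOW for files, -EFBIG for dirs */
	return 1;
}
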
- *
- * As a side-effect, the file size (vfs inode->i_size) may be incremented as,
- * in the resident attribute case, it is tied to the initialized size and, in
- * the non-resident attribute case, it may not fall below the initialized size.
- *
- * Note that if the attribute is resident, we do not need to touch the page
- * cache at all. This is because if the page cache page is not uptodate we
- * bring it uptodate later, when doing the write to the mft record since we
- * then already have the page mapped. And if the page is uptodate, the
- * non-initialized region will already have been zeroed when the page was
- * brought uptodate and the region may in fact already have been overwritten
- * with new data via mmap() based writes, so we cannot just zero it. And since
- * POSIX specifies that the behaviour of resizing a file whilst it is mmap()ped
- * is unspecified, we choose not to do zeroing and thus we do not need to touch
- * the page at all. For a more detailed explanation see ntfs_truncate() in
- * fs/ntfs/inode.c.
- *
- * Return 0 on success and -errno on error. In the case that an error is
- * encountered it is possible that the initialized size will already have been
- * incremented some way towards @new_init_size but it is guaranteed that if
- * this is the case, the necessary zeroing will also have happened and that all
- * metadata is self-consistent.
- *
- * Locking: i_mutex on the vfs inode corresponding to the ntfs inode @ni must be
- * held by the caller.
- */
-static int ntfs_attr_extend_initialized(ntfs_inode *ni, const s64 new_init_size)
-{
-	s64 old_init_size;
-	loff_t old_i_size;
-	pgoff_t index, end_index;
-	unsigned long flags;
-	struct inode *vi = VFS_I(ni);
-	ntfs_inode *base_ni;
-	MFT_RECORD *m = NULL;
-	ATTR_RECORD *a;
-	ntfs_attr_search_ctx *ctx = NULL;
-	struct address_space *mapping;
-	struct page *page = NULL;
-	u8 *kattr;
-	int err;
-	u32 attr_len;
-
-	read_lock_irqsave(&ni->size_lock, flags);
-	old_init_size = ni->initialized_size;
-	old_i_size = i_size_read(vi);
-	BUG_ON(new_init_size > ni->allocated_size);
-	read_unlock_irqrestore(&ni->size_lock, flags);
-	ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, "
-			"old_initialized_size 0x%llx, "
-			"new_initialized_size 0x%llx, i_size 0x%llx.",
-			vi->i_ino, (unsigned)le32_to_cpu(ni->type),
-			(unsigned long long)old_init_size,
-			(unsigned long long)new_init_size, old_i_size);
-	if (!NInoAttr(ni))
-		base_ni = ni;
-	else
-		base_ni = ni->ext.base_ntfs_ino;
-	/* Use goto to reduce indentation and we need the label below anyway. */
-	if (NInoNonResident(ni))
-		goto do_non_resident_extend;
-	BUG_ON(old_init_size != old_i_size);
-	m = map_mft_record(base_ni);
-	if (IS_ERR(m)) {
-		err = PTR_ERR(m);
-		m = NULL;
-		goto err_out;
-	}
-	ctx = ntfs_attr_get_search_ctx(base_ni, m);
-	if (unlikely(!ctx)) {
-		err = -ENOMEM;
-		goto err_out;
-	}
-	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
-			CASE_SENSITIVE, 0, NULL, 0, ctx);
-	if (unlikely(err)) {
-		if (err == -ENOENT)
-			err = -EIO;
-		goto err_out;
-	}
-	m = ctx->mrec;
-	a = ctx->attr;
-	BUG_ON(a->non_resident);
-	/* The total length of the attribute value. */
-	attr_len = le32_to_cpu(a->data.resident.value_length);
-	BUG_ON(old_i_size != (loff_t)attr_len);
-	/*
-	 * Do the zeroing in the mft record and update the attribute size in
-	 * the mft record.
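
A sketch of the resident case handled below, with stand-in fields: the attribute value lives inside the mft record at value_offset, so extending the initialized size amounts to zeroing the value's tail in place and bumping value_length.

#include <stdint.h>
#include <string.h>

struct resident_sketch {
	uint16_t value_offset;		/* from the attribute record start */
	uint32_t value_length;		/* current value (and file) size */
};

static void extend_resident_sketch(uint8_t *attr_rec,
		struct resident_sketch *a, uint32_t new_init_size)
{
	uint8_t *value = attr_rec + a->value_offset;

	memset(value + a->value_length, 0, new_init_size - a->value_length);
	a->value_length = new_init_size;	/* sizes stay tied together */
}
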
- */ - kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset); - memset(kattr + attr_len, 0, new_init_size - attr_len); - a->data.resident.value_length = cpu_to_le32((u32)new_init_size); - /* Finally, update the sizes in the vfs and ntfs inodes. */ - write_lock_irqsave(&ni->size_lock, flags); - i_size_write(vi, new_init_size); - ni->initialized_size = new_init_size; - write_unlock_irqrestore(&ni->size_lock, flags); - goto done; -do_non_resident_extend: - /* - * If the new initialized size @new_init_size exceeds the current file - * size (vfs inode->i_size), we need to extend the file size to the - * new initialized size. - */ - if (new_init_size > old_i_size) { - m = map_mft_record(base_ni); - if (IS_ERR(m)) { - err = PTR_ERR(m); - m = NULL; - goto err_out; - } - ctx = ntfs_attr_get_search_ctx(base_ni, m); - if (unlikely(!ctx)) { - err = -ENOMEM; - goto err_out; - } - err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, - CASE_SENSITIVE, 0, NULL, 0, ctx); - if (unlikely(err)) { - if (err == -ENOENT) - err = -EIO; - goto err_out; - } - m = ctx->mrec; - a = ctx->attr; - BUG_ON(!a->non_resident); - BUG_ON(old_i_size != (loff_t) - sle64_to_cpu(a->data.non_resident.data_size)); - a->data.non_resident.data_size = cpu_to_sle64(new_init_size); - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - /* Update the file size in the vfs inode. */ - i_size_write(vi, new_init_size); - ntfs_attr_put_search_ctx(ctx); - ctx = NULL; - unmap_mft_record(base_ni); - m = NULL; - } - mapping = vi->i_mapping; - index = old_init_size >> PAGE_SHIFT; - end_index = (new_init_size + PAGE_SIZE - 1) >> PAGE_SHIFT; - do { - /* - * Read the page. If the page is not present, this will zero - * the uninitialized regions for us. - */ - page = read_mapping_page(mapping, index, NULL); - if (IS_ERR(page)) { - err = PTR_ERR(page); - goto init_err_out; - } - /* - * Update the initialized size in the ntfs inode. This is - * enough to make ntfs_writepage() work. - */ - write_lock_irqsave(&ni->size_lock, flags); - ni->initialized_size = (s64)(index + 1) << PAGE_SHIFT; - if (ni->initialized_size > new_init_size) - ni->initialized_size = new_init_size; - write_unlock_irqrestore(&ni->size_lock, flags); - /* Set the page dirty so it gets written out. */ - set_page_dirty(page); - put_page(page); - /* - * Play nice with the vm and the rest of the system. This is - * very much needed as we can potentially be modifying the - * initialised size from a very small value to a really huge - * value, e.g. - * f = open(somefile, O_TRUNC); - * truncate(f, 10GiB); - * seek(f, 10GiB); - * write(f, 1); - * And this would mean we would be marking dirty hundreds of - * thousands of pages or as in the above example more than - * two and a half million pages! - * - * TODO: For sparse pages could optimize this workload by using - * the FsMisc / MiscFs page bit as a "PageIsSparse" bit. This - * would be set in read_folio for sparse pages and here we would - * not need to mark dirty any pages which have this bit set. - * The only caveat is that we have to clear the bit everywhere - * where we allocate any clusters that lie in the page or that - * contain the page. - * - * TODO: An even greater optimization would be for us to only - * call read_folio() on pages which are not in sparse regions as - * determined from the runlist. This would greatly reduce the - * number of pages we read and make dirty in the case of sparse - * files. 
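
A sketch of the per-page advance in the zeroing loop above: reading each page zeroes any sparse ranges, after which the initialized size is bumped one page at a time, clamped so the final page does not overshoot the requested target.

#include <stdint.h>

#define PAGE_SHIFT_SK	12

static int64_t next_initialized_size(uint64_t page_index, int64_t target)
{
	int64_t init = (int64_t)(page_index + 1) << PAGE_SHIFT_SK;

	return init > target ? target : init;
}
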
- */ - balance_dirty_pages_ratelimited(mapping); - cond_resched(); - } while (++index < end_index); - read_lock_irqsave(&ni->size_lock, flags); - BUG_ON(ni->initialized_size != new_init_size); - read_unlock_irqrestore(&ni->size_lock, flags); - /* Now bring in sync the initialized_size in the mft record. */ - m = map_mft_record(base_ni); - if (IS_ERR(m)) { - err = PTR_ERR(m); - m = NULL; - goto init_err_out; - } - ctx = ntfs_attr_get_search_ctx(base_ni, m); - if (unlikely(!ctx)) { - err = -ENOMEM; - goto init_err_out; - } - err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, - CASE_SENSITIVE, 0, NULL, 0, ctx); - if (unlikely(err)) { - if (err == -ENOENT) - err = -EIO; - goto init_err_out; - } - m = ctx->mrec; - a = ctx->attr; - BUG_ON(!a->non_resident); - a->data.non_resident.initialized_size = cpu_to_sle64(new_init_size); -done: - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - if (ctx) - ntfs_attr_put_search_ctx(ctx); - if (m) - unmap_mft_record(base_ni); - ntfs_debug("Done, initialized_size 0x%llx, i_size 0x%llx.", - (unsigned long long)new_init_size, i_size_read(vi)); - return 0; -init_err_out: - write_lock_irqsave(&ni->size_lock, flags); - ni->initialized_size = old_init_size; - write_unlock_irqrestore(&ni->size_lock, flags); -err_out: - if (ctx) - ntfs_attr_put_search_ctx(ctx); - if (m) - unmap_mft_record(base_ni); - ntfs_debug("Failed. Returning error code %i.", err); - return err; -} - -static ssize_t ntfs_prepare_file_for_write(struct kiocb *iocb, - struct iov_iter *from) -{ - loff_t pos; - s64 end, ll; - ssize_t err; - unsigned long flags; - struct file *file = iocb->ki_filp; - struct inode *vi = file_inode(file); - ntfs_inode *ni = NTFS_I(vi); - ntfs_volume *vol = ni->vol; - - ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, pos " - "0x%llx, count 0x%zx.", vi->i_ino, - (unsigned)le32_to_cpu(ni->type), - (unsigned long long)iocb->ki_pos, - iov_iter_count(from)); - err = generic_write_checks(iocb, from); - if (unlikely(err <= 0)) - goto out; - /* - * All checks have passed. Before we start doing any writing we want - * to abort any totally illegal writes. - */ - BUG_ON(NInoMstProtected(ni)); - BUG_ON(ni->type != AT_DATA); - /* If file is encrypted, deny access, just like NT4. */ - if (NInoEncrypted(ni)) { - /* Only $DATA attributes can be encrypted. */ - /* - * Reminder for later: Encrypted files are _always_ - * non-resident so that the content can always be encrypted. - */ - ntfs_debug("Denying write access to encrypted file."); - err = -EACCES; - goto out; - } - if (NInoCompressed(ni)) { - /* Only unnamed $DATA attribute can be compressed. */ - BUG_ON(ni->name_len); - /* - * Reminder for later: If resident, the data is not actually - * compressed. Only on the switch to non-resident does - * compression kick in. This is in contrast to encrypted files - * (see above). - */ - ntfs_error(vi->i_sb, "Writing to compressed files is not " - "implemented yet. Sorry."); - err = -EOPNOTSUPP; - goto out; - } - err = file_remove_privs(file); - if (unlikely(err)) - goto out; - /* - * Our ->update_time method always succeeds thus file_update_time() - * cannot fail either so there is no need to check the return code. - */ - file_update_time(file); - pos = iocb->ki_pos; - /* The first byte after the last cluster being written to. 
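
The rounding computed just below follows the usual (x + mask) & ~mask idiom; a sketch, assuming cluster_size_mask == cluster_size - 1 for a power-of-two cluster size:

#include <stddef.h>
#include <stdint.h>

static uint64_t write_end_cluster_aligned(uint64_t pos, size_t count,
		uint64_t cluster_size_mask)
{
	return (pos + count + cluster_size_mask) & ~cluster_size_mask;
}
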
*/ - end = (pos + iov_iter_count(from) + vol->cluster_size_mask) & - ~(u64)vol->cluster_size_mask; - /* - * If the write goes beyond the allocated size, extend the allocation - * to cover the whole of the write, rounded up to the nearest cluster. - */ - read_lock_irqsave(&ni->size_lock, flags); - ll = ni->allocated_size; - read_unlock_irqrestore(&ni->size_lock, flags); - if (end > ll) { - /* - * Extend the allocation without changing the data size. - * - * Note we ensure the allocation is big enough to at least - * write some data but we do not require the allocation to be - * complete, i.e. it may be partial. - */ - ll = ntfs_attr_extend_allocation(ni, end, -1, pos); - if (likely(ll >= 0)) { - BUG_ON(pos >= ll); - /* If the extension was partial truncate the write. */ - if (end > ll) { - ntfs_debug("Truncating write to inode 0x%lx, " - "attribute type 0x%x, because " - "the allocation was only " - "partially extended.", - vi->i_ino, (unsigned) - le32_to_cpu(ni->type)); - iov_iter_truncate(from, ll - pos); - } - } else { - err = ll; - read_lock_irqsave(&ni->size_lock, flags); - ll = ni->allocated_size; - read_unlock_irqrestore(&ni->size_lock, flags); - /* Perform a partial write if possible or fail. */ - if (pos < ll) { - ntfs_debug("Truncating write to inode 0x%lx " - "attribute type 0x%x, because " - "extending the allocation " - "failed (error %d).", - vi->i_ino, (unsigned) - le32_to_cpu(ni->type), - (int)-err); - iov_iter_truncate(from, ll - pos); - } else { - if (err != -ENOSPC) - ntfs_error(vi->i_sb, "Cannot perform " - "write to inode " - "0x%lx, attribute " - "type 0x%x, because " - "extending the " - "allocation failed " - "(error %ld).", - vi->i_ino, (unsigned) - le32_to_cpu(ni->type), - (long)-err); - else - ntfs_debug("Cannot perform write to " - "inode 0x%lx, " - "attribute type 0x%x, " - "because there is not " - "space left.", - vi->i_ino, (unsigned) - le32_to_cpu(ni->type)); - goto out; - } - } - } - /* - * If the write starts beyond the initialized size, extend it up to the - * beginning of the write and initialize all non-sparse space between - * the old initialized size and the new one. This automatically also - * increments the vfs inode->i_size to keep it above or equal to the - * initialized_size. - */ - read_lock_irqsave(&ni->size_lock, flags); - ll = ni->initialized_size; - read_unlock_irqrestore(&ni->size_lock, flags); - if (pos > ll) { - /* - * Wait for ongoing direct i/o to complete before proceeding. - * New direct i/o cannot start as we hold i_mutex. - */ - inode_dio_wait(vi); - err = ntfs_attr_extend_initialized(ni, pos); - if (unlikely(err < 0)) - ntfs_error(vi->i_sb, "Cannot perform write to inode " - "0x%lx, attribute type 0x%x, because " - "extending the initialized size " - "failed (error %d).", vi->i_ino, - (unsigned)le32_to_cpu(ni->type), - (int)-err); - } -out: - return err; -} - -/** - * __ntfs_grab_cache_pages - obtain a number of locked pages - * @mapping: address space mapping from which to obtain page cache pages - * @index: starting index in @mapping at which to begin obtaining pages - * @nr_pages: number of page cache pages to obtain - * @pages: array of pages in which to return the obtained page cache pages - * @cached_page: allocated but as yet unused page - * - * Obtain @nr_pages locked page cache pages from the mapping @mapping and - * starting at index @index. - * - * If a page is newly created, add it to lru list - * - * Note, the page locks are obtained in ascending page index order. 
- */ -static inline int __ntfs_grab_cache_pages(struct address_space *mapping, - pgoff_t index, const unsigned nr_pages, struct page **pages, - struct page **cached_page) -{ - int err, nr; - - BUG_ON(!nr_pages); - err = nr = 0; - do { - pages[nr] = find_get_page_flags(mapping, index, FGP_LOCK | - FGP_ACCESSED); - if (!pages[nr]) { - if (!*cached_page) { - *cached_page = page_cache_alloc(mapping); - if (unlikely(!*cached_page)) { - err = -ENOMEM; - goto err_out; - } - } - err = add_to_page_cache_lru(*cached_page, mapping, - index, - mapping_gfp_constraint(mapping, GFP_KERNEL)); - if (unlikely(err)) { - if (err == -EEXIST) - continue; - goto err_out; - } - pages[nr] = *cached_page; - *cached_page = NULL; - } - index++; - nr++; - } while (nr < nr_pages); -out: - return err; -err_out: - while (nr > 0) { - unlock_page(pages[--nr]); - put_page(pages[nr]); - } - goto out; -} - -static inline void ntfs_submit_bh_for_read(struct buffer_head *bh) -{ - lock_buffer(bh); - get_bh(bh); - bh->b_end_io = end_buffer_read_sync; - submit_bh(REQ_OP_READ, bh); -} - -/** - * ntfs_prepare_pages_for_non_resident_write - prepare pages for receiving data - * @pages: array of destination pages - * @nr_pages: number of pages in @pages - * @pos: byte position in file at which the write begins - * @bytes: number of bytes to be written - * - * This is called for non-resident attributes from ntfs_file_buffered_write() - * with i_mutex held on the inode (@pages[0]->mapping->host). There are - * @nr_pages pages in @pages which are locked but not kmap()ped. The source - * data has not yet been copied into the @pages. - * - * Need to fill any holes with actual clusters, allocate buffers if necessary, - * ensure all the buffers are mapped, and bring uptodate any buffers that are - * only partially being written to. - * - * If @nr_pages is greater than one, we are guaranteed that the cluster size is - * greater than PAGE_SIZE, that all pages in @pages are entirely inside - * the same cluster and that they are the entirety of that cluster, and that - * the cluster is sparse, i.e. we need to allocate a cluster to fill the hole. - * - * i_size is not to be modified yet. - * - * Return 0 on success or -errno on error. 
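A note on the -EEXIST case in __ntfs_grab_cache_pages() above: between find_get_page_flags() failing and add_to_page_cache_lru() running, another task may insert a page at the same index; the continue simply retries the lookup, which then finds the page the other task added. The same grab-or-insert pattern for a single page, condensed into a sketch (the same calls as above; the original additionally keeps one pre-allocated page in @cached_page across iterations so a lost race does not waste the allocation):

static struct page *grab_one_locked_page(struct address_space *mapping,
		pgoff_t index)
{
	struct page *page;
	int err;

	for (;;) {
		/* Fast path: page already present, returned locked. */
		page = find_get_page_flags(mapping, index,
				FGP_LOCK | FGP_ACCESSED);
		if (page)
			return page;
		page = page_cache_alloc(mapping);
		if (unlikely(!page))
			return NULL;
		err = add_to_page_cache_lru(page, mapping, index,
				mapping_gfp_constraint(mapping, GFP_KERNEL));
		if (likely(!err))
			return page;	/* inserted, comes back locked */
		put_page(page);
		if (err != -EEXIST)
			return NULL;	/* e.g. -ENOMEM */
		/* -EEXIST: a racing insertion won; retry the lookup. */
	}
}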
- */ -static int ntfs_prepare_pages_for_non_resident_write(struct page **pages, - unsigned nr_pages, s64 pos, size_t bytes) -{ - VCN vcn, highest_vcn = 0, cpos, cend, bh_cpos, bh_cend; - LCN lcn; - s64 bh_pos, vcn_len, end, initialized_size; - sector_t lcn_block; - struct folio *folio; - struct inode *vi; - ntfs_inode *ni, *base_ni = NULL; - ntfs_volume *vol; - runlist_element *rl, *rl2; - struct buffer_head *bh, *head, *wait[2], **wait_bh = wait; - ntfs_attr_search_ctx *ctx = NULL; - MFT_RECORD *m = NULL; - ATTR_RECORD *a = NULL; - unsigned long flags; - u32 attr_rec_len = 0; - unsigned blocksize, u; - int err, mp_size; - bool rl_write_locked, was_hole, is_retry; - unsigned char blocksize_bits; - struct { - u8 runlist_merged:1; - u8 mft_attr_mapped:1; - u8 mp_rebuilt:1; - u8 attr_switched:1; - } status = { 0, 0, 0, 0 }; - - BUG_ON(!nr_pages); - BUG_ON(!pages); - BUG_ON(!*pages); - vi = pages[0]->mapping->host; - ni = NTFS_I(vi); - vol = ni->vol; - ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, start page " - "index 0x%lx, nr_pages 0x%x, pos 0x%llx, bytes 0x%zx.", - vi->i_ino, ni->type, pages[0]->index, nr_pages, - (long long)pos, bytes); - blocksize = vol->sb->s_blocksize; - blocksize_bits = vol->sb->s_blocksize_bits; - rl_write_locked = false; - rl = NULL; - err = 0; - vcn = lcn = -1; - vcn_len = 0; - lcn_block = -1; - was_hole = false; - cpos = pos >> vol->cluster_size_bits; - end = pos + bytes; - cend = (end + vol->cluster_size - 1) >> vol->cluster_size_bits; - /* - * Loop over each buffer in each folio. Use goto to - * reduce indentation. - */ - u = 0; -do_next_folio: - folio = page_folio(pages[u]); - bh_pos = folio_pos(folio); - head = folio_buffers(folio); - if (!head) - /* - * create_empty_buffers() will create uptodate/dirty - * buffers if the folio is uptodate/dirty. - */ - head = create_empty_buffers(folio, blocksize, 0); - bh = head; - do { - VCN cdelta; - s64 bh_end; - unsigned bh_cofs; - - /* Clear buffer_new on all buffers to reinitialise state. */ - if (buffer_new(bh)) - clear_buffer_new(bh); - bh_end = bh_pos + blocksize; - bh_cpos = bh_pos >> vol->cluster_size_bits; - bh_cofs = bh_pos & vol->cluster_size_mask; - if (buffer_mapped(bh)) { - /* - * The buffer is already mapped. If it is uptodate, - * ignore it. - */ - if (buffer_uptodate(bh)) - continue; - /* - * The buffer is not uptodate. If the folio is uptodate - * set the buffer uptodate and otherwise ignore it. - */ - if (folio_test_uptodate(folio)) { - set_buffer_uptodate(bh); - continue; - } - /* - * Neither the folio nor the buffer are uptodate. If - * the buffer is only partially being written to, we - * need to read it in before the write, i.e. now. - */ - if ((bh_pos < pos && bh_end > pos) || - (bh_pos < end && bh_end > end)) { - /* - * If the buffer is fully or partially within - * the initialized size, do an actual read. - * Otherwise, simply zero the buffer. - */ - read_lock_irqsave(&ni->size_lock, flags); - initialized_size = ni->initialized_size; - read_unlock_irqrestore(&ni->size_lock, flags); - if (bh_pos < initialized_size) { - ntfs_submit_bh_for_read(bh); - *wait_bh++ = bh; - } else { - folio_zero_range(folio, bh_offset(bh), - blocksize); - set_buffer_uptodate(bh); - } - } - continue; - } - /* Unmapped buffer. Need to map it. */ - bh->b_bdev = vol->sb->s_bdev; - /* - * If the current buffer is in the same clusters as the map - * cache, there is no need to check the runlist again. 
The - * map cache is made up of @vcn, which is the first cached file - * cluster, @vcn_len which is the number of cached file - * clusters, @lcn is the device cluster corresponding to @vcn, - * and @lcn_block is the block number corresponding to @lcn. - */ - cdelta = bh_cpos - vcn; - if (likely(!cdelta || (cdelta > 0 && cdelta < vcn_len))) { -map_buffer_cached: - BUG_ON(lcn < 0); - bh->b_blocknr = lcn_block + - (cdelta << (vol->cluster_size_bits - - blocksize_bits)) + - (bh_cofs >> blocksize_bits); - set_buffer_mapped(bh); - /* - * If the folio is uptodate so is the buffer. If the - * buffer is fully outside the write, we ignore it if - * it was already allocated and we mark it dirty so it - * gets written out if we allocated it. On the other - * hand, if we allocated the buffer but we are not - * marking it dirty we set buffer_new so we can do - * error recovery. - */ - if (folio_test_uptodate(folio)) { - if (!buffer_uptodate(bh)) - set_buffer_uptodate(bh); - if (unlikely(was_hole)) { - /* We allocated the buffer. */ - clean_bdev_bh_alias(bh); - if (bh_end <= pos || bh_pos >= end) - mark_buffer_dirty(bh); - else - set_buffer_new(bh); - } - continue; - } - /* Page is _not_ uptodate. */ - if (likely(!was_hole)) { - /* - * Buffer was already allocated. If it is not - * uptodate and is only partially being written - * to, we need to read it in before the write, - * i.e. now. - */ - if (!buffer_uptodate(bh) && bh_pos < end && - bh_end > pos && - (bh_pos < pos || - bh_end > end)) { - /* - * If the buffer is fully or partially - * within the initialized size, do an - * actual read. Otherwise, simply zero - * the buffer. - */ - read_lock_irqsave(&ni->size_lock, - flags); - initialized_size = ni->initialized_size; - read_unlock_irqrestore(&ni->size_lock, - flags); - if (bh_pos < initialized_size) { - ntfs_submit_bh_for_read(bh); - *wait_bh++ = bh; - } else { - folio_zero_range(folio, - bh_offset(bh), - blocksize); - set_buffer_uptodate(bh); - } - } - continue; - } - /* We allocated the buffer. */ - clean_bdev_bh_alias(bh); - /* - * If the buffer is fully outside the write, zero it, - * set it uptodate, and mark it dirty so it gets - * written out. If it is partially being written to, - * zero region surrounding the write but leave it to - * commit write to do anything else. Finally, if the - * buffer is fully being overwritten, do nothing. - */ - if (bh_end <= pos || bh_pos >= end) { - if (!buffer_uptodate(bh)) { - folio_zero_range(folio, bh_offset(bh), - blocksize); - set_buffer_uptodate(bh); - } - mark_buffer_dirty(bh); - continue; - } - set_buffer_new(bh); - if (!buffer_uptodate(bh) && - (bh_pos < pos || bh_end > end)) { - u8 *kaddr; - unsigned pofs; - - kaddr = kmap_local_folio(folio, 0); - if (bh_pos < pos) { - pofs = bh_pos & ~PAGE_MASK; - memset(kaddr + pofs, 0, pos - bh_pos); - } - if (bh_end > end) { - pofs = end & ~PAGE_MASK; - memset(kaddr + pofs, 0, bh_end - end); - } - kunmap_local(kaddr); - flush_dcache_folio(folio); - } - continue; - } - /* - * Slow path: this is the first buffer in the cluster. If it - * is outside allocated size and is not uptodate, zero it and - * set it uptodate. 
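Concretely, with 4KiB clusters (cluster_size_bits == 12) and 512-byte blocks (blocksize_bits == 9) a cluster holds eight blocks, and the cached-mapping arithmetic above works out as in this sketch (illustrative numbers only):

#include <stdint.h>
#include <assert.h>

int main(void)
{
	const unsigned cluster_size_bits = 12, blocksize_bits = 9;
	int64_t vcn = 100, lcn = 5000;	/* cached file/device clusters */
	int64_t bh_cpos = 102;		/* cluster of the current buffer */
	unsigned bh_cofs = 0x600;	/* byte offset inside the cluster */

	/* First device block of the cached device cluster. */
	int64_t lcn_block = lcn << (cluster_size_bits - blocksize_bits);
	int64_t cdelta = bh_cpos - vcn;	/* clusters into the cached run */

	int64_t b_blocknr = lcn_block +
			(cdelta << (cluster_size_bits - blocksize_bits)) +
			(bh_cofs >> blocksize_bits);

	assert(b_blocknr == 40000 + 16 + 3);	/* 5000*8 + 2*8 + 3 */
	return 0;
}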
- */ - read_lock_irqsave(&ni->size_lock, flags); - initialized_size = ni->allocated_size; - read_unlock_irqrestore(&ni->size_lock, flags); - if (bh_pos > initialized_size) { - if (folio_test_uptodate(folio)) { - if (!buffer_uptodate(bh)) - set_buffer_uptodate(bh); - } else if (!buffer_uptodate(bh)) { - folio_zero_range(folio, bh_offset(bh), - blocksize); - set_buffer_uptodate(bh); - } - continue; - } - is_retry = false; - if (!rl) { - down_read(&ni->runlist.lock); -retry_remap: - rl = ni->runlist.rl; - } - if (likely(rl != NULL)) { - /* Seek to element containing target cluster. */ - while (rl->length && rl[1].vcn <= bh_cpos) - rl++; - lcn = ntfs_rl_vcn_to_lcn(rl, bh_cpos); - if (likely(lcn >= 0)) { - /* - * Successful remap, setup the map cache and - * use that to deal with the buffer. - */ - was_hole = false; - vcn = bh_cpos; - vcn_len = rl[1].vcn - vcn; - lcn_block = lcn << (vol->cluster_size_bits - - blocksize_bits); - cdelta = 0; - /* - * If the number of remaining clusters touched - * by the write is smaller or equal to the - * number of cached clusters, unlock the - * runlist as the map cache will be used from - * now on. - */ - if (likely(vcn + vcn_len >= cend)) { - if (rl_write_locked) { - up_write(&ni->runlist.lock); - rl_write_locked = false; - } else - up_read(&ni->runlist.lock); - rl = NULL; - } - goto map_buffer_cached; - } - } else - lcn = LCN_RL_NOT_MAPPED; - /* - * If it is not a hole and not out of bounds, the runlist is - * probably unmapped so try to map it now. - */ - if (unlikely(lcn != LCN_HOLE && lcn != LCN_ENOENT)) { - if (likely(!is_retry && lcn == LCN_RL_NOT_MAPPED)) { - /* Attempt to map runlist. */ - if (!rl_write_locked) { - /* - * We need the runlist locked for - * writing, so if it is locked for - * reading relock it now and retry in - * case it changed whilst we dropped - * the lock. - */ - up_read(&ni->runlist.lock); - down_write(&ni->runlist.lock); - rl_write_locked = true; - goto retry_remap; - } - err = ntfs_map_runlist_nolock(ni, bh_cpos, - NULL); - if (likely(!err)) { - is_retry = true; - goto retry_remap; - } - /* - * If @vcn is out of bounds, pretend @lcn is - * LCN_ENOENT. As long as the buffer is out - * of bounds this will work fine. - */ - if (err == -ENOENT) { - lcn = LCN_ENOENT; - err = 0; - goto rl_not_mapped_enoent; - } - } else - err = -EIO; - /* Failed to map the buffer, even after retrying. */ - bh->b_blocknr = -1; - ntfs_error(vol->sb, "Failed to write to inode 0x%lx, " - "attribute type 0x%x, vcn 0x%llx, " - "vcn offset 0x%x, because its " - "location on disk could not be " - "determined%s (error code %i).", - ni->mft_no, ni->type, - (unsigned long long)bh_cpos, - (unsigned)bh_pos & - vol->cluster_size_mask, - is_retry ? " even after retrying" : "", - err); - break; - } -rl_not_mapped_enoent: - /* - * The buffer is in a hole or out of bounds. We need to fill - * the hole, unless the buffer is in a cluster which is not - * touched by the write, in which case we just leave the buffer - * unmapped. This can only happen when the cluster size is - * less than the page cache size. - */ - if (unlikely(vol->cluster_size < PAGE_SIZE)) { - bh_cend = (bh_end + vol->cluster_size - 1) >> - vol->cluster_size_bits; - if ((bh_cend <= cpos || bh_cpos >= cend)) { - bh->b_blocknr = -1; - /* - * If the buffer is uptodate we skip it. If it - * is not but the folio is uptodate, we can set - * the buffer uptodate. If the folio is not - * uptodate, we can clear the buffer and set it - * uptodate. 
Whether this is worthwhile is - * debatable and this could be removed. - */ - if (folio_test_uptodate(folio)) { - if (!buffer_uptodate(bh)) - set_buffer_uptodate(bh); - } else if (!buffer_uptodate(bh)) { - folio_zero_range(folio, bh_offset(bh), - blocksize); - set_buffer_uptodate(bh); - } - continue; - } - } - /* - * Out of bounds buffer is invalid if it was not really out of - * bounds. - */ - BUG_ON(lcn != LCN_HOLE); - /* - * We need the runlist locked for writing, so if it is locked - * for reading relock it now and retry in case it changed - * whilst we dropped the lock. - */ - BUG_ON(!rl); - if (!rl_write_locked) { - up_read(&ni->runlist.lock); - down_write(&ni->runlist.lock); - rl_write_locked = true; - goto retry_remap; - } - /* Find the previous last allocated cluster. */ - BUG_ON(rl->lcn != LCN_HOLE); - lcn = -1; - rl2 = rl; - while (--rl2 >= ni->runlist.rl) { - if (rl2->lcn >= 0) { - lcn = rl2->lcn + rl2->length; - break; - } - } - rl2 = ntfs_cluster_alloc(vol, bh_cpos, 1, lcn, DATA_ZONE, - false); - if (IS_ERR(rl2)) { - err = PTR_ERR(rl2); - ntfs_debug("Failed to allocate cluster, error code %i.", - err); - break; - } - lcn = rl2->lcn; - rl = ntfs_runlists_merge(ni->runlist.rl, rl2); - if (IS_ERR(rl)) { - err = PTR_ERR(rl); - if (err != -ENOMEM) - err = -EIO; - if (ntfs_cluster_free_from_rl(vol, rl2)) { - ntfs_error(vol->sb, "Failed to release " - "allocated cluster in error " - "code path. Run chkdsk to " - "recover the lost cluster."); - NVolSetErrors(vol); - } - ntfs_free(rl2); - break; - } - ni->runlist.rl = rl; - status.runlist_merged = 1; - ntfs_debug("Allocated cluster, lcn 0x%llx.", - (unsigned long long)lcn); - /* Map and lock the mft record and get the attribute record. */ - if (!NInoAttr(ni)) - base_ni = ni; - else - base_ni = ni->ext.base_ntfs_ino; - m = map_mft_record(base_ni); - if (IS_ERR(m)) { - err = PTR_ERR(m); - break; - } - ctx = ntfs_attr_get_search_ctx(base_ni, m); - if (unlikely(!ctx)) { - err = -ENOMEM; - unmap_mft_record(base_ni); - break; - } - status.mft_attr_mapped = 1; - err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, - CASE_SENSITIVE, bh_cpos, NULL, 0, ctx); - if (unlikely(err)) { - if (err == -ENOENT) - err = -EIO; - break; - } - m = ctx->mrec; - a = ctx->attr; - /* - * Find the runlist element with which the attribute extent - * starts. Note, we cannot use the _attr_ version because we - * have mapped the mft record. That is ok because we know the - * runlist fragment must be mapped already to have ever gotten - * here, so we can just use the _rl_ version. - */ - vcn = sle64_to_cpu(a->data.non_resident.lowest_vcn); - rl2 = ntfs_rl_find_vcn_nolock(rl, vcn); - BUG_ON(!rl2); - BUG_ON(!rl2->length); - BUG_ON(rl2->lcn < LCN_HOLE); - highest_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn); - /* - * If @highest_vcn is zero, calculate the real highest_vcn - * (which can really be zero). - */ - if (!highest_vcn) - highest_vcn = (sle64_to_cpu( - a->data.non_resident.allocated_size) >> - vol->cluster_size_bits) - 1; - /* - * Determine the size of the mapping pairs array for the new - * extent, i.e. the old extent with the hole filled. - */ - mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, vcn, - highest_vcn); - if (unlikely(mp_size <= 0)) { - if (!(err = mp_size)) - err = -EIO; - ntfs_debug("Failed to get size for mapping pairs " - "array, error code %i.", err); - break; - } - /* - * Resize the attribute record to fit the new mapping pairs - * array. 
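For reference, the runlist being manipulated throughout this function is an array of (vcn, lcn, length) extents terminated by a zero-length element, with negative lcn values encoding special cases such as holes. A simplified vcn-to-lcn lookup in the spirit of ntfs_rl_vcn_to_lcn() (the element layout and sentinel values here are illustrative assumptions, not the driver's definitions):

#include <stdint.h>

#define LCN_HOLE	(-1)	/* illustrative sentinel values only */
#define LCN_ENOENT	(-2)

struct rl_elem { int64_t vcn, lcn, length; };

/* Map a file cluster (vcn) to a device cluster (lcn). */
static int64_t rl_vcn_to_lcn(const struct rl_elem *rl, int64_t vcn)
{
	for (; rl->length; rl++) {
		if (vcn >= rl->vcn && vcn < rl->vcn + rl->length) {
			if (rl->lcn < 0)
				return rl->lcn;	/* e.g. LCN_HOLE */
			return rl->lcn + (vcn - rl->vcn);
		}
	}
	return LCN_ENOENT;	/* past the end of the mapped runs */
}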
- */ - attr_rec_len = le32_to_cpu(a->length); - err = ntfs_attr_record_resize(m, a, mp_size + le16_to_cpu( - a->data.non_resident.mapping_pairs_offset)); - if (unlikely(err)) { - BUG_ON(err != -ENOSPC); - // TODO: Deal with this by using the current attribute - // and fill it with as much of the mapping pairs - // array as possible. Then loop over each attribute - // extent rewriting the mapping pairs arrays as we go - // along and if when we reach the end we have not - // enough space, try to resize the last attribute - // extent and if even that fails, add a new attribute - // extent. - // We could also try to resize at each step in the hope - // that we will not need to rewrite every single extent. - // Note, we may need to decompress some extents to fill - // the runlist as we are walking the extents... - ntfs_error(vol->sb, "Not enough space in the mft " - "record for the extended attribute " - "record. This case is not " - "implemented yet."); - err = -EOPNOTSUPP; - break ; - } - status.mp_rebuilt = 1; - /* - * Generate the mapping pairs array directly into the attribute - * record. - */ - err = ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu( - a->data.non_resident.mapping_pairs_offset), - mp_size, rl2, vcn, highest_vcn, NULL); - if (unlikely(err)) { - ntfs_error(vol->sb, "Cannot fill hole in inode 0x%lx, " - "attribute type 0x%x, because building " - "the mapping pairs failed with error " - "code %i.", vi->i_ino, - (unsigned)le32_to_cpu(ni->type), err); - err = -EIO; - break; - } - /* Update the highest_vcn but only if it was not set. */ - if (unlikely(!a->data.non_resident.highest_vcn)) - a->data.non_resident.highest_vcn = - cpu_to_sle64(highest_vcn); - /* - * If the attribute is sparse/compressed, update the compressed - * size in the ntfs_inode structure and the attribute record. - */ - if (likely(NInoSparse(ni) || NInoCompressed(ni))) { - /* - * If we are not in the first attribute extent, switch - * to it, but first ensure the changes will make it to - * disk later. - */ - if (a->data.non_resident.lowest_vcn) { - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - ntfs_attr_reinit_search_ctx(ctx); - err = ntfs_attr_lookup(ni->type, ni->name, - ni->name_len, CASE_SENSITIVE, - 0, NULL, 0, ctx); - if (unlikely(err)) { - status.attr_switched = 1; - break; - } - /* @m is not used any more so do not set it. */ - a = ctx->attr; - } - write_lock_irqsave(&ni->size_lock, flags); - ni->itype.compressed.size += vol->cluster_size; - a->data.non_resident.compressed_size = - cpu_to_sle64(ni->itype.compressed.size); - write_unlock_irqrestore(&ni->size_lock, flags); - } - /* Ensure the changes make it to disk. */ - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(base_ni); - /* Successfully filled the hole. */ - status.runlist_merged = 0; - status.mft_attr_mapped = 0; - status.mp_rebuilt = 0; - /* Setup the map cache and use that to deal with the buffer. */ - was_hole = true; - vcn = bh_cpos; - vcn_len = 1; - lcn_block = lcn << (vol->cluster_size_bits - blocksize_bits); - cdelta = 0; - /* - * If the number of remaining clusters in the @pages is smaller - * or equal to the number of cached clusters, unlock the - * runlist as the map cache will be used from now on. 
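The status bitfield declared at the top of this function (runlist_merged, mft_attr_mapped, mp_rebuilt, attr_switched) drives staged error recovery: each flag records one side effect that the error path further down must undo, and the flags are cleared again once the hole has been filled successfully, as just above. The bare shape of the idiom, as a self-contained sketch:

#include <stdio.h>

struct undo_status { unsigned a_done:1, b_done:1; };

static int stage_a(void) { return 0; }	/* stand-ins for real work */
static int stage_b(void) { return -1; }	/* fails, forcing rollback */
static void undo_a(void) { puts("undo a"); }

static int do_staged_work(void)
{
	struct undo_status status = { 0, 0 };
	int err;

	err = stage_a();
	if (err)
		goto out;
	status.a_done = 1;

	err = stage_b();
	if (err)
		goto out;
	status.b_done = 1;
out:
	/* Unwind only the stages that actually completed. */
	if (err && status.a_done)
		undo_a();
	return err;
}

int main(void) { return do_staged_work() ? 1 : 0; }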
- */ - if (likely(vcn + vcn_len >= cend)) { - up_write(&ni->runlist.lock); - rl_write_locked = false; - rl = NULL; - } - goto map_buffer_cached; - } while (bh_pos += blocksize, (bh = bh->b_this_page) != head); - /* If there are no errors, do the next page. */ - if (likely(!err && ++u < nr_pages)) - goto do_next_folio; - /* If there are no errors, release the runlist lock if we took it. */ - if (likely(!err)) { - if (unlikely(rl_write_locked)) { - up_write(&ni->runlist.lock); - rl_write_locked = false; - } else if (unlikely(rl)) - up_read(&ni->runlist.lock); - rl = NULL; - } - /* If we issued read requests, let them complete. */ - read_lock_irqsave(&ni->size_lock, flags); - initialized_size = ni->initialized_size; - read_unlock_irqrestore(&ni->size_lock, flags); - while (wait_bh > wait) { - bh = *--wait_bh; - wait_on_buffer(bh); - if (likely(buffer_uptodate(bh))) { - folio = bh->b_folio; - bh_pos = folio_pos(folio) + bh_offset(bh); - /* - * If the buffer overflows the initialized size, need - * to zero the overflowing region. - */ - if (unlikely(bh_pos + blocksize > initialized_size)) { - int ofs = 0; - - if (likely(bh_pos < initialized_size)) - ofs = initialized_size - bh_pos; - folio_zero_segment(folio, bh_offset(bh) + ofs, - blocksize); - } - } else /* if (unlikely(!buffer_uptodate(bh))) */ - err = -EIO; - } - if (likely(!err)) { - /* Clear buffer_new on all buffers. */ - u = 0; - do { - bh = head = page_buffers(pages[u]); - do { - if (buffer_new(bh)) - clear_buffer_new(bh); - } while ((bh = bh->b_this_page) != head); - } while (++u < nr_pages); - ntfs_debug("Done."); - return err; - } - if (status.attr_switched) { - /* Get back to the attribute extent we modified. */ - ntfs_attr_reinit_search_ctx(ctx); - if (ntfs_attr_lookup(ni->type, ni->name, ni->name_len, - CASE_SENSITIVE, bh_cpos, NULL, 0, ctx)) { - ntfs_error(vol->sb, "Failed to find required " - "attribute extent of attribute in " - "error code path. Run chkdsk to " - "recover."); - write_lock_irqsave(&ni->size_lock, flags); - ni->itype.compressed.size += vol->cluster_size; - write_unlock_irqrestore(&ni->size_lock, flags); - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - /* - * The only thing that is now wrong is the compressed - * size of the base attribute extent which chkdsk - * should be able to fix. - */ - NVolSetErrors(vol); - } else { - m = ctx->mrec; - a = ctx->attr; - status.attr_switched = 0; - } - } - /* - * If the runlist has been modified, need to restore it by punching a - * hole into it and we then need to deallocate the on-disk cluster as - * well. Note, we only modify the runlist if we are able to generate a - * new mapping pairs array, i.e. only when the mapped attribute extent - * is not switched. - */ - if (status.runlist_merged && !status.attr_switched) { - BUG_ON(!rl_write_locked); - /* Make the file cluster we allocated sparse in the runlist. */ - if (ntfs_rl_punch_nolock(vol, &ni->runlist, bh_cpos, 1)) { - ntfs_error(vol->sb, "Failed to punch hole into " - "attribute runlist in error code " - "path. Run chkdsk to recover the " - "lost cluster."); - NVolSetErrors(vol); - } else /* if (success) */ { - status.runlist_merged = 0; - /* - * Deallocate the on-disk cluster we allocated but only - * if we succeeded in punching its vcn out of the - * runlist. - */ - down_write(&vol->lcnbmp_lock); - if (ntfs_bitmap_clear_bit(vol->lcnbmp_ino, lcn)) { - ntfs_error(vol->sb, "Failed to release " - "allocated cluster in error " - "code path. 
Run chkdsk to " - "recover the lost cluster."); - NVolSetErrors(vol); - } - up_write(&vol->lcnbmp_lock); - } - } - /* - * Resize the attribute record to its old size and rebuild the mapping - * pairs array. Note, we only can do this if the runlist has been - * restored to its old state which also implies that the mapped - * attribute extent is not switched. - */ - if (status.mp_rebuilt && !status.runlist_merged) { - if (ntfs_attr_record_resize(m, a, attr_rec_len)) { - ntfs_error(vol->sb, "Failed to restore attribute " - "record in error code path. Run " - "chkdsk to recover."); - NVolSetErrors(vol); - } else /* if (success) */ { - if (ntfs_mapping_pairs_build(vol, (u8*)a + - le16_to_cpu(a->data.non_resident. - mapping_pairs_offset), attr_rec_len - - le16_to_cpu(a->data.non_resident. - mapping_pairs_offset), ni->runlist.rl, - vcn, highest_vcn, NULL)) { - ntfs_error(vol->sb, "Failed to restore " - "mapping pairs array in error " - "code path. Run chkdsk to " - "recover."); - NVolSetErrors(vol); - } - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - } - } - /* Release the mft record and the attribute. */ - if (status.mft_attr_mapped) { - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(base_ni); - } - /* Release the runlist lock. */ - if (rl_write_locked) - up_write(&ni->runlist.lock); - else if (rl) - up_read(&ni->runlist.lock); - /* - * Zero out any newly allocated blocks to avoid exposing stale data. - * If BH_New is set, we know that the block was newly allocated above - * and that it has not been fully zeroed and marked dirty yet. - */ - nr_pages = u; - u = 0; - end = bh_cpos << vol->cluster_size_bits; - do { - folio = page_folio(pages[u]); - bh = head = folio_buffers(folio); - do { - if (u == nr_pages && - folio_pos(folio) + bh_offset(bh) >= end) - break; - if (!buffer_new(bh)) - continue; - clear_buffer_new(bh); - if (!buffer_uptodate(bh)) { - if (folio_test_uptodate(folio)) - set_buffer_uptodate(bh); - else { - folio_zero_range(folio, bh_offset(bh), - blocksize); - set_buffer_uptodate(bh); - } - } - mark_buffer_dirty(bh); - } while ((bh = bh->b_this_page) != head); - } while (++u <= nr_pages); - ntfs_error(vol->sb, "Failed. Returning error code %i.", err); - return err; -} - -static inline void ntfs_flush_dcache_pages(struct page **pages, - unsigned nr_pages) -{ - BUG_ON(!nr_pages); - /* - * Warning: Do not do the decrement at the same time as the call to - * flush_dcache_page() because it is a NULL macro on i386 and hence the - * decrement never happens so the loop never terminates. - */ - do { - --nr_pages; - flush_dcache_page(pages[nr_pages]); - } while (nr_pages > 0); -} - -/** - * ntfs_commit_pages_after_non_resident_write - commit the received data - * @pages: array of destination pages - * @nr_pages: number of pages in @pages - * @pos: byte position in file at which the write begins - * @bytes: number of bytes to be written - * - * See description of ntfs_commit_pages_after_write(), below. 
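The warning comment in ntfs_flush_dcache_pages() above deserves a concrete demonstration: when flush_dcache_page() is defined as an empty macro (as it historically was on i386), its argument expression is never evaluated, so a side effect such as --nr_pages inside the argument silently vanishes. A standalone illustration:

#include <assert.h>

/* Stand-in for an arch where flush_dcache_page() is an empty macro. */
#define flush_dcache_page(page) do { } while (0)

int main(void)
{
	int pages[4] = { 0, 1, 2, 3 };
	unsigned nr_pages = 4;

	(void)pages;
	/* BROKEN: the macro discards its argument unevaluated, so the
	 * decrement never happens and a loop guarded by nr_pages > 0
	 * would never terminate. */
	flush_dcache_page(pages[--nr_pages]);
	assert(nr_pages == 4);		/* the decrement was swallowed */

	/* CORRECT: decrement first, as the function above does. */
	--nr_pages;
	flush_dcache_page(pages[nr_pages]);
	assert(nr_pages == 3);
	return 0;
}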
- */ -static inline int ntfs_commit_pages_after_non_resident_write( - struct page **pages, const unsigned nr_pages, - s64 pos, size_t bytes) -{ - s64 end, initialized_size; - struct inode *vi; - ntfs_inode *ni, *base_ni; - struct buffer_head *bh, *head; - ntfs_attr_search_ctx *ctx; - MFT_RECORD *m; - ATTR_RECORD *a; - unsigned long flags; - unsigned blocksize, u; - int err; - - vi = pages[0]->mapping->host; - ni = NTFS_I(vi); - blocksize = vi->i_sb->s_blocksize; - end = pos + bytes; - u = 0; - do { - s64 bh_pos; - struct page *page; - bool partial; - - page = pages[u]; - bh_pos = (s64)page->index << PAGE_SHIFT; - bh = head = page_buffers(page); - partial = false; - do { - s64 bh_end; - - bh_end = bh_pos + blocksize; - if (bh_end <= pos || bh_pos >= end) { - if (!buffer_uptodate(bh)) - partial = true; - } else { - set_buffer_uptodate(bh); - mark_buffer_dirty(bh); - } - } while (bh_pos += blocksize, (bh = bh->b_this_page) != head); - /* - * If all buffers are now uptodate but the page is not, set the - * page uptodate. - */ - if (!partial && !PageUptodate(page)) - SetPageUptodate(page); - } while (++u < nr_pages); - /* - * Finally, if we do not need to update initialized_size or i_size we - * are finished. - */ - read_lock_irqsave(&ni->size_lock, flags); - initialized_size = ni->initialized_size; - read_unlock_irqrestore(&ni->size_lock, flags); - if (end <= initialized_size) { - ntfs_debug("Done."); - return 0; - } - /* - * Update initialized_size/i_size as appropriate, both in the inode and - * the mft record. - */ - if (!NInoAttr(ni)) - base_ni = ni; - else - base_ni = ni->ext.base_ntfs_ino; - /* Map, pin, and lock the mft record. */ - m = map_mft_record(base_ni); - if (IS_ERR(m)) { - err = PTR_ERR(m); - m = NULL; - ctx = NULL; - goto err_out; - } - BUG_ON(!NInoNonResident(ni)); - ctx = ntfs_attr_get_search_ctx(base_ni, m); - if (unlikely(!ctx)) { - err = -ENOMEM; - goto err_out; - } - err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, - CASE_SENSITIVE, 0, NULL, 0, ctx); - if (unlikely(err)) { - if (err == -ENOENT) - err = -EIO; - goto err_out; - } - a = ctx->attr; - BUG_ON(!a->non_resident); - write_lock_irqsave(&ni->size_lock, flags); - BUG_ON(end > ni->allocated_size); - ni->initialized_size = end; - a->data.non_resident.initialized_size = cpu_to_sle64(end); - if (end > i_size_read(vi)) { - i_size_write(vi, end); - a->data.non_resident.data_size = - a->data.non_resident.initialized_size; - } - write_unlock_irqrestore(&ni->size_lock, flags); - /* Mark the mft record dirty, so it gets written back. */ - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(base_ni); - ntfs_debug("Done."); - return 0; -err_out: - if (ctx) - ntfs_attr_put_search_ctx(ctx); - if (m) - unmap_mft_record(base_ni); - ntfs_error(vi->i_sb, "Failed to update initialized_size/i_size (error " - "code %i).", err); - if (err != -ENOMEM) - NVolSetErrors(ni->vol); - return err; -} - -/** - * ntfs_commit_pages_after_write - commit the received data - * @pages: array of destination pages - * @nr_pages: number of pages in @pages - * @pos: byte position in file at which the write begins - * @bytes: number of bytes to be written - * - * This is called from ntfs_file_buffered_write() with i_mutex held on the inode - * (@pages[0]->mapping->host). There are @nr_pages pages in @pages which are - * locked but not kmap()ped. The source data has already been copied into the - * @page. 
ntfs_prepare_pages_for_non_resident_write() has been called before - * the data was copied (for non-resident attributes only) and it returned - * success. - * - * Need to set uptodate and mark dirty all buffers within the boundary of the - * write. If all buffers in a page are uptodate we set the page uptodate, too. - * - * Setting the buffers dirty ensures that they get written out later when - * ntfs_writepage() is invoked by the VM. - * - * Finally, we need to update i_size and initialized_size as appropriate both - * in the inode and the mft record. - * - * This is modelled after fs/buffer.c::generic_commit_write(), which marks - * buffers uptodate and dirty, sets the page uptodate if all buffers in the - * page are uptodate, and updates i_size if the end of io is beyond i_size. In - * that case, it also marks the inode dirty. - * - * If things have gone as outlined in - * ntfs_prepare_pages_for_non_resident_write(), we do not need to do any page - * content modifications here for non-resident attributes. For resident - * attributes we need to do the uptodate bringing here which we combine with - * the copying into the mft record which means we save one atomic kmap. - * - * Return 0 on success or -errno on error. - */ -static int ntfs_commit_pages_after_write(struct page **pages, - const unsigned nr_pages, s64 pos, size_t bytes) -{ - s64 end, initialized_size; - loff_t i_size; - struct inode *vi; - ntfs_inode *ni, *base_ni; - struct page *page; - ntfs_attr_search_ctx *ctx; - MFT_RECORD *m; - ATTR_RECORD *a; - char *kattr, *kaddr; - unsigned long flags; - u32 attr_len; - int err; - - BUG_ON(!nr_pages); - BUG_ON(!pages); - page = pages[0]; - BUG_ON(!page); - vi = page->mapping->host; - ni = NTFS_I(vi); - ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, start page " - "index 0x%lx, nr_pages 0x%x, pos 0x%llx, bytes 0x%zx.", - vi->i_ino, ni->type, page->index, nr_pages, - (long long)pos, bytes); - if (NInoNonResident(ni)) - return ntfs_commit_pages_after_non_resident_write(pages, - nr_pages, pos, bytes); - BUG_ON(nr_pages > 1); - /* - * Attribute is resident, implying it is not compressed, encrypted, or - * sparse. - */ - if (!NInoAttr(ni)) - base_ni = ni; - else - base_ni = ni->ext.base_ntfs_ino; - BUG_ON(NInoNonResident(ni)); - /* Map, pin, and lock the mft record. */ - m = map_mft_record(base_ni); - if (IS_ERR(m)) { - err = PTR_ERR(m); - m = NULL; - ctx = NULL; - goto err_out; - } - ctx = ntfs_attr_get_search_ctx(base_ni, m); - if (unlikely(!ctx)) { - err = -ENOMEM; - goto err_out; - } - err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, - CASE_SENSITIVE, 0, NULL, 0, ctx); - if (unlikely(err)) { - if (err == -ENOENT) - err = -EIO; - goto err_out; - } - a = ctx->attr; - BUG_ON(a->non_resident); - /* The total length of the attribute value. */ - attr_len = le32_to_cpu(a->data.resident.value_length); - i_size = i_size_read(vi); - BUG_ON(attr_len != i_size); - BUG_ON(pos > attr_len); - end = pos + bytes; - BUG_ON(end > le32_to_cpu(a->length) - - le16_to_cpu(a->data.resident.value_offset)); - kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset); - kaddr = kmap_atomic(page); - /* Copy the received data from the page to the mft record. */ - memcpy(kattr + pos, kaddr + pos, bytes); - /* Update the attribute length if necessary. */ - if (end > attr_len) { - attr_len = end; - a->data.resident.value_length = cpu_to_le32(attr_len); - } - /* - * If the page is not uptodate, bring the out of bounds area(s) - * uptodate by copying data from the mft record to the page. 
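Since the attribute is resident, the mft record itself is the backing store, so a page that is not yet uptodate can be completed from it: copy the regions before and after the write from the attribute value and zero everything beyond attr_len. The same stitching on plain buffers, as a worked standalone sketch:

#include <assert.h>
#include <string.h>

#define PAGE_SIZE 4096

int main(void)
{
	char kattr[64] = "resident attribute value";	/* mft-backed data */
	char kaddr[PAGE_SIZE];				/* page contents */
	size_t pos = 4, bytes = 5, attr_len = 24;
	size_t end = pos + bytes;

	memset(kaddr, '?', sizeof(kaddr));		/* "not uptodate" */
	memcpy(kaddr + pos, "DENTX", bytes);		/* the new write */

	/* Bring the regions outside the write uptodate from the record
	 * and zero the area beyond the attribute value, exactly as the
	 * code below does for the real page. */
	memcpy(kaddr, kattr, pos);
	memcpy(kaddr + end, kattr + end, attr_len - end);
	memset(kaddr + attr_len, 0, PAGE_SIZE - attr_len);

	assert(!memcmp(kaddr, "resiDENTXattribute value", attr_len));
	return 0;
}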
- */ - if (!PageUptodate(page)) { - if (pos > 0) - memcpy(kaddr, kattr, pos); - if (end < attr_len) - memcpy(kaddr + end, kattr + end, attr_len - end); - /* Zero the region outside the end of the attribute value. */ - memset(kaddr + attr_len, 0, PAGE_SIZE - attr_len); - flush_dcache_page(page); - SetPageUptodate(page); - } - kunmap_atomic(kaddr); - /* Update initialized_size/i_size if necessary. */ - read_lock_irqsave(&ni->size_lock, flags); - initialized_size = ni->initialized_size; - BUG_ON(end > ni->allocated_size); - read_unlock_irqrestore(&ni->size_lock, flags); - BUG_ON(initialized_size != i_size); - if (end > initialized_size) { - write_lock_irqsave(&ni->size_lock, flags); - ni->initialized_size = end; - i_size_write(vi, end); - write_unlock_irqrestore(&ni->size_lock, flags); - } - /* Mark the mft record dirty, so it gets written back. */ - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(base_ni); - ntfs_debug("Done."); - return 0; -err_out: - if (err == -ENOMEM) { - ntfs_warning(vi->i_sb, "Error allocating memory required to " - "commit the write."); - if (PageUptodate(page)) { - ntfs_warning(vi->i_sb, "Page is uptodate, setting " - "dirty so the write will be retried " - "later on by the VM."); - /* - * Put the page on mapping->dirty_pages, but leave its - * buffers' dirty state as-is. - */ - __set_page_dirty_nobuffers(page); - err = 0; - } else - ntfs_error(vi->i_sb, "Page is not uptodate. Written " - "data has been lost."); - } else { - ntfs_error(vi->i_sb, "Resident attribute commit write failed " - "with error %i.", err); - NVolSetErrors(ni->vol); - } - if (ctx) - ntfs_attr_put_search_ctx(ctx); - if (m) - unmap_mft_record(base_ni); - return err; -} - -/* - * Copy as much as we can into the pages and return the number of bytes which - * were successfully copied. If a fault is encountered then clear the pages - * out to (ofs + bytes) and return the number of bytes which were copied. - */ -static size_t ntfs_copy_from_user_iter(struct page **pages, unsigned nr_pages, - unsigned ofs, struct iov_iter *i, size_t bytes) -{ - struct page **last_page = pages + nr_pages; - size_t total = 0; - unsigned len, copied; - - do { - len = PAGE_SIZE - ofs; - if (len > bytes) - len = bytes; - copied = copy_page_from_iter_atomic(*pages, ofs, len, i); - total += copied; - bytes -= copied; - if (!bytes) - break; - if (copied < len) - goto err; - ofs = 0; - } while (++pages < last_page); -out: - return total; -err: - /* Zero the rest of the target like __copy_from_user(). 
*/ - len = PAGE_SIZE - copied; - do { - if (len > bytes) - len = bytes; - zero_user(*pages, copied, len); - bytes -= len; - copied = 0; - len = PAGE_SIZE; - } while (++pages < last_page); - goto out; -} - -/** - * ntfs_perform_write - perform buffered write to a file - * @file: file to write to - * @i: iov_iter with data to write - * @pos: byte offset in file at which to begin writing to - */ -static ssize_t ntfs_perform_write(struct file *file, struct iov_iter *i, - loff_t pos) -{ - struct address_space *mapping = file->f_mapping; - struct inode *vi = mapping->host; - ntfs_inode *ni = NTFS_I(vi); - ntfs_volume *vol = ni->vol; - struct page *pages[NTFS_MAX_PAGES_PER_CLUSTER]; - struct page *cached_page = NULL; - VCN last_vcn; - LCN lcn; - size_t bytes; - ssize_t status, written = 0; - unsigned nr_pages; - - ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, pos " - "0x%llx, count 0x%lx.", vi->i_ino, - (unsigned)le32_to_cpu(ni->type), - (unsigned long long)pos, - (unsigned long)iov_iter_count(i)); - /* - * If a previous ntfs_truncate() failed, repeat it and abort if it - * fails again. - */ - if (unlikely(NInoTruncateFailed(ni))) { - int err; - - inode_dio_wait(vi); - err = ntfs_truncate(vi); - if (err || NInoTruncateFailed(ni)) { - if (!err) - err = -EIO; - ntfs_error(vol->sb, "Cannot perform write to inode " - "0x%lx, attribute type 0x%x, because " - "ntfs_truncate() failed (error code " - "%i).", vi->i_ino, - (unsigned)le32_to_cpu(ni->type), err); - return err; - } - } - /* - * Determine the number of pages per cluster for non-resident - * attributes. - */ - nr_pages = 1; - if (vol->cluster_size > PAGE_SIZE && NInoNonResident(ni)) - nr_pages = vol->cluster_size >> PAGE_SHIFT; - last_vcn = -1; - do { - VCN vcn; - pgoff_t start_idx; - unsigned ofs, do_pages, u; - size_t copied; - - start_idx = pos >> PAGE_SHIFT; - ofs = pos & ~PAGE_MASK; - bytes = PAGE_SIZE - ofs; - do_pages = 1; - if (nr_pages > 1) { - vcn = pos >> vol->cluster_size_bits; - if (vcn != last_vcn) { - last_vcn = vcn; - /* - * Get the lcn of the vcn the write is in. If - * it is a hole, need to lock down all pages in - * the cluster. - */ - down_read(&ni->runlist.lock); - lcn = ntfs_attr_vcn_to_lcn_nolock(ni, pos >> - vol->cluster_size_bits, false); - up_read(&ni->runlist.lock); - if (unlikely(lcn < LCN_HOLE)) { - if (lcn == LCN_ENOMEM) - status = -ENOMEM; - else { - status = -EIO; - ntfs_error(vol->sb, "Cannot " - "perform write to " - "inode 0x%lx, " - "attribute type 0x%x, " - "because the attribute " - "is corrupt.", - vi->i_ino, (unsigned) - le32_to_cpu(ni->type)); - } - break; - } - if (lcn == LCN_HOLE) { - start_idx = (pos & ~(s64) - vol->cluster_size_mask) - >> PAGE_SHIFT; - bytes = vol->cluster_size - (pos & - vol->cluster_size_mask); - do_pages = nr_pages; - } - } - } - if (bytes > iov_iter_count(i)) - bytes = iov_iter_count(i); -again: - /* - * Bring in the user page(s) that we will copy from _first_. - * Otherwise there is a nasty deadlock on copying from the same - * page(s) as we are writing to, without it/them being marked - * up-to-date. Note, at present there is nothing to stop the - * pages being swapped out between us bringing them into memory - * and doing the actual copying. - */ - if (unlikely(fault_in_iov_iter_readable(i, bytes))) { - status = -EFAULT; - break; - } - /* Get and lock @do_pages starting at index @start_idx. 
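The pre-faulting above, combined with the short-copy retry further down, is the standard buffered-write protocol (compare generic_perform_write()): fault the source pages in with no page locks held, lock the target pages, copy with page faults disabled, and on a short copy revert the iterator and retry with a smaller chunk. Its outline, stubbed so it stands alone:

#include <stddef.h>

/* Stubs standing in for iterator fault-in, copy, and revert. */
static int fault_in_source(size_t bytes) { (void)bytes; return 0; }
static size_t copy_chunk(size_t bytes) { return bytes; }  /* may be short */
static void revert_iter(size_t copied) { (void)copied; }

static long buffered_write_loop(size_t total)
{
	size_t written = 0, bytes, copied;

	while (total) {
		bytes = total < 4096 ? total : 4096;
again:
		/* Fault the source in before taking any page locks. */
		if (fault_in_source(bytes))
			return written ? (long)written : -14;	/* -EFAULT */
		/* ... lock target pages, prepare buffers ... */
		copied = copy_chunk(bytes);	/* page faults disabled */
		/* ... commit and unlock target pages ... */
		if (copied < bytes) {
			revert_iter(copied);
			bytes = copied ? copied : 1;	/* shrink chunk */
			goto again;
		}
		written += copied;
		total -= copied;
	}
	return (long)written;
}

int main(void) { return buffered_write_loop(10000) == 10000 ? 0 : 1; }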
*/ - status = __ntfs_grab_cache_pages(mapping, start_idx, do_pages, - pages, &cached_page); - if (unlikely(status)) - break; - /* - * For non-resident attributes, we need to fill any holes with - * actual clusters and ensure all bufferes are mapped. We also - * need to bring uptodate any buffers that are only partially - * being written to. - */ - if (NInoNonResident(ni)) { - status = ntfs_prepare_pages_for_non_resident_write( - pages, do_pages, pos, bytes); - if (unlikely(status)) { - do { - unlock_page(pages[--do_pages]); - put_page(pages[do_pages]); - } while (do_pages); - break; - } - } - u = (pos >> PAGE_SHIFT) - pages[0]->index; - copied = ntfs_copy_from_user_iter(pages + u, do_pages - u, ofs, - i, bytes); - ntfs_flush_dcache_pages(pages + u, do_pages - u); - status = 0; - if (likely(copied == bytes)) { - status = ntfs_commit_pages_after_write(pages, do_pages, - pos, bytes); - } - do { - unlock_page(pages[--do_pages]); - put_page(pages[do_pages]); - } while (do_pages); - if (unlikely(status < 0)) { - iov_iter_revert(i, copied); - break; - } - cond_resched(); - if (unlikely(copied < bytes)) { - iov_iter_revert(i, copied); - if (copied) - bytes = copied; - else if (bytes > PAGE_SIZE - ofs) - bytes = PAGE_SIZE - ofs; - goto again; - } - pos += copied; - written += copied; - balance_dirty_pages_ratelimited(mapping); - if (fatal_signal_pending(current)) { - status = -EINTR; - break; - } - } while (iov_iter_count(i)); - if (cached_page) - put_page(cached_page); - ntfs_debug("Done. Returning %s (written 0x%lx, status %li).", - written ? "written" : "status", (unsigned long)written, - (long)status); - return written ? written : status; -} - -/** - * ntfs_file_write_iter - simple wrapper for ntfs_file_write_iter_nolock() - * @iocb: IO state structure - * @from: iov_iter with data to write - * - * Basically the same as generic_file_write_iter() except that it ends up - * up calling ntfs_perform_write() instead of generic_perform_write() and that - * O_DIRECT is not implemented. - */ -static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) -{ - struct file *file = iocb->ki_filp; - struct inode *vi = file_inode(file); - ssize_t written = 0; - ssize_t err; - - inode_lock(vi); - /* We can write back this queue in page reclaim. */ - err = ntfs_prepare_file_for_write(iocb, from); - if (iov_iter_count(from) && !err) - written = ntfs_perform_write(file, from, iocb->ki_pos); - inode_unlock(vi); - iocb->ki_pos += written; - if (likely(written > 0)) - written = generic_write_sync(iocb, written); - return written ? written : err; -} - -/** - * ntfs_file_fsync - sync a file to disk - * @filp: file to be synced - * @datasync: if non-zero only flush user data and not metadata - * - * Data integrity sync of a file to disk. Used for fsync, fdatasync, and msync - * system calls. This function is inspired by fs/buffer.c::file_fsync(). - * - * If @datasync is false, write the mft record and all associated extent mft - * records as well as the $DATA attribute and then sync the block device. - * - * If @datasync is true and the attribute is non-resident, we skip the writing - * of the mft record and all associated extent mft records (this might still - * happen due to the write_inode_now() call). - * - * Also, if @datasync is true, we do not wait on the inode to be written out - * but we always wait on the page cache pages to be written out. - * - * Locking: Caller must hold i_mutex on the inode. 
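From user space, the two flavours distinguished by @datasync map onto fsync(2) (datasync == 0) and fdatasync(2) (datasync != 0). For illustration (hypothetical file name, error handling elided):

#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	int fd = open("somefile", O_WRONLY | O_CREAT, 0644);

	write(fd, "data", 4);
	/* datasync != 0 path: flush the data (plus any metadata needed
	 * to find it), possibly skipping pure-metadata writeback. */
	fdatasync(fd);
	/* datasync == 0 path: flush data and all metadata. */
	fsync(fd);
	close(fd);
	return 0;
}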
- * - * TODO: We should probably also write all attribute/index inodes associated - * with this inode but since we have no simple way of getting to them we ignore - * this problem for now. - */ -static int ntfs_file_fsync(struct file *filp, loff_t start, loff_t end, - int datasync) -{ - struct inode *vi = filp->f_mapping->host; - int err, ret = 0; - - ntfs_debug("Entering for inode 0x%lx.", vi->i_ino); - - err = file_write_and_wait_range(filp, start, end); - if (err) - return err; - inode_lock(vi); - - BUG_ON(S_ISDIR(vi->i_mode)); - if (!datasync || !NInoNonResident(NTFS_I(vi))) - ret = __ntfs_write_inode(vi, 1); - write_inode_now(vi, !datasync); - /* - * NOTE: If we were to use mapping->private_list (see ext2 and - * fs/buffer.c) for dirty blocks then we could optimize the below to be - * sync_mapping_buffers(vi->i_mapping). - */ - err = sync_blockdev(vi->i_sb->s_bdev); - if (unlikely(err && !ret)) - ret = err; - if (likely(!ret)) - ntfs_debug("Done."); - else - ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx. Error " - "%u.", datasync ? "data" : "", vi->i_ino, -ret); - inode_unlock(vi); - return ret; -} - -#endif /* NTFS_RW */ - -const struct file_operations ntfs_file_ops = { - .llseek = generic_file_llseek, - .read_iter = generic_file_read_iter, -#ifdef NTFS_RW - .write_iter = ntfs_file_write_iter, - .fsync = ntfs_file_fsync, -#endif /* NTFS_RW */ - .mmap = generic_file_mmap, - .open = ntfs_file_open, - .splice_read = filemap_splice_read, -}; - -const struct inode_operations ntfs_file_inode_ops = { -#ifdef NTFS_RW - .setattr = ntfs_setattr, -#endif /* NTFS_RW */ -}; - -const struct file_operations ntfs_empty_file_ops = {}; - -const struct inode_operations ntfs_empty_inode_ops = {}; diff --git a/fs/ntfs/index.c b/fs/ntfs/index.c deleted file mode 100644 index d46c2c03a0325..0000000000000 --- a/fs/ntfs/index.c +++ /dev/null @@ -1,440 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * index.c - NTFS kernel index handling. Part of the Linux-NTFS project. - * - * Copyright (c) 2004-2005 Anton Altaparmakov - */ - -#include - -#include "aops.h" -#include "collate.h" -#include "debug.h" -#include "index.h" -#include "ntfs.h" - -/** - * ntfs_index_ctx_get - allocate and initialize a new index context - * @idx_ni: ntfs index inode with which to initialize the context - * - * Allocate a new index context, initialize it with @idx_ni and return it. - * Return NULL if allocation failed. - * - * Locking: Caller must hold i_mutex on the index inode. - */ -ntfs_index_context *ntfs_index_ctx_get(ntfs_inode *idx_ni) -{ - ntfs_index_context *ictx; - - ictx = kmem_cache_alloc(ntfs_index_ctx_cache, GFP_NOFS); - if (ictx) - *ictx = (ntfs_index_context){ .idx_ni = idx_ni }; - return ictx; -} - -/** - * ntfs_index_ctx_put - release an index context - * @ictx: index context to free - * - * Release the index context @ictx, releasing all associated resources. - * - * Locking: Caller must hold i_mutex on the index inode. 
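Taken together with ntfs_index_lookup() below, the get/put pair frames every index operation. A hedged sketch of the intended calling sequence (the caller, key, and error handling are illustrative; i_mutex must be held throughout, as the locking notes here state):

/* Hypothetical caller: look up @key in the index inode @idx_ni. */
static int example_index_query(ntfs_inode *idx_ni, const void *key,
		int key_len)
{
	ntfs_index_context *ictx = ntfs_index_ctx_get(idx_ni);
	int err;

	if (!ictx)
		return -ENOMEM;
	err = ntfs_index_lookup(key, key_len, ictx);
	if (!err) {
		/* Hit: ictx->entry / ictx->data / ictx->data_len valid. */
	} else if (err == -ENOENT) {
		/* Miss: ictx marks where the key would be inserted. */
	}
	ntfs_index_ctx_put(ictx);
	return err;
}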
- */ -void ntfs_index_ctx_put(ntfs_index_context *ictx) -{ - if (ictx->entry) { - if (ictx->is_in_root) { - if (ictx->actx) - ntfs_attr_put_search_ctx(ictx->actx); - if (ictx->base_ni) - unmap_mft_record(ictx->base_ni); - } else { - struct page *page = ictx->page; - if (page) { - BUG_ON(!PageLocked(page)); - unlock_page(page); - ntfs_unmap_page(page); - } - } - } - kmem_cache_free(ntfs_index_ctx_cache, ictx); - return; -} - -/** - * ntfs_index_lookup - find a key in an index and return its index entry - * @key: [IN] key for which to search in the index - * @key_len: [IN] length of @key in bytes - * @ictx: [IN/OUT] context describing the index and the returned entry - * - * Before calling ntfs_index_lookup(), @ictx must have been obtained from a - * call to ntfs_index_ctx_get(). - * - * Look for the @key in the index specified by the index lookup context @ictx. - * ntfs_index_lookup() walks the contents of the index looking for the @key. - * - * If the @key is found in the index, 0 is returned and @ictx is setup to - * describe the index entry containing the matching @key. @ictx->entry is the - * index entry and @ictx->data and @ictx->data_len are the index entry data and - * its length in bytes, respectively. - * - * If the @key is not found in the index, -ENOENT is returned and @ictx is - * setup to describe the index entry whose key collates immediately after the - * search @key, i.e. this is the position in the index at which an index entry - * with a key of @key would need to be inserted. - * - * If an error occurs return the negative error code and @ictx is left - * untouched. - * - * When finished with the entry and its data, call ntfs_index_ctx_put() to free - * the context and other associated resources. - * - * If the index entry was modified, call flush_dcache_index_entry_page() - * immediately after the modification and either ntfs_index_entry_mark_dirty() - * or ntfs_index_entry_write() before the call to ntfs_index_ctx_put() to - * ensure that the changes are written to disk. - * - * Locking: - Caller must hold i_mutex on the index inode. - * - Each page cache page in the index allocation mapping must be - * locked whilst being accessed otherwise we may find a corrupt - * page due to it being under ->writepage at the moment which - * applies the mst protection fixups before writing out and then - * removes them again after the write is complete after which it - * unlocks the page. - */ -int ntfs_index_lookup(const void *key, const int key_len, - ntfs_index_context *ictx) -{ - VCN vcn, old_vcn; - ntfs_inode *idx_ni = ictx->idx_ni; - ntfs_volume *vol = idx_ni->vol; - struct super_block *sb = vol->sb; - ntfs_inode *base_ni = idx_ni->ext.base_ntfs_ino; - MFT_RECORD *m; - INDEX_ROOT *ir; - INDEX_ENTRY *ie; - INDEX_ALLOCATION *ia; - u8 *index_end, *kaddr; - ntfs_attr_search_ctx *actx; - struct address_space *ia_mapping; - struct page *page; - int rc, err = 0; - - ntfs_debug("Entering."); - BUG_ON(!NInoAttr(idx_ni)); - BUG_ON(idx_ni->type != AT_INDEX_ALLOCATION); - BUG_ON(idx_ni->nr_extents != -1); - BUG_ON(!base_ni); - BUG_ON(!key); - BUG_ON(key_len <= 0); - if (!ntfs_is_collation_rule_supported( - idx_ni->itype.index.collation_rule)) { - ntfs_error(sb, "Index uses unsupported collation rule 0x%x. " - "Aborting lookup.", le32_to_cpu( - idx_ni->itype.index.collation_rule)); - return -EOPNOTSUPP; - } - /* Get hold of the mft record for the index inode. 
*/ - m = map_mft_record(base_ni); - if (IS_ERR(m)) { - ntfs_error(sb, "map_mft_record() failed with error code %ld.", - -PTR_ERR(m)); - return PTR_ERR(m); - } - actx = ntfs_attr_get_search_ctx(base_ni, m); - if (unlikely(!actx)) { - err = -ENOMEM; - goto err_out; - } - /* Find the index root attribute in the mft record. */ - err = ntfs_attr_lookup(AT_INDEX_ROOT, idx_ni->name, idx_ni->name_len, - CASE_SENSITIVE, 0, NULL, 0, actx); - if (unlikely(err)) { - if (err == -ENOENT) { - ntfs_error(sb, "Index root attribute missing in inode " - "0x%lx.", idx_ni->mft_no); - err = -EIO; - } - goto err_out; - } - /* Get to the index root value (it has been verified in read_inode). */ - ir = (INDEX_ROOT*)((u8*)actx->attr + - le16_to_cpu(actx->attr->data.resident.value_offset)); - index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length); - /* The first index entry. */ - ie = (INDEX_ENTRY*)((u8*)&ir->index + - le32_to_cpu(ir->index.entries_offset)); - /* - * Loop until we exceed valid memory (corruption case) or until we - * reach the last entry. - */ - for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { - /* Bounds checks. */ - if ((u8*)ie < (u8*)actx->mrec || (u8*)ie + - sizeof(INDEX_ENTRY_HEADER) > index_end || - (u8*)ie + le16_to_cpu(ie->length) > index_end) - goto idx_err_out; - /* - * The last entry cannot contain a key. It can however contain - * a pointer to a child node in the B+tree so we just break out. - */ - if (ie->flags & INDEX_ENTRY_END) - break; - /* Further bounds checks. */ - if ((u32)sizeof(INDEX_ENTRY_HEADER) + - le16_to_cpu(ie->key_length) > - le16_to_cpu(ie->data.vi.data_offset) || - (u32)le16_to_cpu(ie->data.vi.data_offset) + - le16_to_cpu(ie->data.vi.data_length) > - le16_to_cpu(ie->length)) - goto idx_err_out; - /* If the keys match perfectly, we setup @ictx and return 0. */ - if ((key_len == le16_to_cpu(ie->key_length)) && !memcmp(key, - &ie->key, key_len)) { -ir_done: - ictx->is_in_root = true; - ictx->ir = ir; - ictx->actx = actx; - ictx->base_ni = base_ni; - ictx->ia = NULL; - ictx->page = NULL; -done: - ictx->entry = ie; - ictx->data = (u8*)ie + - le16_to_cpu(ie->data.vi.data_offset); - ictx->data_len = le16_to_cpu(ie->data.vi.data_length); - ntfs_debug("Done."); - return err; - } - /* - * Not a perfect match, need to do full blown collation so we - * know which way in the B+tree we have to go. - */ - rc = ntfs_collate(vol, idx_ni->itype.index.collation_rule, key, - key_len, &ie->key, le16_to_cpu(ie->key_length)); - /* - * If @key collates before the key of the current entry, there - * is definitely no such key in this index but we might need to - * descend into the B+tree so we just break out of the loop. - */ - if (rc == -1) - break; - /* - * A match should never happen as the memcmp() call should have - * cought it, but we still treat it correctly. - */ - if (!rc) - goto ir_done; - /* The keys are not equal, continue the search. */ - } - /* - * We have finished with this index without success. Check for the - * presence of a child node and if not present setup @ictx and return - * -ENOENT. - */ - if (!(ie->flags & INDEX_ENTRY_NODE)) { - ntfs_debug("Entry not found."); - err = -ENOENT; - goto ir_done; - } /* Child node present, descend into it. */ - /* Consistency check: Verify that an index allocation exists. */ - if (!NInoIndexAllocPresent(idx_ni)) { - ntfs_error(sb, "No index allocation attribute but index entry " - "requires one. 
Inode 0x%lx is corrupt or " - "driver bug.", idx_ni->mft_no); - goto err_out; - } - /* Get the starting vcn of the index_block holding the child node. */ - vcn = sle64_to_cpup((sle64*)((u8*)ie + le16_to_cpu(ie->length) - 8)); - ia_mapping = VFS_I(idx_ni)->i_mapping; - /* - * We are done with the index root and the mft record. Release them, - * otherwise we deadlock with ntfs_map_page(). - */ - ntfs_attr_put_search_ctx(actx); - unmap_mft_record(base_ni); - m = NULL; - actx = NULL; -descend_into_child_node: - /* - * Convert vcn to index into the index allocation attribute in units - * of PAGE_SIZE and map the page cache page, reading it from - * disk if necessary. - */ - page = ntfs_map_page(ia_mapping, vcn << - idx_ni->itype.index.vcn_size_bits >> PAGE_SHIFT); - if (IS_ERR(page)) { - ntfs_error(sb, "Failed to map index page, error %ld.", - -PTR_ERR(page)); - err = PTR_ERR(page); - goto err_out; - } - lock_page(page); - kaddr = (u8*)page_address(page); -fast_descend_into_child_node: - /* Get to the index allocation block. */ - ia = (INDEX_ALLOCATION*)(kaddr + ((vcn << - idx_ni->itype.index.vcn_size_bits) & ~PAGE_MASK)); - /* Bounds checks. */ - if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_SIZE) { - ntfs_error(sb, "Out of bounds check failed. Corrupt inode " - "0x%lx or driver bug.", idx_ni->mft_no); - goto unm_err_out; - } - /* Catch multi sector transfer fixup errors. */ - if (unlikely(!ntfs_is_indx_record(ia->magic))) { - ntfs_error(sb, "Index record with vcn 0x%llx is corrupt. " - "Corrupt inode 0x%lx. Run chkdsk.", - (long long)vcn, idx_ni->mft_no); - goto unm_err_out; - } - if (sle64_to_cpu(ia->index_block_vcn) != vcn) { - ntfs_error(sb, "Actual VCN (0x%llx) of index buffer is " - "different from expected VCN (0x%llx). Inode " - "0x%lx is corrupt or driver bug.", - (unsigned long long) - sle64_to_cpu(ia->index_block_vcn), - (unsigned long long)vcn, idx_ni->mft_no); - goto unm_err_out; - } - if (le32_to_cpu(ia->index.allocated_size) + 0x18 != - idx_ni->itype.index.block_size) { - ntfs_error(sb, "Index buffer (VCN 0x%llx) of inode 0x%lx has " - "a size (%u) differing from the index " - "specified size (%u). Inode is corrupt or " - "driver bug.", (unsigned long long)vcn, - idx_ni->mft_no, - le32_to_cpu(ia->index.allocated_size) + 0x18, - idx_ni->itype.index.block_size); - goto unm_err_out; - } - index_end = (u8*)ia + idx_ni->itype.index.block_size; - if (index_end > kaddr + PAGE_SIZE) { - ntfs_error(sb, "Index buffer (VCN 0x%llx) of inode 0x%lx " - "crosses page boundary. Impossible! Cannot " - "access! This is probably a bug in the " - "driver.", (unsigned long long)vcn, - idx_ni->mft_no); - goto unm_err_out; - } - index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length); - if (index_end > (u8*)ia + idx_ni->itype.index.block_size) { - ntfs_error(sb, "Size of index buffer (VCN 0x%llx) of inode " - "0x%lx exceeds maximum size.", - (unsigned long long)vcn, idx_ni->mft_no); - goto unm_err_out; - } - /* The first index entry. */ - ie = (INDEX_ENTRY*)((u8*)&ia->index + - le32_to_cpu(ia->index.entries_offset)); - /* - * Iterate similar to above big loop but applied to index buffer, thus - * loop until we exceed valid memory (corruption case) or until we - * reach the last entry. - */ - for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { - /* Bounds checks. 
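A note on the magic constant 0x18 in the allocated_size check above: an index block begins with a fixed 24-byte header (4-byte magic, 2-byte update sequence array offset, 2-byte update sequence count, 8-byte $LogFile sequence number, 8-byte index_block_vcn) before the embedded INDEX_HEADER, and allocated_size counts from that embedded header, hence the +0x18. As a compile-time sanity sketch (descriptive field names, not the driver's layout.h types):

#include <assert.h>
#include <stdint.h>

/* Illustrative layout of the fixed header that precedes the embedded
 * INDEX_HEADER in an index block; fields are little-endian on disk. */
struct indx_fixed_header {
	uint8_t  magic[4];		/* "INDX" */
	uint16_t usa_ofs;		/* update sequence array offset */
	uint16_t usa_count;		/* update sequence array size */
	uint64_t lsn;			/* $LogFile sequence number */
	uint64_t index_block_vcn;	/* VCN of this index block */
};

int main(void)
{
	/* allocated_size counts from the INDEX_HEADER, hence +0x18. */
	assert(sizeof(struct indx_fixed_header) == 0x18);
	return 0;
}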
*/ - if ((u8*)ie < (u8*)ia || (u8*)ie + - sizeof(INDEX_ENTRY_HEADER) > index_end || - (u8*)ie + le16_to_cpu(ie->length) > index_end) { - ntfs_error(sb, "Index entry out of bounds in inode " - "0x%lx.", idx_ni->mft_no); - goto unm_err_out; - } - /* - * The last entry cannot contain a key. It can however contain - * a pointer to a child node in the B+tree so we just break out. - */ - if (ie->flags & INDEX_ENTRY_END) - break; - /* Further bounds checks. */ - if ((u32)sizeof(INDEX_ENTRY_HEADER) + - le16_to_cpu(ie->key_length) > - le16_to_cpu(ie->data.vi.data_offset) || - (u32)le16_to_cpu(ie->data.vi.data_offset) + - le16_to_cpu(ie->data.vi.data_length) > - le16_to_cpu(ie->length)) { - ntfs_error(sb, "Index entry out of bounds in inode " - "0x%lx.", idx_ni->mft_no); - goto unm_err_out; - } - /* If the keys match perfectly, we set up @ictx and return 0. */ - if ((key_len == le16_to_cpu(ie->key_length)) && !memcmp(key, - &ie->key, key_len)) { -ia_done: - ictx->is_in_root = false; - ictx->actx = NULL; - ictx->base_ni = NULL; - ictx->ia = ia; - ictx->page = page; - goto done; - } - /* - * Not a perfect match, need to do full-blown collation so we - * know which way in the B+tree we have to go. - */ - rc = ntfs_collate(vol, idx_ni->itype.index.collation_rule, key, - key_len, &ie->key, le16_to_cpu(ie->key_length)); - /* - * If @key collates before the key of the current entry, there - * is definitely no such key in this index but we might need to - * descend into the B+tree so we just break out of the loop. - */ - if (rc == -1) - break; - /* - * A match should never happen as the memcmp() call should have - * caught it, but we still treat it correctly. - */ - if (!rc) - goto ia_done; - /* The keys are not equal, continue the search. */ - } - /* - * We have finished with this index buffer without success. Check for - * the presence of a child node and if not present return -ENOENT. - */ - if (!(ie->flags & INDEX_ENTRY_NODE)) { - ntfs_debug("Entry not found."); - err = -ENOENT; - goto ia_done; - } - if ((ia->index.flags & NODE_MASK) == LEAF_NODE) { - ntfs_error(sb, "Index entry with child node found in a leaf " - "node in inode 0x%lx.", idx_ni->mft_no); - goto unm_err_out; - } - /* Child node present, descend into it. */ - old_vcn = vcn; - vcn = sle64_to_cpup((sle64*)((u8*)ie + le16_to_cpu(ie->length) - 8)); - if (vcn >= 0) { - /* - * If vcn is in the same page cache page as old_vcn we recycle - * the mapped page. - */ - if (old_vcn << vol->cluster_size_bits >> - PAGE_SHIFT == vcn << - vol->cluster_size_bits >> - PAGE_SHIFT) - goto fast_descend_into_child_node; - unlock_page(page); - ntfs_unmap_page(page); - goto descend_into_child_node; - } - ntfs_error(sb, "Negative child node vcn in inode 0x%lx.", - idx_ni->mft_no); -unm_err_out: - unlock_page(page); - ntfs_unmap_page(page); -err_out: - if (!err) - err = -EIO; - if (actx) - ntfs_attr_put_search_ctx(actx); - if (m) - unmap_mft_record(base_ni); - return err; -idx_err_out: - ntfs_error(sb, "Corrupt index. Aborting lookup."); - goto err_out; -} diff --git a/fs/ntfs/index.h b/fs/ntfs/index.h deleted file mode 100644 index bb3c3ae55138a..0000000000000 --- a/fs/ntfs/index.h +++ /dev/null @@ -1,134 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * index.h - Defines for NTFS kernel index handling. Part of the Linux-NTFS - * project. 
- * - * Copyright (c) 2004 Anton Altaparmakov - */ - -#ifndef _LINUX_NTFS_INDEX_H -#define _LINUX_NTFS_INDEX_H - -#include <linux/fs.h> - -#include "types.h" -#include "layout.h" -#include "inode.h" -#include "attrib.h" -#include "mft.h" -#include "aops.h" - -/** - * @idx_ni: index inode containing the @entry described by this context - * @entry: index entry (points into @ir or @ia) - * @data: index entry data (points into @entry) - * @data_len: length in bytes of @data - * @is_in_root: 'true' if @entry is in @ir and 'false' if it is in @ia - * @ir: index root if @is_in_root and NULL otherwise - * @actx: attribute search context if @is_in_root and NULL otherwise - * @base_ni: base inode if @is_in_root and NULL otherwise - * @ia: index block if @is_in_root is 'false' and NULL otherwise - * @page: page if @is_in_root is 'false' and NULL otherwise - * - * @idx_ni is the index inode this context belongs to. - * - * @entry is the index entry described by this context. @data and @data_len - * are the index entry data and its length in bytes, respectively. @data - * simply points into @entry. This is probably what the user is interested in. - * - * If @is_in_root is 'true', @entry is in the index root attribute @ir described - * by the attribute search context @actx and the base inode @base_ni. @ia and - * @page are NULL in this case. - * - * If @is_in_root is 'false', @entry is in the index allocation attribute and @ia - * and @page point to the index allocation block and the mapped, locked page it - * is in, respectively. @ir, @actx and @base_ni are NULL in this case. - * - * To obtain a context call ntfs_index_ctx_get(). - * - * We use this context to allow ntfs_index_lookup() to return the found index - * @entry and its @data without having to allocate a buffer and copy the @entry - * and/or its @data into it. - * - * When finished with the @entry and its @data, call ntfs_index_ctx_put() to - * free the context and other associated resources. - * - * If the index entry was modified, call ntfs_index_entry_flush_dcache_page() - * immediately after the modification and either ntfs_index_entry_mark_dirty() - * or ntfs_index_entry_write() before the call to ntfs_index_ctx_put() to - * ensure that the changes are written to disk. - */ -typedef struct { - ntfs_inode *idx_ni; - INDEX_ENTRY *entry; - void *data; - u16 data_len; - bool is_in_root; - INDEX_ROOT *ir; - ntfs_attr_search_ctx *actx; - ntfs_inode *base_ni; - INDEX_ALLOCATION *ia; - struct page *page; -} ntfs_index_context; - -extern ntfs_index_context *ntfs_index_ctx_get(ntfs_inode *idx_ni); -extern void ntfs_index_ctx_put(ntfs_index_context *ictx); - -extern int ntfs_index_lookup(const void *key, const int key_len, - ntfs_index_context *ictx); - -#ifdef NTFS_RW - -/** - * ntfs_index_entry_flush_dcache_page - flush_dcache_page() for index entries - * @ictx: ntfs index context describing the index entry - * - * Call flush_dcache_page() for the page in which an index entry resides. - * - * This must be called every time an index entry is modified, just after the - * modification. - * - * If the index entry is in the index root attribute, simply flush the page - * containing the mft record containing the index root attribute. - * - * If the index entry is in an index block belonging to the index allocation - * attribute, simply flush the page cache page containing the index block. 
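
Taken together, the lifecycle documented above (ntfs_index_ctx_get() -> ntfs_index_lookup() -> use @entry/@data -> ntfs_index_ctx_put()) can be condensed into a short usage sketch. This is purely illustrative (the helper name and the destination buffer are hypothetical); it relies only on the declarations in this header:

static int example_index_read(ntfs_inode *idx_ni, const void *key,
                int key_len, void *dst, u16 dst_len)
{
        ntfs_index_context *ictx;
        int err;

        ictx = ntfs_index_ctx_get(idx_ni);
        if (!ictx)
                return -ENOMEM;
        err = ntfs_index_lookup(key, key_len, ictx);
        if (!err) {
                /* Found: @ictx->data and @ictx->data_len describe the data. */
                if (ictx->data_len > dst_len)
                        err = -ERANGE;
                else
                        memcpy(dst, ictx->data, ictx->data_len);
        }
        /* Releases the mft record or index page the entry lives in. */
        ntfs_index_ctx_put(ictx);
        return err;
}
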
- */ -static inline void ntfs_index_entry_flush_dcache_page(ntfs_index_context *ictx) -{ - if (ictx->is_in_root) - flush_dcache_mft_record_page(ictx->actx->ntfs_ino); - else - flush_dcache_page(ictx->page); -} - -/** - * ntfs_index_entry_mark_dirty - mark an index entry dirty - * @ictx: ntfs index context describing the index entry - * - * Mark the index entry described by the index entry context @ictx dirty. - * - * If the index entry is in the index root attribute, simply mark the mft - * record containing the index root attribute dirty. This ensures the mft - * record, and hence the index root attribute, will be written out to disk - * later. - * - * If the index entry is in an index block belonging to the index allocation - * attribute, mark the buffers belonging to the index record as well as the - * page cache page the index block is in dirty. This automatically marks the - * VFS inode of the ntfs index inode to which the index entry belongs dirty, - * too (I_DIRTY_PAGES) and this in turn ensures the page buffers, and hence the - * dirty index block, will be written out to disk later. - */ -static inline void ntfs_index_entry_mark_dirty(ntfs_index_context *ictx) -{ - if (ictx->is_in_root) - mark_mft_record_dirty(ictx->actx->ntfs_ino); - else - mark_ntfs_record_dirty(ictx->page, - (u8*)ictx->ia - (u8*)page_address(ictx->page)); -} - -#endif /* NTFS_RW */ - -#endif /* _LINUX_NTFS_INDEX_H */ diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c deleted file mode 100644 index aba1e22db4e9b..0000000000000 --- a/fs/ntfs/inode.c +++ /dev/null @@ -1,3102 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * inode.c - NTFS kernel inode handling. - * - * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "aops.h" -#include "attrib.h" -#include "bitmap.h" -#include "dir.h" -#include "debug.h" -#include "inode.h" -#include "lcnalloc.h" -#include "malloc.h" -#include "mft.h" -#include "time.h" -#include "ntfs.h" - -/** - * ntfs_test_inode - compare two (possibly fake) inodes for equality - * @vi: vfs inode which to test - * @data: data which is being tested with - * - * Compare the ntfs attribute embedded in the ntfs specific part of the vfs - * inode @vi for equality with the ntfs attribute @data. - * - * If searching for the normal file/directory inode, set @na->type to AT_UNUSED. - * @na->name and @na->name_len are then ignored. - * - * Return 1 if the attributes match and 0 if not. - * - * NOTE: This function runs with the inode_hash_lock spin lock held so it is not - * allowed to sleep. - */ -int ntfs_test_inode(struct inode *vi, void *data) -{ - ntfs_attr *na = (ntfs_attr *)data; - ntfs_inode *ni; - - if (vi->i_ino != na->mft_no) - return 0; - ni = NTFS_I(vi); - /* If !NInoAttr(ni), @vi is a normal file or directory inode. */ - if (likely(!NInoAttr(ni))) { - /* If not looking for a normal inode this is a mismatch. */ - if (unlikely(na->type != AT_UNUSED)) - return 0; - } else { - /* A fake inode describing an attribute. */ - if (ni->type != na->type) - return 0; - if (ni->name_len != na->name_len) - return 0; - if (na->name_len && memcmp(ni->name, na->name, - na->name_len * sizeof(ntfschar))) - return 0; - } - /* Match! 
*/ - return 1; -} - -/** - * ntfs_init_locked_inode - initialize an inode - * @vi: vfs inode to initialize - * @data: data which to initialize @vi to - * - * Initialize the vfs inode @vi with the values from the ntfs attribute @data in - * order to enable ntfs_test_inode() to do its work. - * - * If initializing the normal file/directory inode, set @na->type to AT_UNUSED. - * In that case, @na->name and @na->name_len should be set to NULL and 0, - * respectively. Although that is not strictly necessary as - * ntfs_read_locked_inode() will fill them in later. - * - * Return 0 on success and -errno on error. - * - * NOTE: This function runs with the inode->i_lock spin lock held so it is not - * allowed to sleep. (Hence the GFP_ATOMIC allocation.) - */ -static int ntfs_init_locked_inode(struct inode *vi, void *data) -{ - ntfs_attr *na = (ntfs_attr *)data; - ntfs_inode *ni = NTFS_I(vi); - - vi->i_ino = na->mft_no; - - ni->type = na->type; - if (na->type == AT_INDEX_ALLOCATION) - NInoSetMstProtected(ni); - - ni->name = na->name; - ni->name_len = na->name_len; - - /* If initializing a normal inode, we are done. */ - if (likely(na->type == AT_UNUSED)) { - BUG_ON(na->name); - BUG_ON(na->name_len); - return 0; - } - - /* It is a fake inode. */ - NInoSetAttr(ni); - - /* - * We have I30 global constant as an optimization as it is the name - * in >99.9% of named attributes! The other <0.1% incur a GFP_ATOMIC - * allocation but that is ok. And most attributes are unnamed anyway, - * thus the fraction of named attributes with name != I30 is actually - * absolutely tiny. - */ - if (na->name_len && na->name != I30) { - unsigned int i; - - BUG_ON(!na->name); - i = na->name_len * sizeof(ntfschar); - ni->name = kmalloc(i + sizeof(ntfschar), GFP_ATOMIC); - if (!ni->name) - return -ENOMEM; - memcpy(ni->name, na->name, i); - ni->name[na->name_len] = 0; - } - return 0; -} - -static int ntfs_read_locked_inode(struct inode *vi); -static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi); -static int ntfs_read_locked_index_inode(struct inode *base_vi, - struct inode *vi); - -/** - * ntfs_iget - obtain a struct inode corresponding to a specific normal inode - * @sb: super block of mounted volume - * @mft_no: mft record number / inode number to obtain - * - * Obtain the struct inode corresponding to a specific normal inode (i.e. a - * file or directory). - * - * If the inode is in the cache, it is just returned with an increased - * reference count. Otherwise, a new struct inode is allocated and initialized, - * and finally ntfs_read_locked_inode() is called to read in the inode and - * fill in the remainder of the inode structure. - * - * Return the struct inode on success. Check the return value with IS_ERR() and - * if true, the function failed and the error code is obtained from PTR_ERR(). - */ -struct inode *ntfs_iget(struct super_block *sb, unsigned long mft_no) -{ - struct inode *vi; - int err; - ntfs_attr na; - - na.mft_no = mft_no; - na.type = AT_UNUSED; - na.name = NULL; - na.name_len = 0; - - vi = iget5_locked(sb, mft_no, ntfs_test_inode, - ntfs_init_locked_inode, &na); - if (unlikely(!vi)) - return ERR_PTR(-ENOMEM); - - err = 0; - - /* If this is a freshly allocated inode, need to read it now. */ - if (vi->i_state & I_NEW) { - err = ntfs_read_locked_inode(vi); - unlock_new_inode(vi); - } - /* - * There is no point in keeping bad inodes around if the failure was - * due to ENOMEM. We want to be able to retry again later. 
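
The lookup above is the stock iget5_locked() pattern: the hash key is the mft number, while the ntfs_attr descriptor disambiguates a normal inode from fake attribute inodes hashed at the same number. A condensed sketch of the call contract (names as in ntfs_iget() above):

        ntfs_attr na = {
                .mft_no   = mft_no,
                .type     = AT_UNUSED,  /* normal file/directory lookup */
                .name     = NULL,
                .name_len = 0,
        };

        /*
         * iget5_locked() runs ntfs_test_inode() on each hash-chain
         * candidate (under inode_hash_lock, so it must not sleep) and
         * runs ntfs_init_locked_inode() once for a freshly allocated
         * inode (under i_lock, hence the GFP_ATOMIC allocation above).
         */
        vi = iget5_locked(sb, mft_no, ntfs_test_inode,
                        ntfs_init_locked_inode, &na);
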
- */ - if (unlikely(err == -ENOMEM)) { - iput(vi); - vi = ERR_PTR(err); - } - return vi; -} - -/** - * ntfs_attr_iget - obtain a struct inode corresponding to an attribute - * @base_vi: vfs base inode containing the attribute - * @type: attribute type - * @name: Unicode name of the attribute (NULL if unnamed) - * @name_len: length of @name in Unicode characters (0 if unnamed) - * - * Obtain the (fake) struct inode corresponding to the attribute specified by - * @type, @name, and @name_len, which is present in the base mft record - * specified by the vfs inode @base_vi. - * - * If the attribute inode is in the cache, it is just returned with an - * increased reference count. Otherwise, a new struct inode is allocated and - * initialized, and finally ntfs_read_locked_attr_inode() is called to read the - * attribute and fill in the inode structure. - * - * Note, for index allocation attributes, you need to use ntfs_index_iget() - * instead of ntfs_attr_iget() as working with indices is a lot more complex. - * - * Return the struct inode of the attribute inode on success. Check the return - * value with IS_ERR() and if true, the function failed and the error code is - * obtained from PTR_ERR(). - */ -struct inode *ntfs_attr_iget(struct inode *base_vi, ATTR_TYPE type, - ntfschar *name, u32 name_len) -{ - struct inode *vi; - int err; - ntfs_attr na; - - /* Make sure no one calls ntfs_attr_iget() for indices. */ - BUG_ON(type == AT_INDEX_ALLOCATION); - - na.mft_no = base_vi->i_ino; - na.type = type; - na.name = name; - na.name_len = name_len; - - vi = iget5_locked(base_vi->i_sb, na.mft_no, ntfs_test_inode, - ntfs_init_locked_inode, &na); - if (unlikely(!vi)) - return ERR_PTR(-ENOMEM); - - err = 0; - - /* If this is a freshly allocated inode, need to read it now. */ - if (vi->i_state & I_NEW) { - err = ntfs_read_locked_attr_inode(base_vi, vi); - unlock_new_inode(vi); - } - /* - * There is no point in keeping bad attribute inodes around. This also - * simplifies things in that we never need to check for bad attribute - * inodes elsewhere. - */ - if (unlikely(err)) { - iput(vi); - vi = ERR_PTR(err); - } - return vi; -} - -/** - * ntfs_index_iget - obtain a struct inode corresponding to an index - * @base_vi: vfs base inode containing the index related attributes - * @name: Unicode name of the index - * @name_len: length of @name in Unicode characters - * - * Obtain the (fake) struct inode corresponding to the index specified by @name - * and @name_len, which is present in the base mft record specified by the vfs - * inode @base_vi. - * - * If the index inode is in the cache, it is just returned with an increased - * reference count. Otherwise, a new struct inode is allocated and - * initialized, and finally ntfs_read_locked_index_inode() is called to read - * the index related attributes and fill in the inode structure. - * - * Return the struct inode of the index inode on success. Check the return - * value with IS_ERR() and if true, the function failed and the error code is - * obtained from PTR_ERR(). - */ -struct inode *ntfs_index_iget(struct inode *base_vi, ntfschar *name, - u32 name_len) -{ - struct inode *vi; - int err; - ntfs_attr na; - - na.mft_no = base_vi->i_ino; - na.type = AT_INDEX_ALLOCATION; - na.name = name; - na.name_len = name_len; - - vi = iget5_locked(base_vi->i_sb, na.mft_no, ntfs_test_inode, - ntfs_init_locked_inode, &na); - if (unlikely(!vi)) - return ERR_PTR(-ENOMEM); - - err = 0; - - /* If this is a freshly allocated inode, need to read it now. 
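
As a concrete use of this attribute-inode interface, the directory handling later in this file fetches the index bitmap as a fake inode in essentially this way (a trimmed sketch; the error path is omitted):

        struct inode *bvi;

        /* Fake inode for the $BITMAP attribute named $I30. */
        bvi = ntfs_attr_iget(vi, AT_BITMAP, I30, 4);
        if (IS_ERR(bvi))
                return PTR_ERR(bvi);
        /* ... consistency checks against i_size_read(bvi) ... */
        iput(bvi);
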
*/ - if (vi->i_state & I_NEW) { - err = ntfs_read_locked_index_inode(base_vi, vi); - unlock_new_inode(vi); - } - /* - * There is no point in keeping bad index inodes around. This also - * simplifies things in that we never need to check for bad index - * inodes elsewhere. - */ - if (unlikely(err)) { - iput(vi); - vi = ERR_PTR(err); - } - return vi; -} - -struct inode *ntfs_alloc_big_inode(struct super_block *sb) -{ - ntfs_inode *ni; - - ntfs_debug("Entering."); - ni = alloc_inode_sb(sb, ntfs_big_inode_cache, GFP_NOFS); - if (likely(ni != NULL)) { - ni->state = 0; - return VFS_I(ni); - } - ntfs_error(sb, "Allocation of NTFS big inode structure failed."); - return NULL; -} - -void ntfs_free_big_inode(struct inode *inode) -{ - kmem_cache_free(ntfs_big_inode_cache, NTFS_I(inode)); -} - -static inline ntfs_inode *ntfs_alloc_extent_inode(void) -{ - ntfs_inode *ni; - - ntfs_debug("Entering."); - ni = kmem_cache_alloc(ntfs_inode_cache, GFP_NOFS); - if (likely(ni != NULL)) { - ni->state = 0; - return ni; - } - ntfs_error(NULL, "Allocation of NTFS inode structure failed."); - return NULL; -} - -static void ntfs_destroy_extent_inode(ntfs_inode *ni) -{ - ntfs_debug("Entering."); - BUG_ON(ni->page); - if (!atomic_dec_and_test(&ni->count)) - BUG(); - kmem_cache_free(ntfs_inode_cache, ni); -} - -/* - * The attribute runlist lock has separate locking rules from the - * normal runlist lock, so split the two lock-classes: - */ -static struct lock_class_key attr_list_rl_lock_class; - -/** - * __ntfs_init_inode - initialize ntfs specific part of an inode - * @sb: super block of mounted volume - * @ni: freshly allocated ntfs inode which to initialize - * - * Initialize an ntfs inode to defaults. - * - * NOTE: ni->mft_no, ni->state, ni->type, ni->name, and ni->name_len are left - * untouched. Make sure to initialize them elsewhere. - * - * Return zero on success and -ENOMEM on error. - */ -void __ntfs_init_inode(struct super_block *sb, ntfs_inode *ni) -{ - ntfs_debug("Entering."); - rwlock_init(&ni->size_lock); - ni->initialized_size = ni->allocated_size = 0; - ni->seq_no = 0; - atomic_set(&ni->count, 1); - ni->vol = NTFS_SB(sb); - ntfs_init_runlist(&ni->runlist); - mutex_init(&ni->mrec_lock); - ni->page = NULL; - ni->page_ofs = 0; - ni->attr_list_size = 0; - ni->attr_list = NULL; - ntfs_init_runlist(&ni->attr_list_rl); - lockdep_set_class(&ni->attr_list_rl.lock, - &attr_list_rl_lock_class); - ni->itype.index.block_size = 0; - ni->itype.index.vcn_size = 0; - ni->itype.index.collation_rule = 0; - ni->itype.index.block_size_bits = 0; - ni->itype.index.vcn_size_bits = 0; - mutex_init(&ni->extent_lock); - ni->nr_extents = 0; - ni->ext.base_ntfs_ino = NULL; -} - -/* - * Extent inodes get MFT-mapped in a nested way, while the base inode - * is still mapped. 
Teach this nesting to the lock validator by creating - * a separate class for nested inode's mrec_lock's: - */ -static struct lock_class_key extent_inode_mrec_lock_key; - -inline ntfs_inode *ntfs_new_extent_inode(struct super_block *sb, - unsigned long mft_no) -{ - ntfs_inode *ni = ntfs_alloc_extent_inode(); - - ntfs_debug("Entering."); - if (likely(ni != NULL)) { - __ntfs_init_inode(sb, ni); - lockdep_set_class(&ni->mrec_lock, &extent_inode_mrec_lock_key); - ni->mft_no = mft_no; - ni->type = AT_UNUSED; - ni->name = NULL; - ni->name_len = 0; - } - return ni; -} - -/** - * ntfs_is_extended_system_file - check if a file is in the $Extend directory - * @ctx: initialized attribute search context - * - * Search all file name attributes in the inode described by the attribute - * search context @ctx and check if any of the names are in the $Extend system - * directory. - * - * Return values: - * 1: file is in $Extend directory - * 0: file is not in $Extend directory - * -errno: failed to determine if the file is in the $Extend directory - */ -static int ntfs_is_extended_system_file(ntfs_attr_search_ctx *ctx) -{ - int nr_links, err; - - /* Restart search. */ - ntfs_attr_reinit_search_ctx(ctx); - - /* Get number of hard links. */ - nr_links = le16_to_cpu(ctx->mrec->link_count); - - /* Loop through all hard links. */ - while (!(err = ntfs_attr_lookup(AT_FILE_NAME, NULL, 0, 0, 0, NULL, 0, - ctx))) { - FILE_NAME_ATTR *file_name_attr; - ATTR_RECORD *attr = ctx->attr; - u8 *p, *p2; - - nr_links--; - /* - * Maximum sanity checking as we are called on an inode that - * we suspect might be corrupt. - */ - p = (u8*)attr + le32_to_cpu(attr->length); - if (p < (u8*)ctx->mrec || (u8*)p > (u8*)ctx->mrec + - le32_to_cpu(ctx->mrec->bytes_in_use)) { -err_corrupt_attr: - ntfs_error(ctx->ntfs_ino->vol->sb, "Corrupt file name " - "attribute. You should run chkdsk."); - return -EIO; - } - if (attr->non_resident) { - ntfs_error(ctx->ntfs_ino->vol->sb, "Non-resident file " - "name. You should run chkdsk."); - return -EIO; - } - if (attr->flags) { - ntfs_error(ctx->ntfs_ino->vol->sb, "File name with " - "invalid flags. You should run " - "chkdsk."); - return -EIO; - } - if (!(attr->data.resident.flags & RESIDENT_ATTR_IS_INDEXED)) { - ntfs_error(ctx->ntfs_ino->vol->sb, "Unindexed file " - "name. You should run chkdsk."); - return -EIO; - } - file_name_attr = (FILE_NAME_ATTR*)((u8*)attr + - le16_to_cpu(attr->data.resident.value_offset)); - p2 = (u8 *)file_name_attr + le32_to_cpu(attr->data.resident.value_length); - if (p2 < (u8*)attr || p2 > p) - goto err_corrupt_attr; - /* This attribute is ok, but is it in the $Extend directory? */ - if (MREF_LE(file_name_attr->parent_directory) == FILE_Extend) - return 1; /* YES, it's an extended system file. */ - } - if (unlikely(err != -ENOENT)) - return err; - if (unlikely(nr_links)) { - ntfs_error(ctx->ntfs_ino->vol->sb, "Inode hard link count " - "doesn't match number of name attributes. You " - "should run chkdsk."); - return -EIO; - } - return 0; /* NO, it is not an extended system file. */ -} - -/** - * ntfs_read_locked_inode - read an inode from its device - * @vi: inode to read - * - * ntfs_read_locked_inode() is called from ntfs_iget() to read the inode - * described by @vi into memory from the device. - * - * The only fields in @vi that we need to/can look at when the function is - * called are i_sb, pointing to the mounted device's super block, and i_ino, - * the number of the inode to load. 
- * - * ntfs_read_locked_inode() maps, pins and locks the mft record number i_ino - * for reading and sets up the necessary @vi fields as well as initializing - * the ntfs inode. - * - * Q: What locks are held when the function is called? - * A: i_state has I_NEW set, hence the inode is locked, also - * i_count is set to 1, so it is not going to go away - * i_flags is set to 0 and we have no business touching it. Only an ioctl() - * is allowed to write to them. We should of course be honouring them but - * we need to do that using the IS_* macros defined in include/linux/fs.h. - * In any case ntfs_read_locked_inode() has nothing to do with i_flags. - * - * Return 0 on success and -errno on error. In the error case, the inode will - * have had make_bad_inode() executed on it. - */ -static int ntfs_read_locked_inode(struct inode *vi) -{ - ntfs_volume *vol = NTFS_SB(vi->i_sb); - ntfs_inode *ni; - struct inode *bvi; - MFT_RECORD *m; - ATTR_RECORD *a; - STANDARD_INFORMATION *si; - ntfs_attr_search_ctx *ctx; - int err = 0; - - ntfs_debug("Entering for i_ino 0x%lx.", vi->i_ino); - - /* Setup the generic vfs inode parts now. */ - vi->i_uid = vol->uid; - vi->i_gid = vol->gid; - vi->i_mode = 0; - - /* - * Initialize the ntfs specific part of @vi special casing - * FILE_MFT which we need to do at mount time. - */ - if (vi->i_ino != FILE_MFT) - ntfs_init_big_inode(vi); - ni = NTFS_I(vi); - - m = map_mft_record(ni); - if (IS_ERR(m)) { - err = PTR_ERR(m); - goto err_out; - } - ctx = ntfs_attr_get_search_ctx(ni, m); - if (!ctx) { - err = -ENOMEM; - goto unm_err_out; - } - - if (!(m->flags & MFT_RECORD_IN_USE)) { - ntfs_error(vi->i_sb, "Inode is not in use!"); - goto unm_err_out; - } - if (m->base_mft_record) { - ntfs_error(vi->i_sb, "Inode is an extent inode!"); - goto unm_err_out; - } - - /* Transfer information from mft record into vfs and ntfs inodes. */ - vi->i_generation = ni->seq_no = le16_to_cpu(m->sequence_number); - - /* - * FIXME: Keep in mind that link_count is two for files which have both - * a long file name and a short file name as separate entries, so if - * we are hiding short file names this will be too high. Either we need - * to account for the short file names by subtracting them or we need - * to make sure we delete files even though i_nlink is not zero which - * might be tricky due to vfs interactions. Need to think about this - * some more when implementing the unlink command. - */ - set_nlink(vi, le16_to_cpu(m->link_count)); - /* - * FIXME: Reparse points can have the directory bit set even though - * they would be S_IFLNK. Need to deal with this further below when we - * implement reparse points / symbolic links but it will do for now. - * Also if not a directory, it could be something else, rather than - * a regular file. But again, will do for now. - */ - /* Everyone gets all permissions. */ - vi->i_mode |= S_IRWXUGO; - /* If read-only, no one gets write permissions. */ - if (IS_RDONLY(vi)) - vi->i_mode &= ~S_IWUGO; - if (m->flags & MFT_RECORD_IS_DIRECTORY) { - vi->i_mode |= S_IFDIR; - /* - * Apply the directory permissions mask set in the mount - * options. - */ - vi->i_mode &= ~vol->dmask; - /* Things break without this kludge! */ - if (vi->i_nlink > 1) - set_nlink(vi, 1); - } else { - vi->i_mode |= S_IFREG; - /* Apply the file permissions mask set in the mount options. */ - vi->i_mode &= ~vol->fmask; - } - /* - * Find the standard information attribute in the mft record. 
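
The permission setup just above amounts to a umask-style derivation. Assuming vol->dmask/vol->fmask carry only rwx bits (as the mount options define them), it is equivalent to:

        umode_t mode = S_IRWXUGO;               /* everyone gets everything  */

        if (IS_RDONLY(vi))
                mode &= ~S_IWUGO;               /* read-only mount: no write */
        if (m->flags & MFT_RECORD_IS_DIRECTORY)
                mode = S_IFDIR | (mode & ~vol->dmask);
        else
                mode = S_IFREG | (mode & ~vol->fmask);
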
At this - * stage we haven't set up the attribute list stuff yet, so this could - * in fact fail if the standard information is in an extent record, but - * I don't think this actually ever happens. - */ - err = ntfs_attr_lookup(AT_STANDARD_INFORMATION, NULL, 0, 0, 0, NULL, 0, - ctx); - if (unlikely(err)) { - if (err == -ENOENT) { - /* - * TODO: We should be performing a hot fix here (if the - * recover mount option is set) by creating a new - * attribute. - */ - ntfs_error(vi->i_sb, "$STANDARD_INFORMATION attribute " - "is missing."); - } - goto unm_err_out; - } - a = ctx->attr; - /* Get the standard information attribute value. */ - if ((u8 *)a + le16_to_cpu(a->data.resident.value_offset) - + le32_to_cpu(a->data.resident.value_length) > - (u8 *)ctx->mrec + vol->mft_record_size) { - ntfs_error(vi->i_sb, "Corrupt standard information attribute in inode."); - goto unm_err_out; - } - si = (STANDARD_INFORMATION*)((u8*)a + - le16_to_cpu(a->data.resident.value_offset)); - - /* Transfer information from the standard information into vi. */ - /* - * Note: The i_?times do not quite map perfectly onto the NTFS times, - * but they are close enough, and in the end it doesn't really matter - * that much... - */ - /* - * mtime is the last change of the data within the file. Not changed - * when only metadata is changed, e.g. a rename doesn't affect mtime. - */ - inode_set_mtime_to_ts(vi, ntfs2utc(si->last_data_change_time)); - /* - * ctime is the last change of the metadata of the file. This obviously - * always changes when mtime is changed. ctime can also be changed on - * its own; mtime is then not changed, e.g. when a file is renamed. - */ - inode_set_ctime_to_ts(vi, ntfs2utc(si->last_mft_change_time)); - /* - * Last access to the data within the file. Not changed during a rename - * for example but changed whenever the file is written to. - */ - inode_set_atime_to_ts(vi, ntfs2utc(si->last_access_time)); - - /* Find the attribute list attribute if present. */ - ntfs_attr_reinit_search_ctx(ctx); - err = ntfs_attr_lookup(AT_ATTRIBUTE_LIST, NULL, 0, 0, 0, NULL, 0, ctx); - if (err) { - if (unlikely(err != -ENOENT)) { - ntfs_error(vi->i_sb, "Failed to lookup attribute list " - "attribute."); - goto unm_err_out; - } - } else /* if (!err) */ { - if (vi->i_ino == FILE_MFT) - goto skip_attr_list_load; - ntfs_debug("Attribute list found in inode 0x%lx.", vi->i_ino); - NInoSetAttrList(ni); - a = ctx->attr; - if (a->flags & ATTR_COMPRESSION_MASK) { - ntfs_error(vi->i_sb, "Attribute list attribute is " - "compressed."); - goto unm_err_out; - } - if (a->flags & ATTR_IS_ENCRYPTED || - a->flags & ATTR_IS_SPARSE) { - if (a->non_resident) { - ntfs_error(vi->i_sb, "Non-resident attribute " - "list attribute is encrypted/" - "sparse."); - goto unm_err_out; - } - ntfs_warning(vi->i_sb, "Resident attribute list " - "attribute in inode 0x%lx is marked " - "encrypted/sparse which is not true. " - "However, Windows allows this and " - "chkdsk does not detect or correct it " - "so we will just ignore the invalid " - "flags and pretend they are not set.", - vi->i_ino); - } - /* Now allocate memory for the attribute list. 
*/ - ni->attr_list_size = (u32)ntfs_attr_size(a); - ni->attr_list = ntfs_malloc_nofs(ni->attr_list_size); - if (!ni->attr_list) { - ntfs_error(vi->i_sb, "Not enough memory to allocate " - "buffer for attribute list."); - err = -ENOMEM; - goto unm_err_out; - } - if (a->non_resident) { - NInoSetAttrListNonResident(ni); - if (a->data.non_resident.lowest_vcn) { - ntfs_error(vi->i_sb, "Attribute list has non " - "zero lowest_vcn."); - goto unm_err_out; - } - /* - * Setup the runlist. No need for locking as we have - * exclusive access to the inode at this time. - */ - ni->attr_list_rl.rl = ntfs_mapping_pairs_decompress(vol, - a, NULL); - if (IS_ERR(ni->attr_list_rl.rl)) { - err = PTR_ERR(ni->attr_list_rl.rl); - ni->attr_list_rl.rl = NULL; - ntfs_error(vi->i_sb, "Mapping pairs " - "decompression failed."); - goto unm_err_out; - } - /* Now load the attribute list. */ - if ((err = load_attribute_list(vol, &ni->attr_list_rl, - ni->attr_list, ni->attr_list_size, - sle64_to_cpu(a->data.non_resident. - initialized_size)))) { - ntfs_error(vi->i_sb, "Failed to load " - "attribute list attribute."); - goto unm_err_out; - } - } else /* if (!a->non_resident) */ { - if ((u8*)a + le16_to_cpu(a->data.resident.value_offset) - + le32_to_cpu( - a->data.resident.value_length) > - (u8*)ctx->mrec + vol->mft_record_size) { - ntfs_error(vi->i_sb, "Corrupt attribute list " - "in inode."); - goto unm_err_out; - } - /* Now copy the attribute list. */ - memcpy(ni->attr_list, (u8*)a + le16_to_cpu( - a->data.resident.value_offset), - le32_to_cpu( - a->data.resident.value_length)); - } - } -skip_attr_list_load: - /* - * If an attribute list is present we now have the attribute list value - * in ntfs_ino->attr_list and it is ntfs_ino->attr_list_size bytes. - */ - if (S_ISDIR(vi->i_mode)) { - loff_t bvi_size; - ntfs_inode *bni; - INDEX_ROOT *ir; - u8 *ir_end, *index_end; - - /* It is a directory, find index root attribute. */ - ntfs_attr_reinit_search_ctx(ctx); - err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, - 0, NULL, 0, ctx); - if (unlikely(err)) { - if (err == -ENOENT) { - // FIXME: File is corrupt! Hot-fix with empty - // index root attribute if recovery option is - // set. - ntfs_error(vi->i_sb, "$INDEX_ROOT attribute " - "is missing."); - } - goto unm_err_out; - } - a = ctx->attr; - /* Set up the state. */ - if (unlikely(a->non_resident)) { - ntfs_error(vol->sb, "$INDEX_ROOT attribute is not " - "resident."); - goto unm_err_out; - } - /* Ensure the attribute name is placed before the value. */ - if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >= - le16_to_cpu(a->data.resident.value_offset)))) { - ntfs_error(vol->sb, "$INDEX_ROOT attribute name is " - "placed after the attribute value."); - goto unm_err_out; - } - /* - * Compressed/encrypted index root just means that the newly - * created files in that directory should be created compressed/ - * encrypted. However index root cannot be both compressed and - * encrypted. 
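
The attribute list loading above is one instance of the resident/non-resident split applied to every attribute value in this driver; schematically (same calls as above, validation trimmed):

        if (a->non_resident) {
                /* Value lives out in clusters: decode the mapping pairs, */
                ni->attr_list_rl.rl = ntfs_mapping_pairs_decompress(vol, a, NULL);
                /* ... then read the value in through the runlist. */
                err = load_attribute_list(vol, &ni->attr_list_rl,
                                ni->attr_list, ni->attr_list_size,
                                sle64_to_cpu(a->data.non_resident.
                                initialized_size));
        } else {
                /* Value is embedded directly in the mft record. */
                memcpy(ni->attr_list, (u8*)a +
                                le16_to_cpu(a->data.resident.value_offset),
                                le32_to_cpu(a->data.resident.value_length));
        }
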
- */ - if (a->flags & ATTR_COMPRESSION_MASK) - NInoSetCompressed(ni); - if (a->flags & ATTR_IS_ENCRYPTED) { - if (a->flags & ATTR_COMPRESSION_MASK) { - ntfs_error(vi->i_sb, "Found encrypted and " - "compressed attribute."); - goto unm_err_out; - } - NInoSetEncrypted(ni); - } - if (a->flags & ATTR_IS_SPARSE) - NInoSetSparse(ni); - ir = (INDEX_ROOT*)((u8*)a + - le16_to_cpu(a->data.resident.value_offset)); - ir_end = (u8*)ir + le32_to_cpu(a->data.resident.value_length); - if (ir_end > (u8*)ctx->mrec + vol->mft_record_size) { - ntfs_error(vi->i_sb, "$INDEX_ROOT attribute is " - "corrupt."); - goto unm_err_out; - } - index_end = (u8*)&ir->index + - le32_to_cpu(ir->index.index_length); - if (index_end > ir_end) { - ntfs_error(vi->i_sb, "Directory index is corrupt."); - goto unm_err_out; - } - if (ir->type != AT_FILE_NAME) { - ntfs_error(vi->i_sb, "Indexed attribute is not " - "$FILE_NAME."); - goto unm_err_out; - } - if (ir->collation_rule != COLLATION_FILE_NAME) { - ntfs_error(vi->i_sb, "Index collation rule is not " - "COLLATION_FILE_NAME."); - goto unm_err_out; - } - ni->itype.index.collation_rule = ir->collation_rule; - ni->itype.index.block_size = le32_to_cpu(ir->index_block_size); - if (ni->itype.index.block_size & - (ni->itype.index.block_size - 1)) { - ntfs_error(vi->i_sb, "Index block size (%u) is not a " - "power of two.", - ni->itype.index.block_size); - goto unm_err_out; - } - if (ni->itype.index.block_size > PAGE_SIZE) { - ntfs_error(vi->i_sb, "Index block size (%u) > " - "PAGE_SIZE (%ld) is not " - "supported. Sorry.", - ni->itype.index.block_size, - PAGE_SIZE); - err = -EOPNOTSUPP; - goto unm_err_out; - } - if (ni->itype.index.block_size < NTFS_BLOCK_SIZE) { - ntfs_error(vi->i_sb, "Index block size (%u) < " - "NTFS_BLOCK_SIZE (%i) is not " - "supported. Sorry.", - ni->itype.index.block_size, - NTFS_BLOCK_SIZE); - err = -EOPNOTSUPP; - goto unm_err_out; - } - ni->itype.index.block_size_bits = - ffs(ni->itype.index.block_size) - 1; - /* Determine the size of a vcn in the directory index. */ - if (vol->cluster_size <= ni->itype.index.block_size) { - ni->itype.index.vcn_size = vol->cluster_size; - ni->itype.index.vcn_size_bits = vol->cluster_size_bits; - } else { - ni->itype.index.vcn_size = vol->sector_size; - ni->itype.index.vcn_size_bits = vol->sector_size_bits; - } - - /* Setup the index allocation attribute, even if not present. */ - NInoSetMstProtected(ni); - ni->type = AT_INDEX_ALLOCATION; - ni->name = I30; - ni->name_len = 4; - - if (!(ir->index.flags & LARGE_INDEX)) { - /* No index allocation. */ - vi->i_size = ni->initialized_size = - ni->allocated_size = 0; - /* We are done with the mft record, so we release it. */ - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(ni); - m = NULL; - ctx = NULL; - goto skip_large_dir_stuff; - } /* LARGE_INDEX: Index allocation present. Setup state. */ - NInoSetIndexAllocPresent(ni); - /* Find index allocation attribute. */ - ntfs_attr_reinit_search_ctx(ctx); - err = ntfs_attr_lookup(AT_INDEX_ALLOCATION, I30, 4, - CASE_SENSITIVE, 0, NULL, 0, ctx); - if (unlikely(err)) { - if (err == -ENOENT) - ntfs_error(vi->i_sb, "$INDEX_ALLOCATION " - "attribute is not present but " - "$INDEX_ROOT indicated it is."); - else - ntfs_error(vi->i_sb, "Failed to lookup " - "$INDEX_ALLOCATION " - "attribute."); - goto unm_err_out; - } - a = ctx->attr; - if (!a->non_resident) { - ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute " - "is resident."); - goto unm_err_out; - } - /* - * Ensure the attribute name is placed before the mapping pairs - * array. 
- */ - if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >= - le16_to_cpu( - a->data.non_resident.mapping_pairs_offset)))) { - ntfs_error(vol->sb, "$INDEX_ALLOCATION attribute name " - "is placed after the mapping pairs " - "array."); - goto unm_err_out; - } - if (a->flags & ATTR_IS_ENCRYPTED) { - ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute " - "is encrypted."); - goto unm_err_out; - } - if (a->flags & ATTR_IS_SPARSE) { - ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute " - "is sparse."); - goto unm_err_out; - } - if (a->flags & ATTR_COMPRESSION_MASK) { - ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute " - "is compressed."); - goto unm_err_out; - } - if (a->data.non_resident.lowest_vcn) { - ntfs_error(vi->i_sb, "First extent of " - "$INDEX_ALLOCATION attribute has non " - "zero lowest_vcn."); - goto unm_err_out; - } - vi->i_size = sle64_to_cpu(a->data.non_resident.data_size); - ni->initialized_size = sle64_to_cpu( - a->data.non_resident.initialized_size); - ni->allocated_size = sle64_to_cpu( - a->data.non_resident.allocated_size); - /* - * We are done with the mft record, so we release it. Otherwise - * we would deadlock in ntfs_attr_iget(). - */ - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(ni); - m = NULL; - ctx = NULL; - /* Get the index bitmap attribute inode. */ - bvi = ntfs_attr_iget(vi, AT_BITMAP, I30, 4); - if (IS_ERR(bvi)) { - ntfs_error(vi->i_sb, "Failed to get bitmap attribute."); - err = PTR_ERR(bvi); - goto unm_err_out; - } - bni = NTFS_I(bvi); - if (NInoCompressed(bni) || NInoEncrypted(bni) || - NInoSparse(bni)) { - ntfs_error(vi->i_sb, "$BITMAP attribute is compressed " - "and/or encrypted and/or sparse."); - goto iput_unm_err_out; - } - /* Consistency check bitmap size vs. index allocation size. */ - bvi_size = i_size_read(bvi); - if ((bvi_size << 3) < (vi->i_size >> - ni->itype.index.block_size_bits)) { - ntfs_error(vi->i_sb, "Index bitmap too small (0x%llx) " - "for index allocation (0x%llx).", - bvi_size << 3, vi->i_size); - goto iput_unm_err_out; - } - /* No longer need the bitmap attribute inode. */ - iput(bvi); -skip_large_dir_stuff: - /* Setup the operations for this inode. */ - vi->i_op = &ntfs_dir_inode_ops; - vi->i_fop = &ntfs_dir_ops; - vi->i_mapping->a_ops = &ntfs_mst_aops; - } else { - /* It is a file. */ - ntfs_attr_reinit_search_ctx(ctx); - - /* Setup the data attribute, even if not present. */ - ni->type = AT_DATA; - ni->name = NULL; - ni->name_len = 0; - - /* Find first extent of the unnamed data attribute. */ - err = ntfs_attr_lookup(AT_DATA, NULL, 0, 0, 0, NULL, 0, ctx); - if (unlikely(err)) { - vi->i_size = ni->initialized_size = - ni->allocated_size = 0; - if (err != -ENOENT) { - ntfs_error(vi->i_sb, "Failed to lookup $DATA " - "attribute."); - goto unm_err_out; - } - /* - * FILE_Secure does not have an unnamed $DATA - * attribute, so we special case it here. - */ - if (vi->i_ino == FILE_Secure) - goto no_data_attr_special_case; - /* - * Most if not all the system files in the $Extend - * system directory do not have unnamed data - * attributes so we need to check if the parent - * directory of the file is FILE_Extend and if it is - * ignore this error. To do this we need to get the - * name of this inode from the mft record as the name - * contains the back reference to the parent directory. - */ - if (ntfs_is_extended_system_file(ctx) > 0) - goto no_data_attr_special_case; - // FIXME: File is corrupt! Hot-fix with empty data - // attribute if recovery option is set. 
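
As an aside on the directory branch above, the bitmap-vs-allocation check compares bits to index blocks; a worked example (numbers are illustrative):

        /*
         * With 4kiB index blocks (block_size_bits == 12), a 16MiB index
         * allocation holds 16MiB >> 12 = 4096 index blocks, so the bitmap
         * must span at least 4096 bits, i.e. bvi_size >= 512 bytes;
         * bvi_size << 3 converts the bitmap size from bytes to bits.
         */
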
- ntfs_error(vi->i_sb, "$DATA attribute is missing."); - goto unm_err_out; - } - a = ctx->attr; - /* Setup the state. */ - if (a->flags & (ATTR_COMPRESSION_MASK | ATTR_IS_SPARSE)) { - if (a->flags & ATTR_COMPRESSION_MASK) { - NInoSetCompressed(ni); - if (vol->cluster_size > 4096) { - ntfs_error(vi->i_sb, "Found " - "compressed data but " - "compression is " - "disabled due to " - "cluster size (%i) > " - "4kiB.", - vol->cluster_size); - goto unm_err_out; - } - if ((a->flags & ATTR_COMPRESSION_MASK) - != ATTR_IS_COMPRESSED) { - ntfs_error(vi->i_sb, "Found unknown " - "compression method " - "or corrupt file."); - goto unm_err_out; - } - } - if (a->flags & ATTR_IS_SPARSE) - NInoSetSparse(ni); - } - if (a->flags & ATTR_IS_ENCRYPTED) { - if (NInoCompressed(ni)) { - ntfs_error(vi->i_sb, "Found encrypted and " - "compressed data."); - goto unm_err_out; - } - NInoSetEncrypted(ni); - } - if (a->non_resident) { - NInoSetNonResident(ni); - if (NInoCompressed(ni) || NInoSparse(ni)) { - if (NInoCompressed(ni) && a->data.non_resident. - compression_unit != 4) { - ntfs_error(vi->i_sb, "Found " - "non-standard " - "compression unit (%u " - "instead of 4). " - "Cannot handle this.", - a->data.non_resident. - compression_unit); - err = -EOPNOTSUPP; - goto unm_err_out; - } - if (a->data.non_resident.compression_unit) { - ni->itype.compressed.block_size = 1U << - (a->data.non_resident. - compression_unit + - vol->cluster_size_bits); - ni->itype.compressed.block_size_bits = - ffs(ni->itype. - compressed. - block_size) - 1; - ni->itype.compressed.block_clusters = - 1U << a->data. - non_resident. - compression_unit; - } else { - ni->itype.compressed.block_size = 0; - ni->itype.compressed.block_size_bits = - 0; - ni->itype.compressed.block_clusters = - 0; - } - ni->itype.compressed.size = sle64_to_cpu( - a->data.non_resident. - compressed_size); - } - if (a->data.non_resident.lowest_vcn) { - ntfs_error(vi->i_sb, "First extent of $DATA " - "attribute has non zero " - "lowest_vcn."); - goto unm_err_out; - } - vi->i_size = sle64_to_cpu( - a->data.non_resident.data_size); - ni->initialized_size = sle64_to_cpu( - a->data.non_resident.initialized_size); - ni->allocated_size = sle64_to_cpu( - a->data.non_resident.allocated_size); - } else { /* Resident attribute. */ - vi->i_size = ni->initialized_size = le32_to_cpu( - a->data.resident.value_length); - ni->allocated_size = le32_to_cpu(a->length) - - le16_to_cpu( - a->data.resident.value_offset); - if (vi->i_size > ni->allocated_size) { - ntfs_error(vi->i_sb, "Resident data attribute " - "is corrupt (size exceeds " - "allocation)."); - goto unm_err_out; - } - } -no_data_attr_special_case: - /* We are done with the mft record, so we release it. */ - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(ni); - m = NULL; - ctx = NULL; - /* Setup the operations for this inode. */ - vi->i_op = &ntfs_file_inode_ops; - vi->i_fop = &ntfs_file_ops; - vi->i_mapping->a_ops = &ntfs_normal_aops; - if (NInoMstProtected(ni)) - vi->i_mapping->a_ops = &ntfs_mst_aops; - else if (NInoCompressed(ni)) - vi->i_mapping->a_ops = &ntfs_compressed_aops; - } - /* - * The number of 512-byte blocks used on disk (for stat). This is in so - * far inaccurate as it doesn't account for any named streams or other - * special non-resident attributes, but that is how Windows works, too, - * so we are at least consistent with Windows, if not entirely - * consistent with the Linux Way. 
Doing it the Linux Way would cause a - * significant slowdown as it would involve iterating over all - * attributes in the mft record and adding the allocated/compressed - * sizes of all non-resident attributes present to give us the Linux - * correct size that should go into i_blocks (after division by 512). - */ - if (S_ISREG(vi->i_mode) && (NInoCompressed(ni) || NInoSparse(ni))) - vi->i_blocks = ni->itype.compressed.size >> 9; - else - vi->i_blocks = ni->allocated_size >> 9; - ntfs_debug("Done."); - return 0; -iput_unm_err_out: - iput(bvi); -unm_err_out: - if (!err) - err = -EIO; - if (ctx) - ntfs_attr_put_search_ctx(ctx); - if (m) - unmap_mft_record(ni); -err_out: - ntfs_error(vol->sb, "Failed with error code %i. Marking corrupt " - "inode 0x%lx as bad. Run chkdsk.", err, vi->i_ino); - make_bad_inode(vi); - if (err != -EOPNOTSUPP && err != -ENOMEM) - NVolSetErrors(vol); - return err; -} - -/** - * ntfs_read_locked_attr_inode - read an attribute inode from its base inode - * @base_vi: base inode - * @vi: attribute inode to read - * - * ntfs_read_locked_attr_inode() is called from ntfs_attr_iget() to read the - * attribute inode described by @vi into memory from the base mft record - * described by @base_ni. - * - * ntfs_read_locked_attr_inode() maps, pins and locks the base inode for - * reading and looks up the attribute described by @vi before setting up the - * necessary fields in @vi as well as initializing the ntfs inode. - * - * Q: What locks are held when the function is called? - * A: i_state has I_NEW set, hence the inode is locked, also - * i_count is set to 1, so it is not going to go away - * - * Return 0 on success and -errno on error. In the error case, the inode will - * have had make_bad_inode() executed on it. - * - * Note this cannot be called for AT_INDEX_ALLOCATION. - */ -static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi) -{ - ntfs_volume *vol = NTFS_SB(vi->i_sb); - ntfs_inode *ni, *base_ni; - MFT_RECORD *m; - ATTR_RECORD *a; - ntfs_attr_search_ctx *ctx; - int err = 0; - - ntfs_debug("Entering for i_ino 0x%lx.", vi->i_ino); - - ntfs_init_big_inode(vi); - - ni = NTFS_I(vi); - base_ni = NTFS_I(base_vi); - - /* Just mirror the values from the base inode. */ - vi->i_uid = base_vi->i_uid; - vi->i_gid = base_vi->i_gid; - set_nlink(vi, base_vi->i_nlink); - inode_set_mtime_to_ts(vi, inode_get_mtime(base_vi)); - inode_set_ctime_to_ts(vi, inode_get_ctime(base_vi)); - inode_set_atime_to_ts(vi, inode_get_atime(base_vi)); - vi->i_generation = ni->seq_no = base_ni->seq_no; - - /* Set inode type to zero but preserve permissions. */ - vi->i_mode = base_vi->i_mode & ~S_IFMT; - - m = map_mft_record(base_ni); - if (IS_ERR(m)) { - err = PTR_ERR(m); - goto err_out; - } - ctx = ntfs_attr_get_search_ctx(base_ni, m); - if (!ctx) { - err = -ENOMEM; - goto unm_err_out; - } - /* Find the attribute. */ - err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, - CASE_SENSITIVE, 0, NULL, 0, ctx); - if (unlikely(err)) - goto unm_err_out; - a = ctx->attr; - if (a->flags & (ATTR_COMPRESSION_MASK | ATTR_IS_SPARSE)) { - if (a->flags & ATTR_COMPRESSION_MASK) { - NInoSetCompressed(ni); - if ((ni->type != AT_DATA) || (ni->type == AT_DATA && - ni->name_len)) { - ntfs_error(vi->i_sb, "Found compressed " - "non-data or named data " - "attribute. Please report " - "you saw this message to " - "linux-ntfs-dev@lists." 
- "sourceforge.net"); - goto unm_err_out; - } - if (vol->cluster_size > 4096) { - ntfs_error(vi->i_sb, "Found compressed " - "attribute but compression is " - "disabled due to cluster size " - "(%i) > 4kiB.", - vol->cluster_size); - goto unm_err_out; - } - if ((a->flags & ATTR_COMPRESSION_MASK) != - ATTR_IS_COMPRESSED) { - ntfs_error(vi->i_sb, "Found unknown " - "compression method."); - goto unm_err_out; - } - } - /* - * The compressed/sparse flag set in an index root just means - * to compress all files. - */ - if (NInoMstProtected(ni) && ni->type != AT_INDEX_ROOT) { - ntfs_error(vi->i_sb, "Found mst protected attribute " - "but the attribute is %s. Please " - "report you saw this message to " - "linux-ntfs-dev@lists.sourceforge.net", - NInoCompressed(ni) ? "compressed" : - "sparse"); - goto unm_err_out; - } - if (a->flags & ATTR_IS_SPARSE) - NInoSetSparse(ni); - } - if (a->flags & ATTR_IS_ENCRYPTED) { - if (NInoCompressed(ni)) { - ntfs_error(vi->i_sb, "Found encrypted and compressed " - "data."); - goto unm_err_out; - } - /* - * The encryption flag set in an index root just means to - * encrypt all files. - */ - if (NInoMstProtected(ni) && ni->type != AT_INDEX_ROOT) { - ntfs_error(vi->i_sb, "Found mst protected attribute " - "but the attribute is encrypted. " - "Please report you saw this message " - "to linux-ntfs-dev@lists.sourceforge." - "net"); - goto unm_err_out; - } - if (ni->type != AT_DATA) { - ntfs_error(vi->i_sb, "Found encrypted non-data " - "attribute."); - goto unm_err_out; - } - NInoSetEncrypted(ni); - } - if (!a->non_resident) { - /* Ensure the attribute name is placed before the value. */ - if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >= - le16_to_cpu(a->data.resident.value_offset)))) { - ntfs_error(vol->sb, "Attribute name is placed after " - "the attribute value."); - goto unm_err_out; - } - if (NInoMstProtected(ni)) { - ntfs_error(vi->i_sb, "Found mst protected attribute " - "but the attribute is resident. " - "Please report you saw this message to " - "linux-ntfs-dev@lists.sourceforge.net"); - goto unm_err_out; - } - vi->i_size = ni->initialized_size = le32_to_cpu( - a->data.resident.value_length); - ni->allocated_size = le32_to_cpu(a->length) - - le16_to_cpu(a->data.resident.value_offset); - if (vi->i_size > ni->allocated_size) { - ntfs_error(vi->i_sb, "Resident attribute is corrupt " - "(size exceeds allocation)."); - goto unm_err_out; - } - } else { - NInoSetNonResident(ni); - /* - * Ensure the attribute name is placed before the mapping pairs - * array. - */ - if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >= - le16_to_cpu( - a->data.non_resident.mapping_pairs_offset)))) { - ntfs_error(vol->sb, "Attribute name is placed after " - "the mapping pairs array."); - goto unm_err_out; - } - if (NInoCompressed(ni) || NInoSparse(ni)) { - if (NInoCompressed(ni) && a->data.non_resident. - compression_unit != 4) { - ntfs_error(vi->i_sb, "Found non-standard " - "compression unit (%u instead " - "of 4). Cannot handle this.", - a->data.non_resident. - compression_unit); - err = -EOPNOTSUPP; - goto unm_err_out; - } - if (a->data.non_resident.compression_unit) { - ni->itype.compressed.block_size = 1U << - (a->data.non_resident. - compression_unit + - vol->cluster_size_bits); - ni->itype.compressed.block_size_bits = - ffs(ni->itype.compressed. - block_size) - 1; - ni->itype.compressed.block_clusters = 1U << - a->data.non_resident. 
- compression_unit; - } else { - ni->itype.compressed.block_size = 0; - ni->itype.compressed.block_size_bits = 0; - ni->itype.compressed.block_clusters = 0; - } - ni->itype.compressed.size = sle64_to_cpu( - a->data.non_resident.compressed_size); - } - if (a->data.non_resident.lowest_vcn) { - ntfs_error(vi->i_sb, "First extent of attribute has " - "non-zero lowest_vcn."); - goto unm_err_out; - } - vi->i_size = sle64_to_cpu(a->data.non_resident.data_size); - ni->initialized_size = sle64_to_cpu( - a->data.non_resident.initialized_size); - ni->allocated_size = sle64_to_cpu( - a->data.non_resident.allocated_size); - } - vi->i_mapping->a_ops = &ntfs_normal_aops; - if (NInoMstProtected(ni)) - vi->i_mapping->a_ops = &ntfs_mst_aops; - else if (NInoCompressed(ni)) - vi->i_mapping->a_ops = &ntfs_compressed_aops; - if ((NInoCompressed(ni) || NInoSparse(ni)) && ni->type != AT_INDEX_ROOT) - vi->i_blocks = ni->itype.compressed.size >> 9; - else - vi->i_blocks = ni->allocated_size >> 9; - /* - * Make sure the base inode does not go away and attach it to the - * attribute inode. - */ - igrab(base_vi); - ni->ext.base_ntfs_ino = base_ni; - ni->nr_extents = -1; - - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(base_ni); - - ntfs_debug("Done."); - return 0; - -unm_err_out: - if (!err) - err = -EIO; - if (ctx) - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(base_ni); -err_out: - ntfs_error(vol->sb, "Failed with error code %i while reading attribute " - "inode (mft_no 0x%lx, type 0x%x, name_len %i). " - "Marking corrupt inode and base inode 0x%lx as bad. " - "Run chkdsk.", err, vi->i_ino, ni->type, ni->name_len, - base_vi->i_ino); - make_bad_inode(vi); - if (err != -ENOMEM) - NVolSetErrors(vol); - return err; -} - -/** - * ntfs_read_locked_index_inode - read an index inode from its base inode - * @base_vi: base inode - * @vi: index inode to read - * - * ntfs_read_locked_index_inode() is called from ntfs_index_iget() to read the - * index inode described by @vi into memory from the base mft record described - * by @base_ni. - * - * ntfs_read_locked_index_inode() maps, pins and locks the base inode for - * reading and looks up the attributes relating to the index described by @vi - * before setting up the necessary fields in @vi as well as initializing the - * ntfs inode. - * - * Note, index inodes are essentially attribute inodes (NInoAttr() is true) - * with the attribute type set to AT_INDEX_ALLOCATION. Apart from that, they - * are set up like directory inodes since directories are a special case of - * indices so they need to be treated in much the same way. Most importantly, - * for small indices the index allocation attribute might not actually exist. - * However, the index root attribute always exists but this does not need to - * have an inode associated with it and this is why we define a new inode type - * index. Also, like for directories, we need to have an attribute inode for - * the bitmap attribute corresponding to the index allocation attribute and we - * can store this in the appropriate field of the inode, just like we do for - * normal directory inodes. - * - * Q: What locks are held when the function is called? - * A: i_state has I_NEW set, hence the inode is locked, also - * i_count is set to 1, so it is not going to go away - * - * Return 0 on success and -errno on error. In the error case, the inode will - * have had make_bad_inode() executed on it. 
- */ -static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi) -{ - loff_t bvi_size; - ntfs_volume *vol = NTFS_SB(vi->i_sb); - ntfs_inode *ni, *base_ni, *bni; - struct inode *bvi; - MFT_RECORD *m; - ATTR_RECORD *a; - ntfs_attr_search_ctx *ctx; - INDEX_ROOT *ir; - u8 *ir_end, *index_end; - int err = 0; - - ntfs_debug("Entering for i_ino 0x%lx.", vi->i_ino); - ntfs_init_big_inode(vi); - ni = NTFS_I(vi); - base_ni = NTFS_I(base_vi); - /* Just mirror the values from the base inode. */ - vi->i_uid = base_vi->i_uid; - vi->i_gid = base_vi->i_gid; - set_nlink(vi, base_vi->i_nlink); - inode_set_mtime_to_ts(vi, inode_get_mtime(base_vi)); - inode_set_ctime_to_ts(vi, inode_get_ctime(base_vi)); - inode_set_atime_to_ts(vi, inode_get_atime(base_vi)); - vi->i_generation = ni->seq_no = base_ni->seq_no; - /* Set inode type to zero but preserve permissions. */ - vi->i_mode = base_vi->i_mode & ~S_IFMT; - /* Map the mft record for the base inode. */ - m = map_mft_record(base_ni); - if (IS_ERR(m)) { - err = PTR_ERR(m); - goto err_out; - } - ctx = ntfs_attr_get_search_ctx(base_ni, m); - if (!ctx) { - err = -ENOMEM; - goto unm_err_out; - } - /* Find the index root attribute. */ - err = ntfs_attr_lookup(AT_INDEX_ROOT, ni->name, ni->name_len, - CASE_SENSITIVE, 0, NULL, 0, ctx); - if (unlikely(err)) { - if (err == -ENOENT) - ntfs_error(vi->i_sb, "$INDEX_ROOT attribute is " - "missing."); - goto unm_err_out; - } - a = ctx->attr; - /* Set up the state. */ - if (unlikely(a->non_resident)) { - ntfs_error(vol->sb, "$INDEX_ROOT attribute is not resident."); - goto unm_err_out; - } - /* Ensure the attribute name is placed before the value. */ - if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >= - le16_to_cpu(a->data.resident.value_offset)))) { - ntfs_error(vol->sb, "$INDEX_ROOT attribute name is placed " - "after the attribute value."); - goto unm_err_out; - } - /* - * Compressed/encrypted/sparse index root is not allowed, except for - * directories of course but those are not dealt with here. - */ - if (a->flags & (ATTR_COMPRESSION_MASK | ATTR_IS_ENCRYPTED | - ATTR_IS_SPARSE)) { - ntfs_error(vi->i_sb, "Found compressed/encrypted/sparse index " - "root attribute."); - goto unm_err_out; - } - ir = (INDEX_ROOT*)((u8*)a + le16_to_cpu(a->data.resident.value_offset)); - ir_end = (u8*)ir + le32_to_cpu(a->data.resident.value_length); - if (ir_end > (u8*)ctx->mrec + vol->mft_record_size) { - ntfs_error(vi->i_sb, "$INDEX_ROOT attribute is corrupt."); - goto unm_err_out; - } - index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length); - if (index_end > ir_end) { - ntfs_error(vi->i_sb, "Index is corrupt."); - goto unm_err_out; - } - if (ir->type) { - ntfs_error(vi->i_sb, "Index type is not 0 (type is 0x%x).", - le32_to_cpu(ir->type)); - goto unm_err_out; - } - ni->itype.index.collation_rule = ir->collation_rule; - ntfs_debug("Index collation rule is 0x%x.", - le32_to_cpu(ir->collation_rule)); - ni->itype.index.block_size = le32_to_cpu(ir->index_block_size); - if (!is_power_of_2(ni->itype.index.block_size)) { - ntfs_error(vi->i_sb, "Index block size (%u) is not a power of " - "two.", ni->itype.index.block_size); - goto unm_err_out; - } - if (ni->itype.index.block_size > PAGE_SIZE) { - ntfs_error(vi->i_sb, "Index block size (%u) > PAGE_SIZE " - "(%ld) is not supported. 
Sorry.", - ni->itype.index.block_size, PAGE_SIZE); - err = -EOPNOTSUPP; - goto unm_err_out; - } - if (ni->itype.index.block_size < NTFS_BLOCK_SIZE) { - ntfs_error(vi->i_sb, "Index block size (%u) < NTFS_BLOCK_SIZE " - "(%i) is not supported. Sorry.", - ni->itype.index.block_size, NTFS_BLOCK_SIZE); - err = -EOPNOTSUPP; - goto unm_err_out; - } - ni->itype.index.block_size_bits = ffs(ni->itype.index.block_size) - 1; - /* Determine the size of a vcn in the index. */ - if (vol->cluster_size <= ni->itype.index.block_size) { - ni->itype.index.vcn_size = vol->cluster_size; - ni->itype.index.vcn_size_bits = vol->cluster_size_bits; - } else { - ni->itype.index.vcn_size = vol->sector_size; - ni->itype.index.vcn_size_bits = vol->sector_size_bits; - } - /* Check for presence of index allocation attribute. */ - if (!(ir->index.flags & LARGE_INDEX)) { - /* No index allocation. */ - vi->i_size = ni->initialized_size = ni->allocated_size = 0; - /* We are done with the mft record, so we release it. */ - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(base_ni); - m = NULL; - ctx = NULL; - goto skip_large_index_stuff; - } /* LARGE_INDEX: Index allocation present. Setup state. */ - NInoSetIndexAllocPresent(ni); - /* Find index allocation attribute. */ - ntfs_attr_reinit_search_ctx(ctx); - err = ntfs_attr_lookup(AT_INDEX_ALLOCATION, ni->name, ni->name_len, - CASE_SENSITIVE, 0, NULL, 0, ctx); - if (unlikely(err)) { - if (err == -ENOENT) - ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is " - "not present but $INDEX_ROOT " - "indicated it is."); - else - ntfs_error(vi->i_sb, "Failed to lookup " - "$INDEX_ALLOCATION attribute."); - goto unm_err_out; - } - a = ctx->attr; - if (!a->non_resident) { - ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is " - "resident."); - goto unm_err_out; - } - /* - * Ensure the attribute name is placed before the mapping pairs array. - */ - if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >= - le16_to_cpu( - a->data.non_resident.mapping_pairs_offset)))) { - ntfs_error(vol->sb, "$INDEX_ALLOCATION attribute name is " - "placed after the mapping pairs array."); - goto unm_err_out; - } - if (a->flags & ATTR_IS_ENCRYPTED) { - ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is " - "encrypted."); - goto unm_err_out; - } - if (a->flags & ATTR_IS_SPARSE) { - ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is sparse."); - goto unm_err_out; - } - if (a->flags & ATTR_COMPRESSION_MASK) { - ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is " - "compressed."); - goto unm_err_out; - } - if (a->data.non_resident.lowest_vcn) { - ntfs_error(vi->i_sb, "First extent of $INDEX_ALLOCATION " - "attribute has non zero lowest_vcn."); - goto unm_err_out; - } - vi->i_size = sle64_to_cpu(a->data.non_resident.data_size); - ni->initialized_size = sle64_to_cpu( - a->data.non_resident.initialized_size); - ni->allocated_size = sle64_to_cpu(a->data.non_resident.allocated_size); - /* - * We are done with the mft record, so we release it. Otherwise - * we would deadlock in ntfs_attr_iget(). - */ - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(base_ni); - m = NULL; - ctx = NULL; - /* Get the index bitmap attribute inode. 
*/
- bvi = ntfs_attr_iget(base_vi, AT_BITMAP, ni->name, ni->name_len);
- if (IS_ERR(bvi)) {
- ntfs_error(vi->i_sb, "Failed to get bitmap attribute.");
- err = PTR_ERR(bvi);
- goto unm_err_out;
- }
- bni = NTFS_I(bvi);
- if (NInoCompressed(bni) || NInoEncrypted(bni) ||
- NInoSparse(bni)) {
- ntfs_error(vi->i_sb, "$BITMAP attribute is compressed and/or "
- "encrypted and/or sparse.");
- goto iput_unm_err_out;
- }
- /* Consistency check bitmap size vs. index allocation size. */
- bvi_size = i_size_read(bvi);
- if ((bvi_size << 3) < (vi->i_size >> ni->itype.index.block_size_bits)) {
- ntfs_error(vi->i_sb, "Index bitmap too small (0x%llx) for "
- "index allocation (0x%llx).", bvi_size << 3,
- vi->i_size);
- goto iput_unm_err_out;
- }
- iput(bvi);
-skip_large_index_stuff:
- /* Setup the operations for this index inode. */
- vi->i_mapping->a_ops = &ntfs_mst_aops;
- vi->i_blocks = ni->allocated_size >> 9;
- /*
- * Make sure the base inode doesn't go away and attach it to the
- * index inode.
- */
- igrab(base_vi);
- ni->ext.base_ntfs_ino = base_ni;
- ni->nr_extents = -1;
-
- ntfs_debug("Done.");
- return 0;
-iput_unm_err_out:
- iput(bvi);
-unm_err_out:
- if (!err)
- err = -EIO;
- if (ctx)
- ntfs_attr_put_search_ctx(ctx);
- if (m)
- unmap_mft_record(base_ni);
-err_out:
- ntfs_error(vi->i_sb, "Failed with error code %i while reading index "
- "inode (mft_no 0x%lx, name_len %i).", err, vi->i_ino,
- ni->name_len);
- make_bad_inode(vi);
- if (err != -EOPNOTSUPP && err != -ENOMEM)
- NVolSetErrors(vol);
- return err;
-}
-
-/*
- * The MFT inode has special locking, so teach the lock validator
- * about this by splitting off the locking rules of the MFT from
- * the locking rules of other inodes. The MFT inode can never be
- * accessed from the VFS side (or even internally), only by the
- * map_mft functions.
- */
-static struct lock_class_key mft_ni_runlist_lock_key, mft_ni_mrec_lock_key;
-
-/**
- * ntfs_read_inode_mount - special read_inode for mount time use only
- * @vi: inode to read
- *
- * Read inode FILE_MFT at mount time, only called with super_block lock
- * held from within the read_super() code path.
- *
- * This function exists because when it is called the page cache for $MFT/$DATA
- * is not initialized and hence we cannot get at the contents of mft records
- * by calling map_mft_record*().
- *
- * Further it needs to cope with the circular references problem, i.e. it
- * cannot load any attributes other than $ATTRIBUTE_LIST until $DATA is loaded,
- * because we do not know where the other extent mft records are yet and again,
- * because we cannot call map_mft_record*() yet. Obviously this applies only
- * when an attribute list is actually present in the $MFT inode.
- *
- * We solve these problems by starting with the $DATA attribute before anything
- * else and iterating using ntfs_attr_lookup($DATA) over all extents. As each
- * extent is found, we ntfs_mapping_pairs_decompress() including the implied
- * ntfs_runlists_merge(). Each step of the iteration necessarily provides
- * sufficient information for the next step to complete.
- *
- * This should work but there are two possible pitfalls (see inline comments
- * below), but only time will tell if they are real pits or just smoke...
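- *
- * As a rough sketch of that bootstrap iteration (illustrative only; the
- * real loop in the function body below adds error handling and the
- * first-extent special cases):
- *
- *	next_vcn = 0;
- *	while (!ntfs_attr_lookup(AT_DATA, NULL, 0, 0, next_vcn, NULL, 0,
- *			ctx)) {
- *		ni->runlist.rl = ntfs_mapping_pairs_decompress(vol,
- *				ctx->attr, ni->runlist.rl);
- *		next_vcn = sle64_to_cpu(
- *				ctx->attr->data.non_resident.highest_vcn) + 1;
- *	}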
- */ -int ntfs_read_inode_mount(struct inode *vi) -{ - VCN next_vcn, last_vcn, highest_vcn; - s64 block; - struct super_block *sb = vi->i_sb; - ntfs_volume *vol = NTFS_SB(sb); - struct buffer_head *bh; - ntfs_inode *ni; - MFT_RECORD *m = NULL; - ATTR_RECORD *a; - ntfs_attr_search_ctx *ctx; - unsigned int i, nr_blocks; - int err; - - ntfs_debug("Entering."); - - /* Initialize the ntfs specific part of @vi. */ - ntfs_init_big_inode(vi); - - ni = NTFS_I(vi); - - /* Setup the data attribute. It is special as it is mst protected. */ - NInoSetNonResident(ni); - NInoSetMstProtected(ni); - NInoSetSparseDisabled(ni); - ni->type = AT_DATA; - ni->name = NULL; - ni->name_len = 0; - /* - * This sets up our little cheat allowing us to reuse the async read io - * completion handler for directories. - */ - ni->itype.index.block_size = vol->mft_record_size; - ni->itype.index.block_size_bits = vol->mft_record_size_bits; - - /* Very important! Needed to be able to call map_mft_record*(). */ - vol->mft_ino = vi; - - /* Allocate enough memory to read the first mft record. */ - if (vol->mft_record_size > 64 * 1024) { - ntfs_error(sb, "Unsupported mft record size %i (max 64kiB).", - vol->mft_record_size); - goto err_out; - } - i = vol->mft_record_size; - if (i < sb->s_blocksize) - i = sb->s_blocksize; - m = (MFT_RECORD*)ntfs_malloc_nofs(i); - if (!m) { - ntfs_error(sb, "Failed to allocate buffer for $MFT record 0."); - goto err_out; - } - - /* Determine the first block of the $MFT/$DATA attribute. */ - block = vol->mft_lcn << vol->cluster_size_bits >> - sb->s_blocksize_bits; - nr_blocks = vol->mft_record_size >> sb->s_blocksize_bits; - if (!nr_blocks) - nr_blocks = 1; - - /* Load $MFT/$DATA's first mft record. */ - for (i = 0; i < nr_blocks; i++) { - bh = sb_bread(sb, block++); - if (!bh) { - ntfs_error(sb, "Device read failed."); - goto err_out; - } - memcpy((char*)m + (i << sb->s_blocksize_bits), bh->b_data, - sb->s_blocksize); - brelse(bh); - } - - if (le32_to_cpu(m->bytes_allocated) != vol->mft_record_size) { - ntfs_error(sb, "Incorrect mft record size %u in superblock, should be %u.", - le32_to_cpu(m->bytes_allocated), vol->mft_record_size); - goto err_out; - } - - /* Apply the mst fixups. */ - if (post_read_mst_fixup((NTFS_RECORD*)m, vol->mft_record_size)) { - /* FIXME: Try to use the $MFTMirr now. */ - ntfs_error(sb, "MST fixup failed. $MFT is corrupt."); - goto err_out; - } - - /* Sanity check offset to the first attribute */ - if (le16_to_cpu(m->attrs_offset) >= le32_to_cpu(m->bytes_allocated)) { - ntfs_error(sb, "Incorrect mft offset to the first attribute %u in superblock.", - le16_to_cpu(m->attrs_offset)); - goto err_out; - } - - /* Need this to sanity check attribute list references to $MFT. */ - vi->i_generation = ni->seq_no = le16_to_cpu(m->sequence_number); - - /* Provides read_folio() for map_mft_record(). */ - vi->i_mapping->a_ops = &ntfs_mst_aops; - - ctx = ntfs_attr_get_search_ctx(ni, m); - if (!ctx) { - err = -ENOMEM; - goto err_out; - } - - /* Find the attribute list attribute if present. */ - err = ntfs_attr_lookup(AT_ATTRIBUTE_LIST, NULL, 0, 0, 0, NULL, 0, ctx); - if (err) { - if (unlikely(err != -ENOENT)) { - ntfs_error(sb, "Failed to lookup attribute list " - "attribute. You should run chkdsk."); - goto put_err_out; - } - } else /* if (!err) */ { - ATTR_LIST_ENTRY *al_entry, *next_al_entry; - u8 *al_end; - static const char *es = " Not allowed. $MFT is corrupt. 
" - "You should run chkdsk."; - - ntfs_debug("Attribute list attribute found in $MFT."); - NInoSetAttrList(ni); - a = ctx->attr; - if (a->flags & ATTR_COMPRESSION_MASK) { - ntfs_error(sb, "Attribute list attribute is " - "compressed.%s", es); - goto put_err_out; - } - if (a->flags & ATTR_IS_ENCRYPTED || - a->flags & ATTR_IS_SPARSE) { - if (a->non_resident) { - ntfs_error(sb, "Non-resident attribute list " - "attribute is encrypted/" - "sparse.%s", es); - goto put_err_out; - } - ntfs_warning(sb, "Resident attribute list attribute " - "in $MFT system file is marked " - "encrypted/sparse which is not true. " - "However, Windows allows this and " - "chkdsk does not detect or correct it " - "so we will just ignore the invalid " - "flags and pretend they are not set."); - } - /* Now allocate memory for the attribute list. */ - ni->attr_list_size = (u32)ntfs_attr_size(a); - if (!ni->attr_list_size) { - ntfs_error(sb, "Attr_list_size is zero"); - goto put_err_out; - } - ni->attr_list = ntfs_malloc_nofs(ni->attr_list_size); - if (!ni->attr_list) { - ntfs_error(sb, "Not enough memory to allocate buffer " - "for attribute list."); - goto put_err_out; - } - if (a->non_resident) { - NInoSetAttrListNonResident(ni); - if (a->data.non_resident.lowest_vcn) { - ntfs_error(sb, "Attribute list has non zero " - "lowest_vcn. $MFT is corrupt. " - "You should run chkdsk."); - goto put_err_out; - } - /* Setup the runlist. */ - ni->attr_list_rl.rl = ntfs_mapping_pairs_decompress(vol, - a, NULL); - if (IS_ERR(ni->attr_list_rl.rl)) { - err = PTR_ERR(ni->attr_list_rl.rl); - ni->attr_list_rl.rl = NULL; - ntfs_error(sb, "Mapping pairs decompression " - "failed with error code %i.", - -err); - goto put_err_out; - } - /* Now load the attribute list. */ - if ((err = load_attribute_list(vol, &ni->attr_list_rl, - ni->attr_list, ni->attr_list_size, - sle64_to_cpu(a->data. - non_resident.initialized_size)))) { - ntfs_error(sb, "Failed to load attribute list " - "attribute with error code %i.", - -err); - goto put_err_out; - } - } else /* if (!ctx.attr->non_resident) */ { - if ((u8*)a + le16_to_cpu( - a->data.resident.value_offset) + - le32_to_cpu( - a->data.resident.value_length) > - (u8*)ctx->mrec + vol->mft_record_size) { - ntfs_error(sb, "Corrupt attribute list " - "attribute."); - goto put_err_out; - } - /* Now copy the attribute list. */ - memcpy(ni->attr_list, (u8*)a + le16_to_cpu( - a->data.resident.value_offset), - le32_to_cpu( - a->data.resident.value_length)); - } - /* The attribute list is now setup in memory. */ - /* - * FIXME: I don't know if this case is actually possible. - * According to logic it is not possible but I have seen too - * many weird things in MS software to rely on logic... Thus we - * perform a manual search and make sure the first $MFT/$DATA - * extent is in the base inode. If it is not we abort with an - * error and if we ever see a report of this error we will need - * to do some magic in order to have the necessary mft record - * loaded and in the right place in the page cache. But - * hopefully logic will prevail and this never happens... - */ - al_entry = (ATTR_LIST_ENTRY*)ni->attr_list; - al_end = (u8*)al_entry + ni->attr_list_size; - for (;; al_entry = next_al_entry) { - /* Out of bounds check. */ - if ((u8*)al_entry < ni->attr_list || - (u8*)al_entry > al_end) - goto em_put_err_out; - /* Catch the end of the attribute list. 
*/ - if ((u8*)al_entry == al_end) - goto em_put_err_out; - if (!al_entry->length) - goto em_put_err_out; - if ((u8*)al_entry + 6 > al_end || (u8*)al_entry + - le16_to_cpu(al_entry->length) > al_end) - goto em_put_err_out; - next_al_entry = (ATTR_LIST_ENTRY*)((u8*)al_entry + - le16_to_cpu(al_entry->length)); - if (le32_to_cpu(al_entry->type) > le32_to_cpu(AT_DATA)) - goto em_put_err_out; - if (AT_DATA != al_entry->type) - continue; - /* We want an unnamed attribute. */ - if (al_entry->name_length) - goto em_put_err_out; - /* Want the first entry, i.e. lowest_vcn == 0. */ - if (al_entry->lowest_vcn) - goto em_put_err_out; - /* First entry has to be in the base mft record. */ - if (MREF_LE(al_entry->mft_reference) != vi->i_ino) { - /* MFT references do not match, logic fails. */ - ntfs_error(sb, "BUG: The first $DATA extent " - "of $MFT is not in the base " - "mft record. Please report " - "you saw this message to " - "linux-ntfs-dev@lists." - "sourceforge.net"); - goto put_err_out; - } else { - /* Sequence numbers must match. */ - if (MSEQNO_LE(al_entry->mft_reference) != - ni->seq_no) - goto em_put_err_out; - /* Got it. All is ok. We can stop now. */ - break; - } - } - } - - ntfs_attr_reinit_search_ctx(ctx); - - /* Now load all attribute extents. */ - a = NULL; - next_vcn = last_vcn = highest_vcn = 0; - while (!(err = ntfs_attr_lookup(AT_DATA, NULL, 0, 0, next_vcn, NULL, 0, - ctx))) { - runlist_element *nrl; - - /* Cache the current attribute. */ - a = ctx->attr; - /* $MFT must be non-resident. */ - if (!a->non_resident) { - ntfs_error(sb, "$MFT must be non-resident but a " - "resident extent was found. $MFT is " - "corrupt. Run chkdsk."); - goto put_err_out; - } - /* $MFT must be uncompressed and unencrypted. */ - if (a->flags & ATTR_COMPRESSION_MASK || - a->flags & ATTR_IS_ENCRYPTED || - a->flags & ATTR_IS_SPARSE) { - ntfs_error(sb, "$MFT must be uncompressed, " - "non-sparse, and unencrypted but a " - "compressed/sparse/encrypted extent " - "was found. $MFT is corrupt. Run " - "chkdsk."); - goto put_err_out; - } - /* - * Decompress the mapping pairs array of this extent and merge - * the result into the existing runlist. No need for locking - * as we have exclusive access to the inode at this time and we - * are a mount in progress task, too. - */ - nrl = ntfs_mapping_pairs_decompress(vol, a, ni->runlist.rl); - if (IS_ERR(nrl)) { - ntfs_error(sb, "ntfs_mapping_pairs_decompress() " - "failed with error code %ld. $MFT is " - "corrupt.", PTR_ERR(nrl)); - goto put_err_out; - } - ni->runlist.rl = nrl; - - /* Are we in the first extent? */ - if (!next_vcn) { - if (a->data.non_resident.lowest_vcn) { - ntfs_error(sb, "First extent of $DATA " - "attribute has non zero " - "lowest_vcn. $MFT is corrupt. " - "You should run chkdsk."); - goto put_err_out; - } - /* Get the last vcn in the $DATA attribute. */ - last_vcn = sle64_to_cpu( - a->data.non_resident.allocated_size) - >> vol->cluster_size_bits; - /* Fill in the inode size. */ - vi->i_size = sle64_to_cpu( - a->data.non_resident.data_size); - ni->initialized_size = sle64_to_cpu( - a->data.non_resident.initialized_size); - ni->allocated_size = sle64_to_cpu( - a->data.non_resident.allocated_size); - /* - * Verify the number of mft records does not exceed - * 2^32 - 1. - */ - if ((vi->i_size >> vol->mft_record_size_bits) >= - (1ULL << 32)) { - ntfs_error(sb, "$MFT is too big! 
Aborting.");
- goto put_err_out;
- }
- /*
- * We have got the first extent of the runlist for
- * $MFT which means it is now relatively safe to call
- * the normal ntfs_read_inode() function.
- * Complete reading the inode, this will actually
- * re-read the mft record for $MFT, this time entering
- * it into the page cache with which we complete the
- * kick start of the volume. It should be safe to do
- * this now as the first extent of $MFT/$DATA is
- * already known and we would hope that we don't need
- * further extents in order to find the other
- * attributes belonging to $MFT. Only time will tell if
- * this is really the case. If not we will have to play
- * magic at this point, possibly duplicating a lot of
- * ntfs_read_inode() at this point. We will need to
- * ensure we do enough of its work to be able to call
- * ntfs_read_inode() on extents of $MFT/$DATA. But let's
- * hope this never happens...
- */
- ntfs_read_locked_inode(vi);
- if (is_bad_inode(vi)) {
- ntfs_error(sb, "ntfs_read_inode() of $MFT "
- "failed. BUG or corrupt $MFT. "
- "Run chkdsk and if no errors "
- "are found, please report you "
- "saw this message to "
- "linux-ntfs-dev@lists."
- "sourceforge.net");
- ntfs_attr_put_search_ctx(ctx);
- /* Revert to the safe super operations. */
- ntfs_free(m);
- return -1;
- }
- /*
- * Re-initialize some specifics about $MFT's inode as
- * ntfs_read_inode() will have set up the default ones.
- */
- /* Set uid and gid to root. */
- vi->i_uid = GLOBAL_ROOT_UID;
- vi->i_gid = GLOBAL_ROOT_GID;
- /* Regular file. No access for anyone. */
- vi->i_mode = S_IFREG;
- /* No VFS initiated operations allowed for $MFT. */
- vi->i_op = &ntfs_empty_inode_ops;
- vi->i_fop = &ntfs_empty_file_ops;
- }
-
- /* Get the lowest vcn for the next extent. */
- highest_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn);
- next_vcn = highest_vcn + 1;
-
- /* Only one extent or error, which we catch below. */
- if (next_vcn <= 0)
- break;
-
- /* Avoid endless loops due to corruption. */
- if (next_vcn < sle64_to_cpu(
- a->data.non_resident.lowest_vcn)) {
- ntfs_error(sb, "$MFT has corrupt attribute list "
- "attribute. Run chkdsk.");
- goto put_err_out;
- }
- }
- if (err != -ENOENT) {
- ntfs_error(sb, "Failed to lookup $MFT/$DATA attribute extent. "
- "$MFT is corrupt. Run chkdsk.");
- goto put_err_out;
- }
- if (!a) {
- ntfs_error(sb, "$MFT/$DATA attribute not found. $MFT is "
- "corrupt. Run chkdsk.");
- goto put_err_out;
- }
- if (highest_vcn && highest_vcn != last_vcn - 1) {
- ntfs_error(sb, "Failed to load the complete runlist for "
- "$MFT/$DATA. Driver bug or corrupt $MFT. "
- "Run chkdsk.");
- ntfs_debug("highest_vcn = 0x%llx, last_vcn - 1 = 0x%llx",
- (unsigned long long)highest_vcn,
- (unsigned long long)last_vcn - 1);
- goto put_err_out;
- }
- ntfs_attr_put_search_ctx(ctx);
- ntfs_debug("Done.");
- ntfs_free(m);
-
- /*
- * Split the locking rules of the MFT inode from the
- * locking rules of other inodes:
- */
- lockdep_set_class(&ni->runlist.lock, &mft_ni_runlist_lock_key);
- lockdep_set_class(&ni->mrec_lock, &mft_ni_mrec_lock_key);
-
- return 0;
-
-em_put_err_out:
- ntfs_error(sb, "Couldn't find first extent of $DATA attribute in "
- "attribute list. $MFT is corrupt. Run chkdsk.");
-put_err_out:
- ntfs_attr_put_search_ctx(ctx);
-err_out:
- ntfs_error(sb, "Failed. Marking inode as bad.");
- make_bad_inode(vi);
- ntfs_free(m);
- return -1;
-}
-
-static void __ntfs_clear_inode(ntfs_inode *ni)
-{
- /* Free all allocated memory. */
- down_write(&ni->runlist.lock);
- if (ni->runlist.rl) {
- ntfs_free(ni->runlist.rl);
- ni->runlist.rl = NULL;
- }
- up_write(&ni->runlist.lock);
-
- if (ni->attr_list) {
- ntfs_free(ni->attr_list);
- ni->attr_list = NULL;
- }
-
- down_write(&ni->attr_list_rl.lock);
- if (ni->attr_list_rl.rl) {
- ntfs_free(ni->attr_list_rl.rl);
- ni->attr_list_rl.rl = NULL;
- }
- up_write(&ni->attr_list_rl.lock);
-
- if (ni->name_len && ni->name != I30) {
- /* Catch bugs... */
- BUG_ON(!ni->name);
- kfree(ni->name);
- }
-}
-
-void ntfs_clear_extent_inode(ntfs_inode *ni)
-{
- ntfs_debug("Entering for inode 0x%lx.", ni->mft_no);
-
- BUG_ON(NInoAttr(ni));
- BUG_ON(ni->nr_extents != -1);
-
-#ifdef NTFS_RW
- if (NInoDirty(ni)) {
- if (!is_bad_inode(VFS_I(ni->ext.base_ntfs_ino)))
- ntfs_error(ni->vol->sb, "Clearing dirty extent inode! "
- "Losing data! This is a BUG!!!");
- // FIXME: Do something!!!
- }
-#endif /* NTFS_RW */
-
- __ntfs_clear_inode(ni);
-
- /* Bye, bye... */
- ntfs_destroy_extent_inode(ni);
-}
-
-/**
- * ntfs_evict_big_inode - clean up the ntfs specific part of an inode
- * @vi: vfs inode pending annihilation
- *
- * When the VFS is going to remove an inode from memory, ntfs_evict_big_inode()
- * is called, which deallocates all memory belonging to the NTFS specific part
- * of the inode and returns.
- *
- * If the MFT record is dirty, we commit it before doing anything else.
- */
-void ntfs_evict_big_inode(struct inode *vi)
-{
- ntfs_inode *ni = NTFS_I(vi);
-
- truncate_inode_pages_final(&vi->i_data);
- clear_inode(vi);
-
-#ifdef NTFS_RW
- if (NInoDirty(ni)) {
- bool was_bad = (is_bad_inode(vi));
-
- /* Committing the inode also commits all extent inodes. */
- ntfs_commit_inode(vi);
-
- if (!was_bad && (is_bad_inode(vi) || NInoDirty(ni))) {
- ntfs_error(vi->i_sb, "Failed to commit dirty inode "
- "0x%lx. Losing data!", vi->i_ino);
- // FIXME: Do something!!!
- }
- }
-#endif /* NTFS_RW */
-
- /* No need to lock at this stage as no one else has a reference. */
- if (ni->nr_extents > 0) {
- int i;
-
- for (i = 0; i < ni->nr_extents; i++)
- ntfs_clear_extent_inode(ni->ext.extent_ntfs_inos[i]);
- kfree(ni->ext.extent_ntfs_inos);
- }
-
- __ntfs_clear_inode(ni);
-
- if (NInoAttr(ni)) {
- /* Release the base inode if we are holding it. */
- if (ni->nr_extents == -1) {
- iput(VFS_I(ni->ext.base_ntfs_ino));
- ni->nr_extents = 0;
- ni->ext.base_ntfs_ino = NULL;
- }
- }
- BUG_ON(ni->page);
- if (!atomic_dec_and_test(&ni->count))
- BUG();
- return;
-}
-
-/**
- * ntfs_show_options - show mount options in /proc/mounts
- * @sf: seq_file in which to write our mount options
- * @root: root of the mounted tree whose mount options to display
- *
- * Called by the VFS once for each mounted ntfs volume when someone reads
- * /proc/mounts in order to display the NTFS specific mount options of each
- * mount. The mount options of the fs specified by @root are written to the seq file
- * @sf and success is returned.
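- *
- * For example, a volume mounted with default options would produce output
- * roughly like the following (values are illustrative only):
- *
- *	,uid=0,gid=0,umask=022,nls=utf8,errors=continue,mft_zone_multiplier=1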
- */ -int ntfs_show_options(struct seq_file *sf, struct dentry *root) -{ - ntfs_volume *vol = NTFS_SB(root->d_sb); - int i; - - seq_printf(sf, ",uid=%i", from_kuid_munged(&init_user_ns, vol->uid)); - seq_printf(sf, ",gid=%i", from_kgid_munged(&init_user_ns, vol->gid)); - if (vol->fmask == vol->dmask) - seq_printf(sf, ",umask=0%o", vol->fmask); - else { - seq_printf(sf, ",fmask=0%o", vol->fmask); - seq_printf(sf, ",dmask=0%o", vol->dmask); - } - seq_printf(sf, ",nls=%s", vol->nls_map->charset); - if (NVolCaseSensitive(vol)) - seq_printf(sf, ",case_sensitive"); - if (NVolShowSystemFiles(vol)) - seq_printf(sf, ",show_sys_files"); - if (!NVolSparseEnabled(vol)) - seq_printf(sf, ",disable_sparse"); - for (i = 0; on_errors_arr[i].val; i++) { - if (on_errors_arr[i].val & vol->on_errors) - seq_printf(sf, ",errors=%s", on_errors_arr[i].str); - } - seq_printf(sf, ",mft_zone_multiplier=%i", vol->mft_zone_multiplier); - return 0; -} - -#ifdef NTFS_RW - -static const char *es = " Leaving inconsistent metadata. Unmount and run " - "chkdsk."; - -/** - * ntfs_truncate - called when the i_size of an ntfs inode is changed - * @vi: inode for which the i_size was changed - * - * We only support i_size changes for normal files at present, i.e. not - * compressed and not encrypted. This is enforced in ntfs_setattr(), see - * below. - * - * The kernel guarantees that @vi is a regular file (S_ISREG() is true) and - * that the change is allowed. - * - * This implies for us that @vi is a file inode rather than a directory, index, - * or attribute inode as well as that @vi is a base inode. - * - * Returns 0 on success or -errno on error. - * - * Called with ->i_mutex held. - */ -int ntfs_truncate(struct inode *vi) -{ - s64 new_size, old_size, nr_freed, new_alloc_size, old_alloc_size; - VCN highest_vcn; - unsigned long flags; - ntfs_inode *base_ni, *ni = NTFS_I(vi); - ntfs_volume *vol = ni->vol; - ntfs_attr_search_ctx *ctx; - MFT_RECORD *m; - ATTR_RECORD *a; - const char *te = " Leaving file length out of sync with i_size."; - int err, mp_size, size_change, alloc_change; - - ntfs_debug("Entering for inode 0x%lx.", vi->i_ino); - BUG_ON(NInoAttr(ni)); - BUG_ON(S_ISDIR(vi->i_mode)); - BUG_ON(NInoMstProtected(ni)); - BUG_ON(ni->nr_extents < 0); -retry_truncate: - /* - * Lock the runlist for writing and map the mft record to ensure it is - * safe to mess with the attribute runlist and sizes. - */ - down_write(&ni->runlist.lock); - if (!NInoAttr(ni)) - base_ni = ni; - else - base_ni = ni->ext.base_ntfs_ino; - m = map_mft_record(base_ni); - if (IS_ERR(m)) { - err = PTR_ERR(m); - ntfs_error(vi->i_sb, "Failed to map mft record for inode 0x%lx " - "(error code %d).%s", vi->i_ino, err, te); - ctx = NULL; - m = NULL; - goto old_bad_out; - } - ctx = ntfs_attr_get_search_ctx(base_ni, m); - if (unlikely(!ctx)) { - ntfs_error(vi->i_sb, "Failed to allocate a search context for " - "inode 0x%lx (not enough memory).%s", - vi->i_ino, te); - err = -ENOMEM; - goto old_bad_out; - } - err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, - CASE_SENSITIVE, 0, NULL, 0, ctx); - if (unlikely(err)) { - if (err == -ENOENT) { - ntfs_error(vi->i_sb, "Open attribute is missing from " - "mft record. Inode 0x%lx is corrupt. " - "Run chkdsk.%s", vi->i_ino, te); - err = -EIO; - } else - ntfs_error(vi->i_sb, "Failed to lookup attribute in " - "inode 0x%lx (error code %d).%s", - vi->i_ino, err, te); - goto old_bad_out; - } - m = ctx->mrec; - a = ctx->attr; - /* - * The i_size of the vfs inode is the new size for the attribute value. 
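- * The allocated size computed just below rounds this up: to the cluster
- * size for non-resident attributes and to an 8-byte boundary for resident
- * ones. E.g. (illustrative), with a 4096-byte cluster a new size of 5001
- * bytes gives a new allocated size of 8192 bytes if the attribute is
- * non-resident and of 5008 bytes if it is resident.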
- */ - new_size = i_size_read(vi); - /* The current size of the attribute value is the old size. */ - old_size = ntfs_attr_size(a); - /* Calculate the new allocated size. */ - if (NInoNonResident(ni)) - new_alloc_size = (new_size + vol->cluster_size - 1) & - ~(s64)vol->cluster_size_mask; - else - new_alloc_size = (new_size + 7) & ~7; - /* The current allocated size is the old allocated size. */ - read_lock_irqsave(&ni->size_lock, flags); - old_alloc_size = ni->allocated_size; - read_unlock_irqrestore(&ni->size_lock, flags); - /* - * The change in the file size. This will be 0 if no change, >0 if the - * size is growing, and <0 if the size is shrinking. - */ - size_change = -1; - if (new_size - old_size >= 0) { - size_change = 1; - if (new_size == old_size) - size_change = 0; - } - /* As above for the allocated size. */ - alloc_change = -1; - if (new_alloc_size - old_alloc_size >= 0) { - alloc_change = 1; - if (new_alloc_size == old_alloc_size) - alloc_change = 0; - } - /* - * If neither the size nor the allocation are being changed there is - * nothing to do. - */ - if (!size_change && !alloc_change) - goto unm_done; - /* If the size is changing, check if new size is allowed in $AttrDef. */ - if (size_change) { - err = ntfs_attr_size_bounds_check(vol, ni->type, new_size); - if (unlikely(err)) { - if (err == -ERANGE) { - ntfs_error(vol->sb, "Truncate would cause the " - "inode 0x%lx to %simum size " - "for its attribute type " - "(0x%x). Aborting truncate.", - vi->i_ino, - new_size > old_size ? "exceed " - "the max" : "go under the min", - le32_to_cpu(ni->type)); - err = -EFBIG; - } else { - ntfs_error(vol->sb, "Inode 0x%lx has unknown " - "attribute type 0x%x. " - "Aborting truncate.", - vi->i_ino, - le32_to_cpu(ni->type)); - err = -EIO; - } - /* Reset the vfs inode size to the old size. */ - i_size_write(vi, old_size); - goto err_out; - } - } - if (NInoCompressed(ni) || NInoEncrypted(ni)) { - ntfs_warning(vi->i_sb, "Changes in inode size are not " - "supported yet for %s files, ignoring.", - NInoCompressed(ni) ? "compressed" : - "encrypted"); - err = -EOPNOTSUPP; - goto bad_out; - } - if (a->non_resident) - goto do_non_resident_truncate; - BUG_ON(NInoNonResident(ni)); - /* Resize the attribute record to best fit the new attribute size. */ - if (new_size < vol->mft_record_size && - !ntfs_resident_attr_value_resize(m, a, new_size)) { - /* The resize succeeded! */ - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - write_lock_irqsave(&ni->size_lock, flags); - /* Update the sizes in the ntfs inode and all is done. */ - ni->allocated_size = le32_to_cpu(a->length) - - le16_to_cpu(a->data.resident.value_offset); - /* - * Note ntfs_resident_attr_value_resize() has already done any - * necessary data clearing in the attribute record. When the - * file is being shrunk vmtruncate() will already have cleared - * the top part of the last partial page, i.e. since this is - * the resident case this is the page with index 0. However, - * when the file is being expanded, the page cache page data - * between the old data_size, i.e. old_size, and the new_size - * has not been zeroed. Fortunately, we do not need to zero it - * either since on one hand it will either already be zero due - * to both read_folio and writepage clearing partial page data - * beyond i_size in which case there is nothing to do or in the - * case of the file being mmap()ped at the same time, POSIX - * specifies that the behaviour is unspecified thus we do not - * have to do anything. 
This means that in our implementation - * in the rare case that the file is mmap()ped and a write - * occurred into the mmap()ped region just beyond the file size - * and writepage has not yet been called to write out the page - * (which would clear the area beyond the file size) and we now - * extend the file size to incorporate this dirty region - * outside the file size, a write of the page would result in - * this data being written to disk instead of being cleared. - * Given both POSIX and the Linux mmap(2) man page specify that - * this corner case is undefined, we choose to leave it like - * that as this is much simpler for us as we cannot lock the - * relevant page now since we are holding too many ntfs locks - * which would result in a lock reversal deadlock. - */ - ni->initialized_size = new_size; - write_unlock_irqrestore(&ni->size_lock, flags); - goto unm_done; - } - /* If the above resize failed, this must be an attribute extension. */ - BUG_ON(size_change < 0); - /* - * We have to drop all the locks so we can call - * ntfs_attr_make_non_resident(). This could be optimised by try- - * locking the first page cache page and only if that fails dropping - * the locks, locking the page, and redoing all the locking and - * lookups. While this would be a huge optimisation, it is not worth - * it as this is definitely a slow code path as it only ever can happen - * once for any given file. - */ - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(base_ni); - up_write(&ni->runlist.lock); - /* - * Not enough space in the mft record, try to make the attribute - * non-resident and if successful restart the truncation process. - */ - err = ntfs_attr_make_non_resident(ni, old_size); - if (likely(!err)) - goto retry_truncate; - /* - * Could not make non-resident. If this is due to this not being - * permitted for this attribute type or there not being enough space, - * try to make other attributes non-resident. Otherwise fail. - */ - if (unlikely(err != -EPERM && err != -ENOSPC)) { - ntfs_error(vol->sb, "Cannot truncate inode 0x%lx, attribute " - "type 0x%x, because the conversion from " - "resident to non-resident attribute failed " - "with error code %i.", vi->i_ino, - (unsigned)le32_to_cpu(ni->type), err); - if (err != -ENOMEM) - err = -EIO; - goto conv_err_out; - } - /* TODO: Not implemented from here, abort. */ - if (err == -ENOSPC) - ntfs_error(vol->sb, "Not enough space in the mft record/on " - "disk for the non-resident attribute value. " - "This case is not implemented yet."); - else /* if (err == -EPERM) */ - ntfs_error(vol->sb, "This attribute type may not be " - "non-resident. This case is not implemented " - "yet."); - err = -EOPNOTSUPP; - goto conv_err_out; -#if 0 - // TODO: Attempt to make other attributes non-resident. - if (!err) - goto do_resident_extend; - /* - * Both the attribute list attribute and the standard information - * attribute must remain in the base inode. Thus, if this is one of - * these attributes, we have to try to move other attributes out into - * extent mft records instead. - */ - if (ni->type == AT_ATTRIBUTE_LIST || - ni->type == AT_STANDARD_INFORMATION) { - // TODO: Attempt to move other attributes into extent mft - // records. - err = -EOPNOTSUPP; - if (!err) - goto do_resident_extend; - goto err_out; - } - // TODO: Attempt to move this attribute to an extent mft record, but - // only if it is not already the only attribute in an mft record in - // which case there would be nothing to gain. 
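- // Moving the sole attribute of an mft record would gain nothing, as the
- // space freed here would simply be consumed in the target record. A
- // sketch of what such a call might look like (hypothetical helper name,
- // nothing like it is implemented anywhere in the driver):
- // err = ntfs_attr_move_to_extent_mft_record(ni);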
- err = -EOPNOTSUPP; - if (!err) - goto do_resident_extend; - /* There is nothing we can do to make enough space. )-: */ - goto err_out; -#endif -do_non_resident_truncate: - BUG_ON(!NInoNonResident(ni)); - if (alloc_change < 0) { - highest_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn); - if (highest_vcn > 0 && - old_alloc_size >> vol->cluster_size_bits > - highest_vcn + 1) { - /* - * This attribute has multiple extents. Not yet - * supported. - */ - ntfs_error(vol->sb, "Cannot truncate inode 0x%lx, " - "attribute type 0x%x, because the " - "attribute is highly fragmented (it " - "consists of multiple extents) and " - "this case is not implemented yet.", - vi->i_ino, - (unsigned)le32_to_cpu(ni->type)); - err = -EOPNOTSUPP; - goto bad_out; - } - } - /* - * If the size is shrinking, need to reduce the initialized_size and - * the data_size before reducing the allocation. - */ - if (size_change < 0) { - /* - * Make the valid size smaller (i_size is already up-to-date). - */ - write_lock_irqsave(&ni->size_lock, flags); - if (new_size < ni->initialized_size) { - ni->initialized_size = new_size; - a->data.non_resident.initialized_size = - cpu_to_sle64(new_size); - } - a->data.non_resident.data_size = cpu_to_sle64(new_size); - write_unlock_irqrestore(&ni->size_lock, flags); - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - /* If the allocated size is not changing, we are done. */ - if (!alloc_change) - goto unm_done; - /* - * If the size is shrinking it makes no sense for the - * allocation to be growing. - */ - BUG_ON(alloc_change > 0); - } else /* if (size_change >= 0) */ { - /* - * The file size is growing or staying the same but the - * allocation can be shrinking, growing or staying the same. - */ - if (alloc_change > 0) { - /* - * We need to extend the allocation and possibly update - * the data size. If we are updating the data size, - * since we are not touching the initialized_size we do - * not need to worry about the actual data on disk. - * And as far as the page cache is concerned, there - * will be no pages beyond the old data size and any - * partial region in the last page between the old and - * new data size (or the end of the page if the new - * data size is outside the page) does not need to be - * modified as explained above for the resident - * attribute truncate case. To do this, we simply drop - * the locks we hold and leave all the work to our - * friendly helper ntfs_attr_extend_allocation(). - */ - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(base_ni); - up_write(&ni->runlist.lock); - err = ntfs_attr_extend_allocation(ni, new_size, - size_change > 0 ? new_size : -1, -1); - /* - * ntfs_attr_extend_allocation() will have done error - * output already. - */ - goto done; - } - if (!alloc_change) - goto alloc_done; - } - /* alloc_change < 0 */ - /* Free the clusters. */ - nr_freed = ntfs_cluster_free(ni, new_alloc_size >> - vol->cluster_size_bits, -1, ctx); - m = ctx->mrec; - a = ctx->attr; - if (unlikely(nr_freed < 0)) { - ntfs_error(vol->sb, "Failed to release cluster(s) (error code " - "%lli). Unmount and run chkdsk to recover " - "the lost cluster(s).", (long long)nr_freed); - NVolSetErrors(vol); - nr_freed = 0; - } - /* Truncate the runlist. 
*/ - err = ntfs_rl_truncate_nolock(vol, &ni->runlist, - new_alloc_size >> vol->cluster_size_bits); - /* - * If the runlist truncation failed and/or the search context is no - * longer valid, we cannot resize the attribute record or build the - * mapping pairs array thus we mark the inode bad so that no access to - * the freed clusters can happen. - */ - if (unlikely(err || IS_ERR(m))) { - ntfs_error(vol->sb, "Failed to %s (error code %li).%s", - IS_ERR(m) ? - "restore attribute search context" : - "truncate attribute runlist", - IS_ERR(m) ? PTR_ERR(m) : err, es); - err = -EIO; - goto bad_out; - } - /* Get the size for the shrunk mapping pairs array for the runlist. */ - mp_size = ntfs_get_size_for_mapping_pairs(vol, ni->runlist.rl, 0, -1); - if (unlikely(mp_size <= 0)) { - ntfs_error(vol->sb, "Cannot shrink allocation of inode 0x%lx, " - "attribute type 0x%x, because determining the " - "size for the mapping pairs failed with error " - "code %i.%s", vi->i_ino, - (unsigned)le32_to_cpu(ni->type), mp_size, es); - err = -EIO; - goto bad_out; - } - /* - * Shrink the attribute record for the new mapping pairs array. Note, - * this cannot fail since we are making the attribute smaller thus by - * definition there is enough space to do so. - */ - err = ntfs_attr_record_resize(m, a, mp_size + - le16_to_cpu(a->data.non_resident.mapping_pairs_offset)); - BUG_ON(err); - /* - * Generate the mapping pairs array directly into the attribute record. - */ - err = ntfs_mapping_pairs_build(vol, (u8*)a + - le16_to_cpu(a->data.non_resident.mapping_pairs_offset), - mp_size, ni->runlist.rl, 0, -1, NULL); - if (unlikely(err)) { - ntfs_error(vol->sb, "Cannot shrink allocation of inode 0x%lx, " - "attribute type 0x%x, because building the " - "mapping pairs failed with error code %i.%s", - vi->i_ino, (unsigned)le32_to_cpu(ni->type), - err, es); - err = -EIO; - goto bad_out; - } - /* Update the allocated/compressed size as well as the highest vcn. */ - a->data.non_resident.highest_vcn = cpu_to_sle64((new_alloc_size >> - vol->cluster_size_bits) - 1); - write_lock_irqsave(&ni->size_lock, flags); - ni->allocated_size = new_alloc_size; - a->data.non_resident.allocated_size = cpu_to_sle64(new_alloc_size); - if (NInoSparse(ni) || NInoCompressed(ni)) { - if (nr_freed) { - ni->itype.compressed.size -= nr_freed << - vol->cluster_size_bits; - BUG_ON(ni->itype.compressed.size < 0); - a->data.non_resident.compressed_size = cpu_to_sle64( - ni->itype.compressed.size); - vi->i_blocks = ni->itype.compressed.size >> 9; - } - } else - vi->i_blocks = new_alloc_size >> 9; - write_unlock_irqrestore(&ni->size_lock, flags); - /* - * We have shrunk the allocation. If this is a shrinking truncate we - * have already dealt with the initialized_size and the data_size above - * and we are done. If the truncate is only changing the allocation - * and not the data_size, we are also done. If this is an extending - * truncate, need to extend the data_size now which is ensured by the - * fact that @size_change is positive. - */ -alloc_done: - /* - * If the size is growing, need to update it now. If it is shrinking, - * we have already updated it above (before the allocation change). - */ - if (size_change > 0) - a->data.non_resident.data_size = cpu_to_sle64(new_size); - /* Ensure the modified mft record is written out. 
*/ - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); -unm_done: - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(base_ni); - up_write(&ni->runlist.lock); -done: - /* Update the mtime and ctime on the base inode. */ - /* normally ->truncate shouldn't update ctime or mtime, - * but ntfs did before so it got a copy & paste version - * of file_update_time. one day someone should fix this - * for real. - */ - if (!IS_NOCMTIME(VFS_I(base_ni)) && !IS_RDONLY(VFS_I(base_ni))) { - struct timespec64 now = current_time(VFS_I(base_ni)); - struct timespec64 ctime = inode_get_ctime(VFS_I(base_ni)); - struct timespec64 mtime = inode_get_mtime(VFS_I(base_ni)); - int sync_it = 0; - - if (!timespec64_equal(&mtime, &now) || - !timespec64_equal(&ctime, &now)) - sync_it = 1; - inode_set_ctime_to_ts(VFS_I(base_ni), now); - inode_set_mtime_to_ts(VFS_I(base_ni), now); - - if (sync_it) - mark_inode_dirty_sync(VFS_I(base_ni)); - } - - if (likely(!err)) { - NInoClearTruncateFailed(ni); - ntfs_debug("Done."); - } - return err; -old_bad_out: - old_size = -1; -bad_out: - if (err != -ENOMEM && err != -EOPNOTSUPP) - NVolSetErrors(vol); - if (err != -EOPNOTSUPP) - NInoSetTruncateFailed(ni); - else if (old_size >= 0) - i_size_write(vi, old_size); -err_out: - if (ctx) - ntfs_attr_put_search_ctx(ctx); - if (m) - unmap_mft_record(base_ni); - up_write(&ni->runlist.lock); -out: - ntfs_debug("Failed. Returning error code %i.", err); - return err; -conv_err_out: - if (err != -ENOMEM && err != -EOPNOTSUPP) - NVolSetErrors(vol); - if (err != -EOPNOTSUPP) - NInoSetTruncateFailed(ni); - else - i_size_write(vi, old_size); - goto out; -} - -/** - * ntfs_truncate_vfs - wrapper for ntfs_truncate() that has no return value - * @vi: inode for which the i_size was changed - * - * Wrapper for ntfs_truncate() that has no return value. - * - * See ntfs_truncate() description above for details. - */ -#ifdef NTFS_RW -void ntfs_truncate_vfs(struct inode *vi) { - ntfs_truncate(vi); -} -#endif - -/** - * ntfs_setattr - called from notify_change() when an attribute is being changed - * @idmap: idmap of the mount the inode was found from - * @dentry: dentry whose attributes to change - * @attr: structure describing the attributes and the changes - * - * We have to trap VFS attempts to truncate the file described by @dentry as - * soon as possible, because we do not implement changes in i_size yet. So we - * abort all i_size changes here. - * - * We also abort all changes of user, group, and mode as we do not implement - * the NTFS ACLs yet. - * - * Called with ->i_mutex held. - */ -int ntfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, - struct iattr *attr) -{ - struct inode *vi = d_inode(dentry); - int err; - unsigned int ia_valid = attr->ia_valid; - - err = setattr_prepare(&nop_mnt_idmap, dentry, attr); - if (err) - goto out; - /* We do not support NTFS ACLs yet. */ - if (ia_valid & (ATTR_UID | ATTR_GID | ATTR_MODE)) { - ntfs_warning(vi->i_sb, "Changes in user/group/mode are not " - "supported yet, ignoring."); - err = -EOPNOTSUPP; - goto out; - } - if (ia_valid & ATTR_SIZE) { - if (attr->ia_size != i_size_read(vi)) { - ntfs_inode *ni = NTFS_I(vi); - /* - * FIXME: For now we do not support resizing of - * compressed or encrypted files yet. - */ - if (NInoCompressed(ni) || NInoEncrypted(ni)) { - ntfs_warning(vi->i_sb, "Changes in inode size " - "are not supported yet for " - "%s files, ignoring.", - NInoCompressed(ni) ? 
- "compressed" : "encrypted"); - err = -EOPNOTSUPP; - } else { - truncate_setsize(vi, attr->ia_size); - ntfs_truncate_vfs(vi); - } - if (err || ia_valid == ATTR_SIZE) - goto out; - } else { - /* - * We skipped the truncate but must still update - * timestamps. - */ - ia_valid |= ATTR_MTIME | ATTR_CTIME; - } - } - if (ia_valid & ATTR_ATIME) - inode_set_atime_to_ts(vi, attr->ia_atime); - if (ia_valid & ATTR_MTIME) - inode_set_mtime_to_ts(vi, attr->ia_mtime); - if (ia_valid & ATTR_CTIME) - inode_set_ctime_to_ts(vi, attr->ia_ctime); - mark_inode_dirty(vi); -out: - return err; -} - -/** - * __ntfs_write_inode - write out a dirty inode - * @vi: inode to write out - * @sync: if true, write out synchronously - * - * Write out a dirty inode to disk including any extent inodes if present. - * - * If @sync is true, commit the inode to disk and wait for io completion. This - * is done using write_mft_record(). - * - * If @sync is false, just schedule the write to happen but do not wait for i/o - * completion. In 2.6 kernels, scheduling usually happens just by virtue of - * marking the page (and in this case mft record) dirty but we do not implement - * this yet as write_mft_record() largely ignores the @sync parameter and - * always performs synchronous writes. - * - * Return 0 on success and -errno on error. - */ -int __ntfs_write_inode(struct inode *vi, int sync) -{ - sle64 nt; - ntfs_inode *ni = NTFS_I(vi); - ntfs_attr_search_ctx *ctx; - MFT_RECORD *m; - STANDARD_INFORMATION *si; - int err = 0; - bool modified = false; - - ntfs_debug("Entering for %sinode 0x%lx.", NInoAttr(ni) ? "attr " : "", - vi->i_ino); - /* - * Dirty attribute inodes are written via their real inodes so just - * clean them here. Access time updates are taken care off when the - * real inode is written. - */ - if (NInoAttr(ni)) { - NInoClearDirty(ni); - ntfs_debug("Done."); - return 0; - } - /* Map, pin, and lock the mft record belonging to the inode. */ - m = map_mft_record(ni); - if (IS_ERR(m)) { - err = PTR_ERR(m); - goto err_out; - } - /* Update the access times in the standard information attribute. */ - ctx = ntfs_attr_get_search_ctx(ni, m); - if (unlikely(!ctx)) { - err = -ENOMEM; - goto unm_err_out; - } - err = ntfs_attr_lookup(AT_STANDARD_INFORMATION, NULL, 0, - CASE_SENSITIVE, 0, NULL, 0, ctx); - if (unlikely(err)) { - ntfs_attr_put_search_ctx(ctx); - goto unm_err_out; - } - si = (STANDARD_INFORMATION*)((u8*)ctx->attr + - le16_to_cpu(ctx->attr->data.resident.value_offset)); - /* Update the access times if they have changed. 
*/ - nt = utc2ntfs(inode_get_mtime(vi)); - if (si->last_data_change_time != nt) { - ntfs_debug("Updating mtime for inode 0x%lx: old = 0x%llx, " - "new = 0x%llx", vi->i_ino, (long long) - sle64_to_cpu(si->last_data_change_time), - (long long)sle64_to_cpu(nt)); - si->last_data_change_time = nt; - modified = true; - } - nt = utc2ntfs(inode_get_ctime(vi)); - if (si->last_mft_change_time != nt) { - ntfs_debug("Updating ctime for inode 0x%lx: old = 0x%llx, " - "new = 0x%llx", vi->i_ino, (long long) - sle64_to_cpu(si->last_mft_change_time), - (long long)sle64_to_cpu(nt)); - si->last_mft_change_time = nt; - modified = true; - } - nt = utc2ntfs(inode_get_atime(vi)); - if (si->last_access_time != nt) { - ntfs_debug("Updating atime for inode 0x%lx: old = 0x%llx, " - "new = 0x%llx", vi->i_ino, - (long long)sle64_to_cpu(si->last_access_time), - (long long)sle64_to_cpu(nt)); - si->last_access_time = nt; - modified = true; - } - /* - * If we just modified the standard information attribute we need to - * mark the mft record it is in dirty. We do this manually so that - * mark_inode_dirty() is not called which would redirty the inode and - * hence result in an infinite loop of trying to write the inode. - * There is no need to mark the base inode nor the base mft record - * dirty, since we are going to write this mft record below in any case - * and the base mft record may actually not have been modified so it - * might not need to be written out. - * NOTE: It is not a problem when the inode for $MFT itself is being - * written out as mark_ntfs_record_dirty() will only set I_DIRTY_PAGES - * on the $MFT inode and hence __ntfs_write_inode() will not be - * re-invoked because of it which in turn is ok since the dirtied mft - * record will be cleaned and written out to disk below, i.e. before - * this function returns. - */ - if (modified) { - flush_dcache_mft_record_page(ctx->ntfs_ino); - if (!NInoTestSetDirty(ctx->ntfs_ino)) - mark_ntfs_record_dirty(ctx->ntfs_ino->page, - ctx->ntfs_ino->page_ofs); - } - ntfs_attr_put_search_ctx(ctx); - /* Now the access times are updated, write the base mft record. */ - if (NInoDirty(ni)) - err = write_mft_record(ni, m, sync); - /* Write all attached extent mft records. */ - mutex_lock(&ni->extent_lock); - if (ni->nr_extents > 0) { - ntfs_inode **extent_nis = ni->ext.extent_ntfs_inos; - int i; - - ntfs_debug("Writing %i extent inodes.", ni->nr_extents); - for (i = 0; i < ni->nr_extents; i++) { - ntfs_inode *tni = extent_nis[i]; - - if (NInoDirty(tni)) { - MFT_RECORD *tm = map_mft_record(tni); - int ret; - - if (IS_ERR(tm)) { - if (!err || err == -ENOMEM) - err = PTR_ERR(tm); - continue; - } - ret = write_mft_record(tni, tm, sync); - unmap_mft_record(tni); - if (unlikely(ret)) { - if (!err || err == -ENOMEM) - err = ret; - } - } - } - } - mutex_unlock(&ni->extent_lock); - unmap_mft_record(ni); - if (unlikely(err)) - goto err_out; - ntfs_debug("Done."); - return 0; -unm_err_out: - unmap_mft_record(ni); -err_out: - if (err == -ENOMEM) { - ntfs_warning(vi->i_sb, "Not enough memory to write inode. 
" - "Marking the inode dirty again, so the VFS " - "retries later."); - mark_inode_dirty(vi); - } else { - ntfs_error(vi->i_sb, "Failed (error %i): Run chkdsk.", -err); - NVolSetErrors(ni->vol); - } - return err; -} - -#endif /* NTFS_RW */ diff --git a/fs/ntfs/inode.h b/fs/ntfs/inode.h deleted file mode 100644 index 147ef4ddb6914..0000000000000 --- a/fs/ntfs/inode.h +++ /dev/null @@ -1,310 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * inode.h - Defines for inode structures NTFS Linux kernel driver. Part of - * the Linux-NTFS project. - * - * Copyright (c) 2001-2007 Anton Altaparmakov - * Copyright (c) 2002 Richard Russon - */ - -#ifndef _LINUX_NTFS_INODE_H -#define _LINUX_NTFS_INODE_H - -#include - -#include -#include -#include -#include -#include - -#include "layout.h" -#include "volume.h" -#include "types.h" -#include "runlist.h" -#include "debug.h" - -typedef struct _ntfs_inode ntfs_inode; - -/* - * The NTFS in-memory inode structure. It is just used as an extension to the - * fields already provided in the VFS inode. - */ -struct _ntfs_inode { - rwlock_t size_lock; /* Lock serializing access to inode sizes. */ - s64 initialized_size; /* Copy from the attribute record. */ - s64 allocated_size; /* Copy from the attribute record. */ - unsigned long state; /* NTFS specific flags describing this inode. - See ntfs_inode_state_bits below. */ - unsigned long mft_no; /* Number of the mft record / inode. */ - u16 seq_no; /* Sequence number of the mft record. */ - atomic_t count; /* Inode reference count for book keeping. */ - ntfs_volume *vol; /* Pointer to the ntfs volume of this inode. */ - /* - * If NInoAttr() is true, the below fields describe the attribute which - * this fake inode belongs to. The actual inode of this attribute is - * pointed to by base_ntfs_ino and nr_extents is always set to -1 (see - * below). For real inodes, we also set the type (AT_DATA for files and - * AT_INDEX_ALLOCATION for directories), with the name = NULL and - * name_len = 0 for files and name = I30 (global constant) and - * name_len = 4 for directories. - */ - ATTR_TYPE type; /* Attribute type of this fake inode. */ - ntfschar *name; /* Attribute name of this fake inode. */ - u32 name_len; /* Attribute name length of this fake inode. */ - runlist runlist; /* If state has the NI_NonResident bit set, - the runlist of the unnamed data attribute - (if a file) or of the index allocation - attribute (directory) or of the attribute - described by the fake inode (if NInoAttr()). - If runlist.rl is NULL, the runlist has not - been read in yet or has been unmapped. If - NI_NonResident is clear, the attribute is - resident (file and fake inode) or there is - no $I30 index allocation attribute - (small directory). In the latter case - runlist.rl is always NULL.*/ - /* - * The following fields are only valid for real inodes and extent - * inodes. - */ - struct mutex mrec_lock; /* Lock for serializing access to the - mft record belonging to this inode. */ - struct page *page; /* The page containing the mft record of the - inode. This should only be touched by the - (un)map_mft_record*() functions. */ - int page_ofs; /* Offset into the page at which the mft record - begins. This should only be touched by the - (un)map_mft_record*() functions. */ - /* - * Attribute list support (only for use by the attribute lookup - * functions). Setup during read_inode for all inodes with attribute - * lists. 
Only valid if NI_AttrList is set in state, and attr_list_rl is - * further only valid if NI_AttrListNonResident is set. - */ - u32 attr_list_size; /* Length of attribute list value in bytes. */ - u8 *attr_list; /* Attribute list value itself. */ - runlist attr_list_rl; /* Run list for the attribute list value. */ - union { - struct { /* It is a directory, $MFT, or an index inode. */ - u32 block_size; /* Size of an index block. */ - u32 vcn_size; /* Size of a vcn in this - index. */ - COLLATION_RULE collation_rule; /* The collation rule - for the index. */ - u8 block_size_bits; /* Log2 of the above. */ - u8 vcn_size_bits; /* Log2 of the above. */ - } index; - struct { /* It is a compressed/sparse file/attribute inode. */ - s64 size; /* Copy of compressed_size from - $DATA. */ - u32 block_size; /* Size of a compression block - (cb). */ - u8 block_size_bits; /* Log2 of the size of a cb. */ - u8 block_clusters; /* Number of clusters per cb. */ - } compressed; - } itype; - struct mutex extent_lock; /* Lock for accessing/modifying the - below . */ - s32 nr_extents; /* For a base mft record, the number of attached extent - inodes (0 if none), for extent records and for fake - inodes describing an attribute this is -1. */ - union { /* This union is only used if nr_extents != 0. */ - ntfs_inode **extent_ntfs_inos; /* For nr_extents > 0, array of - the ntfs inodes of the extent - mft records belonging to - this base inode which have - been loaded. */ - ntfs_inode *base_ntfs_ino; /* For nr_extents == -1, the - ntfs inode of the base mft - record. For fake inodes, the - real (base) inode to which - the attribute belongs. */ - } ext; -}; - -/* - * Defined bits for the state field in the ntfs_inode structure. - * (f) = files only, (d) = directories only, (a) = attributes/fake inodes only - */ -typedef enum { - NI_Dirty, /* 1: Mft record needs to be written to disk. */ - NI_AttrList, /* 1: Mft record contains an attribute list. */ - NI_AttrListNonResident, /* 1: Attribute list is non-resident. Implies - NI_AttrList is set. */ - - NI_Attr, /* 1: Fake inode for attribute i/o. - 0: Real inode or extent inode. */ - - NI_MstProtected, /* 1: Attribute is protected by MST fixups. - 0: Attribute is not protected by fixups. */ - NI_NonResident, /* 1: Unnamed data attr is non-resident (f). - 1: Attribute is non-resident (a). */ - NI_IndexAllocPresent = NI_NonResident, /* 1: $I30 index alloc attr is - present (d). */ - NI_Compressed, /* 1: Unnamed data attr is compressed (f). - 1: Create compressed files by default (d). - 1: Attribute is compressed (a). */ - NI_Encrypted, /* 1: Unnamed data attr is encrypted (f). - 1: Create encrypted files by default (d). - 1: Attribute is encrypted (a). */ - NI_Sparse, /* 1: Unnamed data attr is sparse (f). - 1: Create sparse files by default (d). - 1: Attribute is sparse (a). */ - NI_SparseDisabled, /* 1: May not create sparse regions. */ - NI_TruncateFailed, /* 1: Last ntfs_truncate() call failed. */ -} ntfs_inode_state_bits; - -/* - * NOTE: We should be adding dirty mft records to a list somewhere and they - * should be independent of the (ntfs/vfs) inode structure so that an inode can - * be removed but the record can be left dirty for syncing later. - */ - -/* - * Macro tricks to expand the NInoFoo(), NInoSetFoo(), and NInoClearFoo() - * functions. 
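- *
- * For example, NINO_FNS(Dirty) below expands into the three inline
- * functions NInoDirty(), NInoSetDirty() and NInoClearDirty(), each
- * operating atomically on the NI_Dirty bit of ni->state, e.g.:
- *
- *	if (NInoDirty(ni))		// test_bit(NI_Dirty, ...)
- *		NInoClearDirty(ni);	// clear_bit(NI_Dirty, ...)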
- */ -#define NINO_FNS(flag) \ -static inline int NIno##flag(ntfs_inode *ni) \ -{ \ - return test_bit(NI_##flag, &(ni)->state); \ -} \ -static inline void NInoSet##flag(ntfs_inode *ni) \ -{ \ - set_bit(NI_##flag, &(ni)->state); \ -} \ -static inline void NInoClear##flag(ntfs_inode *ni) \ -{ \ - clear_bit(NI_##flag, &(ni)->state); \ -} - -/* - * As above for NInoTestSetFoo() and NInoTestClearFoo(). - */ -#define TAS_NINO_FNS(flag) \ -static inline int NInoTestSet##flag(ntfs_inode *ni) \ -{ \ - return test_and_set_bit(NI_##flag, &(ni)->state); \ -} \ -static inline int NInoTestClear##flag(ntfs_inode *ni) \ -{ \ - return test_and_clear_bit(NI_##flag, &(ni)->state); \ -} - -/* Emit the ntfs inode bitops functions. */ -NINO_FNS(Dirty) -TAS_NINO_FNS(Dirty) -NINO_FNS(AttrList) -NINO_FNS(AttrListNonResident) -NINO_FNS(Attr) -NINO_FNS(MstProtected) -NINO_FNS(NonResident) -NINO_FNS(IndexAllocPresent) -NINO_FNS(Compressed) -NINO_FNS(Encrypted) -NINO_FNS(Sparse) -NINO_FNS(SparseDisabled) -NINO_FNS(TruncateFailed) - -/* - * The full structure containing a ntfs_inode and a vfs struct inode. Used for - * all real and fake inodes but not for extent inodes which lack the vfs struct - * inode. - */ -typedef struct { - ntfs_inode ntfs_inode; - struct inode vfs_inode; /* The vfs inode structure. */ -} big_ntfs_inode; - -/** - * NTFS_I - return the ntfs inode given a vfs inode - * @inode: VFS inode - * - * NTFS_I() returns the ntfs inode associated with the VFS @inode. - */ -static inline ntfs_inode *NTFS_I(struct inode *inode) -{ - return (ntfs_inode *)container_of(inode, big_ntfs_inode, vfs_inode); -} - -static inline struct inode *VFS_I(ntfs_inode *ni) -{ - return &((big_ntfs_inode *)ni)->vfs_inode; -} - -/** - * ntfs_attr - ntfs in memory attribute structure - * @mft_no: mft record number of the base mft record of this attribute - * @name: Unicode name of the attribute (NULL if unnamed) - * @name_len: length of @name in Unicode characters (0 if unnamed) - * @type: attribute type (see layout.h) - * - * This structure exists only to provide a small structure for the - * ntfs_{attr_}iget()/ntfs_test_inode()/ntfs_init_locked_inode() mechanism. - * - * NOTE: Elements are ordered by size to make the structure as compact as - * possible on all architectures. 
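- *
- * For example (illustrative only), a lookup of the inode backing the
- * unnamed $DATA attribute of mft record @mft_no would describe it as:
- *
- *	ntfs_attr na = { .mft_no = mft_no, .name = NULL, .name_len = 0,
- *			.type = AT_DATA };
- *
- * and hand &na to ntfs_test_inode() via its opaque @data argument.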
- */ -typedef struct { - unsigned long mft_no; - ntfschar *name; - u32 name_len; - ATTR_TYPE type; -} ntfs_attr; - -extern int ntfs_test_inode(struct inode *vi, void *data); - -extern struct inode *ntfs_iget(struct super_block *sb, unsigned long mft_no); -extern struct inode *ntfs_attr_iget(struct inode *base_vi, ATTR_TYPE type, - ntfschar *name, u32 name_len); -extern struct inode *ntfs_index_iget(struct inode *base_vi, ntfschar *name, - u32 name_len); - -extern struct inode *ntfs_alloc_big_inode(struct super_block *sb); -extern void ntfs_free_big_inode(struct inode *inode); -extern void ntfs_evict_big_inode(struct inode *vi); - -extern void __ntfs_init_inode(struct super_block *sb, ntfs_inode *ni); - -static inline void ntfs_init_big_inode(struct inode *vi) -{ - ntfs_inode *ni = NTFS_I(vi); - - ntfs_debug("Entering."); - __ntfs_init_inode(vi->i_sb, ni); - ni->mft_no = vi->i_ino; -} - -extern ntfs_inode *ntfs_new_extent_inode(struct super_block *sb, - unsigned long mft_no); -extern void ntfs_clear_extent_inode(ntfs_inode *ni); - -extern int ntfs_read_inode_mount(struct inode *vi); - -extern int ntfs_show_options(struct seq_file *sf, struct dentry *root); - -#ifdef NTFS_RW - -extern int ntfs_truncate(struct inode *vi); -extern void ntfs_truncate_vfs(struct inode *vi); - -extern int ntfs_setattr(struct mnt_idmap *idmap, - struct dentry *dentry, struct iattr *attr); - -extern int __ntfs_write_inode(struct inode *vi, int sync); - -static inline void ntfs_commit_inode(struct inode *vi) -{ - if (!is_bad_inode(vi)) - __ntfs_write_inode(vi, 1); - return; -} - -#else - -static inline void ntfs_truncate_vfs(struct inode *vi) {} - -#endif /* NTFS_RW */ - -#endif /* _LINUX_NTFS_INODE_H */ diff --git a/fs/ntfs/layout.h b/fs/ntfs/layout.h deleted file mode 100644 index 5d4bf7a3259f7..0000000000000 --- a/fs/ntfs/layout.h +++ /dev/null @@ -1,2421 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * layout.h - All NTFS associated on-disk structures. Part of the Linux-NTFS - * project. - * - * Copyright (c) 2001-2005 Anton Altaparmakov - * Copyright (c) 2002 Richard Russon - */ - -#ifndef _LINUX_NTFS_LAYOUT_H -#define _LINUX_NTFS_LAYOUT_H - -#include -#include -#include -#include - -#include "types.h" - -/* The NTFS oem_id "NTFS " */ -#define magicNTFS cpu_to_le64(0x202020205346544eULL) - -/* - * Location of bootsector on partition: - * The standard NTFS_BOOT_SECTOR is on sector 0 of the partition. - * On NT4 and above there is one backup copy of the boot sector to - * be found on the last sector of the partition (not normally accessible - * from within Windows as the bootsector contained number of sectors - * value is one less than the actual value!). - * On versions of NT 3.51 and earlier, the backup copy was located at - * number of sectors/2 (integer divide), i.e. in the middle of the volume. - */ - -/* - * BIOS parameter block (bpb) structure. - */ -typedef struct { - le16 bytes_per_sector; /* Size of a sector in bytes. */ - u8 sectors_per_cluster; /* Size of a cluster in sectors. */ - le16 reserved_sectors; /* zero */ - u8 fats; /* zero */ - le16 root_entries; /* zero */ - le16 sectors; /* zero */ - u8 media_type; /* 0xf8 = hard disk */ - le16 sectors_per_fat; /* zero */ - le16 sectors_per_track; /* irrelevant */ - le16 heads; /* irrelevant */ - le32 hidden_sectors; /* zero */ - le32 large_sectors; /* zero */ -} __attribute__ ((__packed__)) BIOS_PARAMETER_BLOCK; - -/* - * NTFS boot sector structure. 
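- *
- * Note that clusters_per_mft_record and clusters_per_index_record below
- * are signed: a positive value is a size in clusters, while a negative
- * value means the record size is 2 to the power of the absolute value in
- * bytes, e.g. -10 denotes mft records of 2^10 = 1024 bytes.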
- */ -typedef struct { - u8 jump[3]; /* Irrelevant (jump to boot up code).*/ - le64 oem_id; /* Magic "NTFS ". */ - BIOS_PARAMETER_BLOCK bpb; /* See BIOS_PARAMETER_BLOCK. */ - u8 unused[4]; /* zero, NTFS diskedit.exe states that - this is actually: - __u8 physical_drive; // 0x80 - __u8 current_head; // zero - __u8 extended_boot_signature; - // 0x80 - __u8 unused; // zero - */ -/*0x28*/sle64 number_of_sectors; /* Number of sectors in volume. Gives - maximum volume size of 2^63 sectors. - Assuming standard sector size of 512 - bytes, the maximum byte size is - approx. 4.7x10^21 bytes. (-; */ - sle64 mft_lcn; /* Cluster location of mft data. */ - sle64 mftmirr_lcn; /* Cluster location of copy of mft. */ - s8 clusters_per_mft_record; /* Mft record size in clusters. */ - u8 reserved0[3]; /* zero */ - s8 clusters_per_index_record; /* Index block size in clusters. */ - u8 reserved1[3]; /* zero */ - le64 volume_serial_number; /* Irrelevant (serial number). */ - le32 checksum; /* Boot sector checksum. */ -/*0x54*/u8 bootstrap[426]; /* Irrelevant (boot up code). */ - le16 end_of_sector_marker; /* End of bootsector magic. Always is - 0xaa55 in little endian. */ -/* sizeof() = 512 (0x200) bytes */ -} __attribute__ ((__packed__)) NTFS_BOOT_SECTOR; - -/* - * Magic identifiers present at the beginning of all ntfs record containing - * records (like mft records for example). - */ -enum { - /* Found in $MFT/$DATA. */ - magic_FILE = cpu_to_le32(0x454c4946), /* Mft entry. */ - magic_INDX = cpu_to_le32(0x58444e49), /* Index buffer. */ - magic_HOLE = cpu_to_le32(0x454c4f48), /* ? (NTFS 3.0+?) */ - - /* Found in $LogFile/$DATA. */ - magic_RSTR = cpu_to_le32(0x52545352), /* Restart page. */ - magic_RCRD = cpu_to_le32(0x44524352), /* Log record page. */ - - /* Found in $LogFile/$DATA. (May be found in $MFT/$DATA, also?) */ - magic_CHKD = cpu_to_le32(0x444b4843), /* Modified by chkdsk. */ - - /* Found in all ntfs record containing records. */ - magic_BAAD = cpu_to_le32(0x44414142), /* Failed multi sector - transfer was detected. */ - /* - * Found in $LogFile/$DATA when a page is full of 0xff bytes and is - * thus not initialized. Page must be initialized before using it. - */ - magic_empty = cpu_to_le32(0xffffffff) /* Record is empty. */ -}; - -typedef le32 NTFS_RECORD_TYPE; - -/* - * Generic magic comparison macros. Finally found a use for the ## preprocessor - * operator! (-8 - */ - -static inline bool __ntfs_is_magic(le32 x, NTFS_RECORD_TYPE r) -{ - return (x == r); -} -#define ntfs_is_magic(x, m) __ntfs_is_magic(x, magic_##m) - -static inline bool __ntfs_is_magicp(le32 *p, NTFS_RECORD_TYPE r) -{ - return (*p == r); -} -#define ntfs_is_magicp(p, m) __ntfs_is_magicp(p, magic_##m) - -/* - * Specialised magic comparison macros for the NTFS_RECORD_TYPEs defined above. 
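A hedged sketch of how the generic comparison helpers are used when vetting a freshly read record (hypothetical caller, for illustration only; buf is assumed suitably aligned):

static bool record_is_usable_mft_entry(const u8 *buf)
{
        le32 magic = *(const le32 *)buf;

        if (ntfs_is_magic(magic, BAAD))
                return false;   /* failed multi sector transfer */
        return ntfs_is_magic(magic, FILE);
}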
- */ -#define ntfs_is_file_record(x) ( ntfs_is_magic (x, FILE) ) -#define ntfs_is_file_recordp(p) ( ntfs_is_magicp(p, FILE) ) -#define ntfs_is_mft_record(x) ( ntfs_is_file_record (x) ) -#define ntfs_is_mft_recordp(p) ( ntfs_is_file_recordp(p) ) -#define ntfs_is_indx_record(x) ( ntfs_is_magic (x, INDX) ) -#define ntfs_is_indx_recordp(p) ( ntfs_is_magicp(p, INDX) ) -#define ntfs_is_hole_record(x) ( ntfs_is_magic (x, HOLE) ) -#define ntfs_is_hole_recordp(p) ( ntfs_is_magicp(p, HOLE) ) - -#define ntfs_is_rstr_record(x) ( ntfs_is_magic (x, RSTR) ) -#define ntfs_is_rstr_recordp(p) ( ntfs_is_magicp(p, RSTR) ) -#define ntfs_is_rcrd_record(x) ( ntfs_is_magic (x, RCRD) ) -#define ntfs_is_rcrd_recordp(p) ( ntfs_is_magicp(p, RCRD) ) - -#define ntfs_is_chkd_record(x) ( ntfs_is_magic (x, CHKD) ) -#define ntfs_is_chkd_recordp(p) ( ntfs_is_magicp(p, CHKD) ) - -#define ntfs_is_baad_record(x) ( ntfs_is_magic (x, BAAD) ) -#define ntfs_is_baad_recordp(p) ( ntfs_is_magicp(p, BAAD) ) - -#define ntfs_is_empty_record(x) ( ntfs_is_magic (x, empty) ) -#define ntfs_is_empty_recordp(p) ( ntfs_is_magicp(p, empty) ) - -/* - * The Update Sequence Array (usa) is an array of the le16 values which belong - * to the end of each sector protected by the update sequence record in which - * this array is contained. Note that the first entry is the Update Sequence - * Number (usn), a cyclic counter of how many times the protected record has - * been written to disk. The values 0 and -1 (ie. 0xffff) are not used. All - * last le16's of each sector have to be equal to the usn (during reading) or - * are set to it (during writing). If they are not, an incomplete multi sector - * transfer has occurred when the data was written. - * The maximum size for the update sequence array is fixed to: - * maximum size = usa_ofs + (usa_count * 2) = 510 bytes - * The 510 bytes comes from the fact that the last le16 in the array has to - * (obviously) finish before the last le16 of the first 512-byte sector. - * This formula can be used as a consistency check in that usa_ofs + - * (usa_count * 2) has to be less than or equal to 510. - */ -typedef struct { - NTFS_RECORD_TYPE magic; /* A four-byte magic identifying the record - type and/or status. */ - le16 usa_ofs; /* Offset to the Update Sequence Array (usa) - from the start of the ntfs record. */ - le16 usa_count; /* Number of le16 sized entries in the usa - including the Update Sequence Number (usn), - thus the number of fixups is the usa_count - minus 1. */ -} __attribute__ ((__packed__)) NTFS_RECORD; - -/* - * System files mft record numbers. All these files are always marked as used - * in the bitmap attribute of the mft; presumably in order to avoid accidental - * allocation for random other mft records. Also, the sequence number for each - * of the system files is always equal to their mft record number and it is - * never modified. - */ -typedef enum { - FILE_MFT = 0, /* Master file table (mft). Data attribute - contains the entries and bitmap attribute - records which ones are in use (bit==1). */ - FILE_MFTMirr = 1, /* Mft mirror: copy of first four mft records - in data attribute. If cluster size > 4kiB, - copy of first N mft records, with - N = cluster_size / mft_record_size. */ - FILE_LogFile = 2, /* Journalling log in data attribute. */ - FILE_Volume = 3, /* Volume name attribute and volume information - attribute (flags and ntfs version). Windows - refers to this file as volume DASD (Direct - Access Storage Device). 
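The multi sector transfer protection described above has to be undone after every read. A simplified sketch, assuming 512-byte sectors (the driver's post_read_mst_fixup() does this with additional validation, and a matching pre-write fixup re-applies the protection):

static int check_and_undo_fixups(NTFS_RECORD *rec, u32 size)
{
        u16 usa_ofs = le16_to_cpu(rec->usa_ofs);
        u16 usa_count = le16_to_cpu(rec->usa_count);
        le16 *usa = (le16 *)((u8 *)rec + usa_ofs);
        le16 usn = usa[0];
        u16 i;

        /* The consistency checks described above. */
        if (usa_ofs + usa_count * 2 > 510 ||
            (u32)usa_count * 512 > size + 512)
                return -1;
        for (i = 1; i < usa_count; i++) {
                /* Last le16 of the (i - 1)th protected sector. */
                le16 *p = (le16 *)((u8 *)rec + i * 512) - 1;

                if (*p != usn)
                        return -1;      /* incomplete multi sector transfer */
                *p = usa[i];            /* restore the overwritten value */
        }
        return 0;
}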
*/ - FILE_AttrDef = 4, /* Array of attribute definitions in data - attribute. */ - FILE_root = 5, /* Root directory. */ - FILE_Bitmap = 6, /* Allocation bitmap of all clusters (lcns) in - data attribute. */ - FILE_Boot = 7, /* Boot sector (always at cluster 0) in data - attribute. */ - FILE_BadClus = 8, /* Contains all bad clusters in the non-resident - data attribute. */ - FILE_Secure = 9, /* Shared security descriptors in data attribute - and two indexes into the descriptors. - Appeared in Windows 2000. Before that, this - file was named $Quota but was unused. */ - FILE_UpCase = 10, /* Uppercase equivalents of all 65536 Unicode - characters in data attribute. */ - FILE_Extend = 11, /* Directory containing other system files (eg. - $ObjId, $Quota, $Reparse and $UsnJrnl). This - is new to NTFS3.0. */ - FILE_reserved12 = 12, /* Reserved for future use (records 12-15). */ - FILE_reserved13 = 13, - FILE_reserved14 = 14, - FILE_reserved15 = 15, - FILE_first_user = 16, /* First user file, used as test limit for - whether to allow opening a file or not. */ -} NTFS_SYSTEM_FILES; - -/* - * These are the so far known MFT_RECORD_* flags (16-bit) which contain - * information about the mft record in which they are present. - */ -enum { - MFT_RECORD_IN_USE = cpu_to_le16(0x0001), - MFT_RECORD_IS_DIRECTORY = cpu_to_le16(0x0002), -} __attribute__ ((__packed__)); - -typedef le16 MFT_RECORD_FLAGS; - -/* - * mft references (aka file references or file record segment references) are - * used whenever a structure needs to refer to a record in the mft. - * - * A reference consists of a 48-bit index into the mft and a 16-bit sequence - * number used to detect stale references. - * - * For error reporting purposes we treat the 48-bit index as a signed quantity. - * - * The sequence number is a circular counter (skipping 0) describing how many - * times the referenced mft record has been (re)used. This has to match the - * sequence number of the mft record being referenced, otherwise the reference - * is considered stale and removed (FIXME: only ntfsck or the driver itself?). - * - * If the sequence number is zero it is assumed that no sequence number - * consistency checking should be performed. - * - * FIXME: Since inodes are 32-bit as of now, the driver needs to always check - * for high_part being 0 and if not either BUG(), cause a panic() or handle - * the situation in some other way. This shouldn't be a problem as a volume has - * to become HUGE in order to need more than 32-bits worth of mft records. - * Assuming the standard mft record size of 1kb only the records (never mind - * the non-resident attributes, etc.) would require 4Tb of space on their own - * for the first 32 bits worth of records. This is only if some strange person - * doesn't decide to foul play and make the mft sparse which would be a really - * horrible thing to do as it would trash our current driver implementation. )-: - * Do I hear screams "we want 64-bit inodes!" ?!? (-; - * - * FIXME: The mft zone is defined as the first 12% of the volume. This space is - * reserved so that the mft can grow contiguously and hence doesn't become - * fragmented. Volume free space includes the empty part of the mft zone and - * when the volume's free 88% are used up, the mft zone is shrunk by a factor - * of 2, thus making more space available for more files/data. This process is - * repeated every time there is no more free space except for the mft zone until - * there really is no more free space. 
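Because the MFT_RECORD_FLAGS constants above are already little endian, they can be tested directly against on-disk fields without byte swapping; for illustration (hypothetical helper):

static inline bool mft_flags_in_use(MFT_RECORD_FLAGS flags)
{
        /* le16 & le16: neither side needs converting. */
        return (flags & MFT_RECORD_IN_USE) != 0;
}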
- */ - -/* - * Typedef the MFT_REF as a 64-bit value for easier handling. - * Also define two unpacking macros to get to the reference (MREF) and - * sequence number (MSEQNO) respectively. - * The _LE versions are to be applied on little endian MFT_REFs. - * Note: The _LE versions will return a CPU endian formatted value! - */ -#define MFT_REF_MASK_CPU 0x0000ffffffffffffULL -#define MFT_REF_MASK_LE cpu_to_le64(MFT_REF_MASK_CPU) - -typedef u64 MFT_REF; -typedef le64 leMFT_REF; - -#define MK_MREF(m, s) ((MFT_REF)(((MFT_REF)(s) << 48) | \ - ((MFT_REF)(m) & MFT_REF_MASK_CPU))) -#define MK_LE_MREF(m, s) cpu_to_le64(MK_MREF(m, s)) - -#define MREF(x) ((unsigned long)((x) & MFT_REF_MASK_CPU)) -#define MSEQNO(x) ((u16)(((x) >> 48) & 0xffff)) -#define MREF_LE(x) ((unsigned long)(le64_to_cpu(x) & MFT_REF_MASK_CPU)) -#define MSEQNO_LE(x) ((u16)((le64_to_cpu(x) >> 48) & 0xffff)) - -#define IS_ERR_MREF(x) (((x) & 0x0000800000000000ULL) ? true : false) -#define ERR_MREF(x) ((u64)((s64)(x))) -#define MREF_ERR(x) ((int)((s64)(x))) - -/* - * The mft record header present at the beginning of every record in the mft. - * This is followed by a sequence of variable length attribute records which - * is terminated by an attribute of type AT_END which is a truncated attribute - * in that it only consists of the attribute type code AT_END and none of the - * other members of the attribute structure are present. - */ -typedef struct { -/*Ofs*/ -/* 0 NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. */ - NTFS_RECORD_TYPE magic; /* Usually the magic is "FILE". */ - le16 usa_ofs; /* See NTFS_RECORD definition above. */ - le16 usa_count; /* See NTFS_RECORD definition above. */ - -/* 8*/ le64 lsn; /* $LogFile sequence number for this record. - Changed every time the record is modified. */ -/* 16*/ le16 sequence_number; /* Number of times this mft record has been - reused. (See description for MFT_REF - above.) NOTE: The increment (skipping zero) - is done when the file is deleted. NOTE: If - this is zero it is left zero. */ -/* 18*/ le16 link_count; /* Number of hard links, i.e. the number of - directory entries referencing this record. - NOTE: Only used in mft base records. - NOTE: When deleting a directory entry we - check the link_count and if it is 1 we - delete the file. Otherwise we delete the - FILE_NAME_ATTR being referenced by the - directory entry from the mft record and - decrement the link_count. - FIXME: Careful with Win32 + DOS names! */ -/* 20*/ le16 attrs_offset; /* Byte offset to the first attribute in this - mft record from the start of the mft record. - NOTE: Must be aligned to 8-byte boundary. */ -/* 22*/ MFT_RECORD_FLAGS flags; /* Bit array of MFT_RECORD_FLAGS. When a file - is deleted, the MFT_RECORD_IN_USE flag is - set to zero. */ -/* 24*/ le32 bytes_in_use; /* Number of bytes used in this mft record. - NOTE: Must be aligned to 8-byte boundary. */ -/* 28*/ le32 bytes_allocated; /* Number of bytes allocated for this mft - record. This should be equal to the mft - record size. */ -/* 32*/ leMFT_REF base_mft_record;/* This is zero for base mft records. 
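A worked example of the packing macros, with values chosen purely for illustration:

/* Reference mft record 0x10 (FILE_first_user), sequence number 3: */
MFT_REF ref = MK_MREF(0x10, 3);           /* == 0x0003000000000010 */
unsigned long mft_no = MREF(ref);         /* == 0x10 */
u16 seqno = MSEQNO(ref);                  /* == 3 */
leMFT_REF disk_ref = MK_LE_MREF(0x10, 3); /* on-disk, little endian form */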
- When it is not zero it is a mft reference - pointing to the base mft record to which - this record belongs (this is then used to - locate the attribute list attribute present - in the base record which describes this - extension record and hence might need - modification when the extension record - itself is modified, also locating the - attribute list also means finding the other - potential extents, belonging to the non-base - mft record). */ -/* 40*/ le16 next_attr_instance;/* The instance number that will be assigned to - the next attribute added to this mft record. - NOTE: Incremented each time after it is used. - NOTE: Every time the mft record is reused - this number is set to zero. NOTE: The first - instance number is always 0. */ -/* The below fields are specific to NTFS 3.1+ (Windows XP and above): */ -/* 42*/ le16 reserved; /* Reserved/alignment. */ -/* 44*/ le32 mft_record_number; /* Number of this mft record. */ -/* sizeof() = 48 bytes */ -/* - * When (re)using the mft record, we place the update sequence array at this - * offset, i.e. before we start with the attributes. This also makes sense, - * otherwise we could run into problems with the update sequence array - * containing in itself the last two bytes of a sector which would mean that - * multi sector transfer protection wouldn't work. As you can't protect data - * by overwriting it since you then can't get it back... - * When reading we obviously use the data from the ntfs record header. - */ -} __attribute__ ((__packed__)) MFT_RECORD; - -/* This is the version without the NTFS 3.1+ specific fields. */ -typedef struct { -/*Ofs*/ -/* 0 NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. */ - NTFS_RECORD_TYPE magic; /* Usually the magic is "FILE". */ - le16 usa_ofs; /* See NTFS_RECORD definition above. */ - le16 usa_count; /* See NTFS_RECORD definition above. */ - -/* 8*/ le64 lsn; /* $LogFile sequence number for this record. - Changed every time the record is modified. */ -/* 16*/ le16 sequence_number; /* Number of times this mft record has been - reused. (See description for MFT_REF - above.) NOTE: The increment (skipping zero) - is done when the file is deleted. NOTE: If - this is zero it is left zero. */ -/* 18*/ le16 link_count; /* Number of hard links, i.e. the number of - directory entries referencing this record. - NOTE: Only used in mft base records. - NOTE: When deleting a directory entry we - check the link_count and if it is 1 we - delete the file. Otherwise we delete the - FILE_NAME_ATTR being referenced by the - directory entry from the mft record and - decrement the link_count. - FIXME: Careful with Win32 + DOS names! */ -/* 20*/ le16 attrs_offset; /* Byte offset to the first attribute in this - mft record from the start of the mft record. - NOTE: Must be aligned to 8-byte boundary. */ -/* 22*/ MFT_RECORD_FLAGS flags; /* Bit array of MFT_RECORD_FLAGS. When a file - is deleted, the MFT_RECORD_IN_USE flag is - set to zero. */ -/* 24*/ le32 bytes_in_use; /* Number of bytes used in this mft record. - NOTE: Must be aligned to 8-byte boundary. */ -/* 28*/ le32 bytes_allocated; /* Number of bytes allocated for this mft - record. This should be equal to the mft - record size. */ -/* 32*/ leMFT_REF base_mft_record;/* This is zero for base mft records. 
- When it is not zero it is a mft reference - pointing to the base mft record to which - this record belongs (this is then used to - locate the attribute list attribute present - in the base record which describes this - extension record and hence might need - modification when the extension record - itself is modified, also locating the - attribute list also means finding the other - potential extents, belonging to the non-base - mft record). */ -/* 40*/ le16 next_attr_instance;/* The instance number that will be assigned to - the next attribute added to this mft record. - NOTE: Incremented each time after it is used. - NOTE: Every time the mft record is reused - this number is set to zero. NOTE: The first - instance number is always 0. */ -/* sizeof() = 42 bytes */ -/* - * When (re)using the mft record, we place the update sequence array at this - * offset, i.e. before we start with the attributes. This also makes sense, - * otherwise we could run into problems with the update sequence array - * containing in itself the last two bytes of a sector which would mean that - * multi sector transfer protection wouldn't work. As you can't protect data - * by overwriting it since you then can't get it back... - * When reading we obviously use the data from the ntfs record header. - */ -} __attribute__ ((__packed__)) MFT_RECORD_OLD; - -/* - * System defined attributes (32-bit). Each attribute type has a corresponding - * attribute name (Unicode string of maximum 64 character length) as described - * by the attribute definitions present in the data attribute of the $AttrDef - * system file. On NTFS 3.0 volumes the names are just as the types are named - * in the below defines exchanging AT_ for the dollar sign ($). If that is not - * a revealing choice of symbol I do not know what is... (-; - */ -enum { - AT_UNUSED = cpu_to_le32( 0), - AT_STANDARD_INFORMATION = cpu_to_le32( 0x10), - AT_ATTRIBUTE_LIST = cpu_to_le32( 0x20), - AT_FILE_NAME = cpu_to_le32( 0x30), - AT_OBJECT_ID = cpu_to_le32( 0x40), - AT_SECURITY_DESCRIPTOR = cpu_to_le32( 0x50), - AT_VOLUME_NAME = cpu_to_le32( 0x60), - AT_VOLUME_INFORMATION = cpu_to_le32( 0x70), - AT_DATA = cpu_to_le32( 0x80), - AT_INDEX_ROOT = cpu_to_le32( 0x90), - AT_INDEX_ALLOCATION = cpu_to_le32( 0xa0), - AT_BITMAP = cpu_to_le32( 0xb0), - AT_REPARSE_POINT = cpu_to_le32( 0xc0), - AT_EA_INFORMATION = cpu_to_le32( 0xd0), - AT_EA = cpu_to_le32( 0xe0), - AT_PROPERTY_SET = cpu_to_le32( 0xf0), - AT_LOGGED_UTILITY_STREAM = cpu_to_le32( 0x100), - AT_FIRST_USER_DEFINED_ATTRIBUTE = cpu_to_le32( 0x1000), - AT_END = cpu_to_le32(0xffffffff) -}; - -typedef le32 ATTR_TYPE; - -/* - * The collation rules for sorting views/indexes/etc (32-bit). - * - * COLLATION_BINARY - Collate by binary compare where the first byte is most - * significant. - * COLLATION_UNICODE_STRING - Collate Unicode strings by comparing their binary - * Unicode values, except that when a character can be uppercased, the - * upper case value collates before the lower case one. - * COLLATION_FILE_NAME - Collate file names as Unicode strings. The collation - * is done very much like COLLATION_UNICODE_STRING. In fact I have no idea - * what the difference is. Perhaps the difference is that file names - * would treat some special characters in an odd way (see - * unistr.c::ntfs_collate_names() and unistr.c::legal_ansi_char_array[] - * for what I mean but COLLATION_UNICODE_STRING would not give any special - * treatment to any characters at all, but this is speculation. 
- * COLLATION_NTOFS_ULONG - Sorting is done according to ascending le32 key - * values. E.g. used for $SII index in FILE_Secure, which sorts by - * security_id (le32). - * COLLATION_NTOFS_SID - Sorting is done according to ascending SID values. - * E.g. used for $O index in FILE_Extend/$Quota. - * COLLATION_NTOFS_SECURITY_HASH - Sorting is done first by ascending hash - * values and second by ascending security_id values. E.g. used for $SDH - * index in FILE_Secure. - * COLLATION_NTOFS_ULONGS - Sorting is done according to a sequence of ascending - * le32 key values. E.g. used for $O index in FILE_Extend/$ObjId, which - * sorts by object_id (16-byte), by splitting up the object_id in four - * le32 values and using them as individual keys. E.g. take the following - * two security_ids, stored as follows on disk: - * 1st: a1 61 65 b7 65 7b d4 11 9e 3d 00 e0 81 10 42 59 - * 2nd: 38 14 37 d2 d2 f3 d4 11 a5 21 c8 6b 79 b1 97 45 - * To compare them, they are split into four le32 values each, like so: - * 1st: 0xb76561a1 0x11d47b65 0xe0003d9e 0x59421081 - * 2nd: 0xd2371438 0x11d4f3d2 0x6bc821a5 0x4597b179 - * Now, it is apparent why the 2nd object_id collates after the 1st: the - * first le32 value of the 1st object_id is less than the first le32 of - * the 2nd object_id. If the first le32 values of both object_ids were - * equal then the second le32 values would be compared, etc. - */ -enum { - COLLATION_BINARY = cpu_to_le32(0x00), - COLLATION_FILE_NAME = cpu_to_le32(0x01), - COLLATION_UNICODE_STRING = cpu_to_le32(0x02), - COLLATION_NTOFS_ULONG = cpu_to_le32(0x10), - COLLATION_NTOFS_SID = cpu_to_le32(0x11), - COLLATION_NTOFS_SECURITY_HASH = cpu_to_le32(0x12), - COLLATION_NTOFS_ULONGS = cpu_to_le32(0x13), -}; - -typedef le32 COLLATION_RULE; - -/* - * The flags (32-bit) describing attribute properties in the attribute - * definition structure. FIXME: This information is based on Regis's - * information and, according to him, it is not certain and probably - * incomplete. The INDEXABLE flag is fairly certainly correct as only the file - * name attribute has this flag set and this is the only attribute indexed in - * NT4. - */ -enum { - ATTR_DEF_INDEXABLE = cpu_to_le32(0x02), /* Attribute can be - indexed. */ - ATTR_DEF_MULTIPLE = cpu_to_le32(0x04), /* Attribute type - can be present multiple times in the - mft records of an inode. */ - ATTR_DEF_NOT_ZERO = cpu_to_le32(0x08), /* Attribute value - must contain at least one non-zero - byte. */ - ATTR_DEF_INDEXED_UNIQUE = cpu_to_le32(0x10), /* Attribute must be - indexed and the attribute value must be - unique for the attribute type in all of - the mft records of an inode. */ - ATTR_DEF_NAMED_UNIQUE = cpu_to_le32(0x20), /* Attribute must be - named and the name must be unique for - the attribute type in all of the mft - records of an inode. */ - ATTR_DEF_RESIDENT = cpu_to_le32(0x40), /* Attribute must be - resident. */ - ATTR_DEF_ALWAYS_LOG = cpu_to_le32(0x80), /* Always log - modifications to this attribute, - regardless of whether it is resident or - non-resident. Without this, only log - modifications if the attribute is - resident. */ -}; - -typedef le32 ATTR_DEF_FLAGS; - -/* - * The data attribute of FILE_AttrDef contains a sequence of attribute - * definitions for the NTFS volume. With this, it is supposed to be safe for an - * older NTFS driver to mount a volume containing a newer NTFS version without - * damaging it (that's the theory. In practice it's: not damaging it too much). - * Entries are sorted by attribute type. 
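A minimal sketch of the simplest rule, COLLATION_NTOFS_ULONG (the driver's collate.c wraps the same comparison with volume and key-length handling; collate_ntofs_ulong() as written here is hypothetical):

static int collate_ntofs_ulong(le32 key1, le32 key2)
{
        u32 a = le32_to_cpu(key1);
        u32 b = le32_to_cpu(key2);

        return a < b ? -1 : a > b ? 1 : 0;      /* memcmp()-style result */
}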
The flags describe whether the - * attribute can be resident/non-resident and possibly other things, but the - * actual bits are unknown. - */ -typedef struct { -/*hex ofs*/ -/* 0*/ ntfschar name[0x40]; /* Unicode name of the attribute. Zero - terminated. */ -/* 80*/ ATTR_TYPE type; /* Type of the attribute. */ -/* 84*/ le32 display_rule; /* Default display rule. - FIXME: What does it mean? (AIA) */ -/* 88*/ COLLATION_RULE collation_rule; /* Default collation rule. */ -/* 8c*/ ATTR_DEF_FLAGS flags; /* Flags describing the attribute. */ -/* 90*/ sle64 min_size; /* Optional minimum attribute size. */ -/* 98*/ sle64 max_size; /* Maximum size of attribute. */ -/* sizeof() = 0xa0 or 160 bytes */ -} __attribute__ ((__packed__)) ATTR_DEF; - -/* - * Attribute flags (16-bit). - */ -enum { - ATTR_IS_COMPRESSED = cpu_to_le16(0x0001), - ATTR_COMPRESSION_MASK = cpu_to_le16(0x00ff), /* Compression method - mask. Also, first - illegal value. */ - ATTR_IS_ENCRYPTED = cpu_to_le16(0x4000), - ATTR_IS_SPARSE = cpu_to_le16(0x8000), -} __attribute__ ((__packed__)); - -typedef le16 ATTR_FLAGS; - -/* - * Attribute compression. - * - * Only the data attribute is ever compressed in the current ntfs driver in - * Windows. Further, compression is only applied when the data attribute is - * non-resident. Finally, to use compression, the maximum allowed cluster size - * on a volume is 4kib. - * - * The compression method is based on independently compressing blocks of X - * clusters, where X is determined from the compression_unit value found in the - * non-resident attribute record header (more precisely: X = 2^compression_unit - * clusters). On Windows NT/2k, X always is 16 clusters (compression_unit = 4). - * - * There are three different cases of how a compression block of X clusters - * can be stored: - * - * 1) The data in the block is all zero (a sparse block): - * This is stored as a sparse block in the runlist, i.e. the runlist - * entry has length = X and lcn = -1. The mapping pairs array actually - * uses a delta_lcn value length of 0, i.e. delta_lcn is not present at - * all, which is then interpreted by the driver as lcn = -1. - * NOTE: Even uncompressed files can be sparse on NTFS 3.0 volumes, then - * the same principles apply as above, except that the length is not - * restricted to being any particular value. - * - * 2) The data in the block is not compressed: - * This happens when compression doesn't reduce the size of the block - * in clusters. I.e. if compression has a small effect so that the - * compressed data still occupies X clusters, then the uncompressed data - * is stored in the block. - * This case is recognised by the fact that the runlist entry has - * length = X and lcn >= 0. The mapping pairs array stores this as - * normal with a run length of X and some specific delta_lcn, i.e. - * delta_lcn has to be present. - * - * 3) The data in the block is compressed: - * The common case. This case is recognised by the fact that the run - * list entry has length L < X and lcn >= 0. The mapping pairs array - * stores this as normal with a run length of X and some specific - * delta_lcn, i.e. delta_lcn has to be present. This runlist entry is - * immediately followed by a sparse entry with length = X - L and - * lcn = -1. The latter entry is to make up the vcn counting to the - * full compression block size X. - * - * In fact, life is more complicated because adjacent entries of the same type - * can be coalesced. 
This means that one has to keep track of the number of - * clusters handled and work on a basis of X clusters at a time being one - * block. An example: if length L > X this means that this particular runlist - * entry contains a block of length X and part of one or more blocks of length - * L - X. Another example: if length L < X, this does not necessarily mean that - * the block is compressed as it might be that the lcn changes inside the block - * and hence the following runlist entry describes the continuation of the - * potentially compressed block. The block would be compressed if the - * following runlist entry describes at least X - L sparse clusters, thus - * making up the compression block length as described in point 3 above. (Of - * course, there can be several runlist entries with small lengths so that the - * sparse entry does not follow the first data containing entry with - * length < X.) - * - * NOTE: At the end of the compressed attribute value, there most likely is not - * just the right amount of data to make up a compression block, thus this data - * is not even attempted to be compressed. It is just stored as is, unless - * the number of clusters it occupies is reduced when compressed in which case - * it is stored as a compressed compression block, complete with sparse - * clusters at the end. - */ - -/* - * Flags of resident attributes (8-bit). - */ -enum { - RESIDENT_ATTR_IS_INDEXED = 0x01, /* Attribute is referenced in an index - (has implications for deleting and - modifying the attribute). */ -} __attribute__ ((__packed__)); - -typedef u8 RESIDENT_ATTR_FLAGS; - -/* - * Attribute record header. Always aligned to 8-byte boundary. - */ -typedef struct { -/*Ofs*/ -/* 0*/ ATTR_TYPE type; /* The (32-bit) type of the attribute. */ -/* 4*/ le32 length; /* Byte size of the resident part of the - attribute (aligned to 8-byte boundary). - Used to get to the next attribute. */ -/* 8*/ u8 non_resident; /* If 0, attribute is resident. - If 1, attribute is non-resident. */ -/* 9*/ u8 name_length; /* Unicode character size of name of attribute. - 0 if unnamed. */ -/* 10*/ le16 name_offset; /* If name_length != 0, the byte offset to the - beginning of the name from the attribute - record. Note that the name is stored as a - Unicode string. When creating, place offset - just at the end of the record header. Then, - follow with attribute value or mapping pairs - array, resident and non-resident attributes - respectively, aligning to an 8-byte - boundary. */ -/* 12*/ ATTR_FLAGS flags; /* Flags describing the attribute. */ -/* 14*/ le16 instance; /* The instance of this attribute record. This - number is unique within this mft record (see - MFT_RECORD/next_attribute_instance notes in - mft.h for more details). */ -/* 16*/ union { - /* Resident attributes. */ - struct { -/* 16 */ le32 value_length;/* Byte size of attribute value. */ -/* 20 */ le16 value_offset;/* Byte offset of the attribute - value from the start of the - attribute record. When creating, - align to 8-byte boundary if we - have a name present as this might - not have a length of a multiple - of 8-bytes. */ -/* 22 */ RESIDENT_ATTR_FLAGS flags; /* See above. */ -/* 23 */ s8 reserved; /* Reserved/alignment to 8-byte - boundary. */ - } __attribute__ ((__packed__)) resident; - /* Non-resident attributes. */ - struct { -/* 16*/ leVCN lowest_vcn;/* Lowest valid virtual cluster number - for this portion of the attribute value or - 0 if this is the only extent (usually the - case). 
- Only when an attribute list is used - does lowest_vcn != 0 ever occur. */ -/* 24*/ leVCN highest_vcn;/* Highest valid vcn of this extent of - the attribute value. - Usually there is only one - portion, so this usually equals the attribute - value size in clusters minus 1. Can be -1 for - zero length files. Can be 0 for "single extent" - attributes. */ -/* 32*/ le16 mapping_pairs_offset; /* Byte offset from the - beginning of the structure to the mapping pairs - array which contains the mappings between the - vcns and the logical cluster numbers (lcns). - When creating, place this at the end of this - record header aligned to 8-byte boundary. */ -/* 34*/ u8 compression_unit; /* The compression unit expressed - as the log to the base 2 of the number of - clusters in a compression unit. 0 means not - compressed. (This effectively limits the - compression unit size to be a power of two - clusters.) WinNT4 only uses a value of 4. - Sparse files have this set to 0 on XPSP2. */ -/* 35*/ u8 reserved[5]; /* Align to 8-byte boundary. */ -/* The sizes below are only used when lowest_vcn is zero, as otherwise it would - be difficult to keep them up-to-date.*/ -/* 40*/ sle64 allocated_size; /* Byte size of disk space - allocated to hold the attribute value. Always - is a multiple of the cluster size. When a file - is compressed, this field is a multiple of the - compression block size (2^compression_unit) and - it represents the logically allocated space - rather than the actual on disk usage. For this - use the compressed_size (see below). */ -/* 48*/ sle64 data_size; /* Byte size of the attribute - value. Can be larger than allocated_size if - attribute value is compressed or sparse. */ -/* 56*/ sle64 initialized_size; /* Byte size of initialized - portion of the attribute value. Usually equals - data_size. */ -/* sizeof(uncompressed attr) = 64*/ -/* 64*/ sle64 compressed_size; /* Byte size of the attribute - value after compression. Only present when - compressed or sparse. Always is a multiple of - the cluster size. Represents the actual amount - of disk space being used on the disk. */ -/* sizeof(compressed attr) = 72*/ - } __attribute__ ((__packed__)) non_resident; - } __attribute__ ((__packed__)) data; -} __attribute__ ((__packed__)) ATTR_RECORD; - -typedef ATTR_RECORD ATTR_REC; - -/* - * File attribute flags (32-bit) appearing in the file_attributes fields of the - * STANDARD_INFORMATION attribute of MFT_RECORDs and the FILENAME_ATTR - * attributes of MFT_RECORDs and directory index entries. - * - * All of the below flags appear in the directory index entries but only some - * appear in the STANDARD_INFORMATION attribute whilst only some others appear - * in the FILENAME_ATTR attribute of MFT_RECORDs. Unless otherwise stated the - * flags appear in all of the above. - */ -enum { - FILE_ATTR_READONLY = cpu_to_le32(0x00000001), - FILE_ATTR_HIDDEN = cpu_to_le32(0x00000002), - FILE_ATTR_SYSTEM = cpu_to_le32(0x00000004), - /* Old DOS volid. Unused in NT. = cpu_to_le32(0x00000008), */ - - FILE_ATTR_DIRECTORY = cpu_to_le32(0x00000010), - /* Note, FILE_ATTR_DIRECTORY is not considered valid in NT. It is - reserved for the DOS SUBDIRECTORY flag. 
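With MFT_RECORD and ATTR_RECORD both defined, the canonical walk over a record's attributes looks roughly like this (find_attribute() is a hypothetical helper; the bounds checks the real driver performs against bytes_in_use are omitted for brevity):

static ATTR_RECORD *find_attribute(MFT_RECORD *m, ATTR_TYPE type)
{
        ATTR_RECORD *a = (ATTR_RECORD *)((u8 *)m +
                        le16_to_cpu(m->attrs_offset));

        /* The attribute sequence is terminated by an AT_END stub. */
        while (a->type != AT_END) {
                if (a->type == type)
                        return a;
                a = (ATTR_RECORD *)((u8 *)a + le32_to_cpu(a->length));
        }
        return NULL;    /* not present in this mft record */
}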
*/ - FILE_ATTR_ARCHIVE = cpu_to_le32(0x00000020), - FILE_ATTR_DEVICE = cpu_to_le32(0x00000040), - FILE_ATTR_NORMAL = cpu_to_le32(0x00000080), - - FILE_ATTR_TEMPORARY = cpu_to_le32(0x00000100), - FILE_ATTR_SPARSE_FILE = cpu_to_le32(0x00000200), - FILE_ATTR_REPARSE_POINT = cpu_to_le32(0x00000400), - FILE_ATTR_COMPRESSED = cpu_to_le32(0x00000800), - - FILE_ATTR_OFFLINE = cpu_to_le32(0x00001000), - FILE_ATTR_NOT_CONTENT_INDEXED = cpu_to_le32(0x00002000), - FILE_ATTR_ENCRYPTED = cpu_to_le32(0x00004000), - - FILE_ATTR_VALID_FLAGS = cpu_to_le32(0x00007fb7), - /* Note, FILE_ATTR_VALID_FLAGS masks out the old DOS VolId and the - FILE_ATTR_DEVICE and preserves everything else. This mask is used - to obtain all flags that are valid for reading. */ - FILE_ATTR_VALID_SET_FLAGS = cpu_to_le32(0x000031a7), - /* Note, FILE_ATTR_VALID_SET_FLAGS masks out the old DOS VolId, the - F_A_DEVICE, F_A_DIRECTORY, F_A_SPARSE_FILE, F_A_REPARSE_POINT, - F_A_COMPRESSED, and F_A_ENCRYPTED and preserves the rest. This mask - is used to obtain all flags that are valid for setting. */ - /* - * The flag FILE_ATTR_DUP_FILENAME_INDEX_PRESENT is present in all - * FILENAME_ATTR attributes but not in the STANDARD_INFORMATION - * attribute of an mft record. - */ - FILE_ATTR_DUP_FILE_NAME_INDEX_PRESENT = cpu_to_le32(0x10000000), - /* Note, this is a copy of the corresponding bit from the mft record, - telling us whether this is a directory or not, i.e. whether it has - an index root attribute or not. */ - FILE_ATTR_DUP_VIEW_INDEX_PRESENT = cpu_to_le32(0x20000000), - /* Note, this is a copy of the corresponding bit from the mft record, - telling us whether this file has a view index present (eg. object id - index, quota index, one of the security indexes or the encrypting - filesystem related indexes). */ -}; - -typedef le32 FILE_ATTR_FLAGS; - -/* - * NOTE on times in NTFS: All times are in MS standard time format, i.e. they - * are the number of 100-nanosecond intervals since 1st January 1601, 00:00:00 - * universal coordinated time (UTC). (In Linux time starts 1st January 1970, - * 00:00:00 UTC and is stored as the number of 1-second intervals since then.) - */ - -/* - * Attribute: Standard information (0x10). - * - * NOTE: Always resident. - * NOTE: Present in all base file records on a volume. - * NOTE: There is conflicting information about the meaning of each of the time - * fields but the meaning as defined below has been verified to be - * correct by practical experimentation on Windows NT4 SP6a and is hence - * assumed to be the one and only correct interpretation. - */ -typedef struct { -/*Ofs*/ -/* 0*/ sle64 creation_time; /* Time file was created. Updated when - a filename is changed(?). */ -/* 8*/ sle64 last_data_change_time; /* Time the data attribute was last - modified. */ -/* 16*/ sle64 last_mft_change_time; /* Time this mft record was last - modified. */ -/* 24*/ sle64 last_access_time; /* Approximate time when the file was - last accessed (obviously this is not - updated on read-only volumes). In - Windows this is only updated when - accessed if some time delta has - passed since the last update. Also, - last access time updates can be - disabled altogether for speed. */ -/* 32*/ FILE_ATTR_FLAGS file_attributes; /* Flags describing the file. */ -/* 36*/ union { - /* NTFS 1.2 */ - struct { - /* 36*/ u8 reserved12[12]; /* Reserved/alignment to 8-byte - boundary. 
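The conversion implied by the NOTE on times above, as a sketch (the driver keeps its equivalents in time.h; sle64_to_cpu() is the driver's signed le64 conversion helper, and the epoch gap from 1601-01-01 to 1970-01-01 is 369 years including 89 leap days, i.e. 11644473600 seconds):

#define NTFS_TIME_OFFSET ((s64)(369 * 365 + 89) * 24 * 3600 * 10000000)

static inline s64 ntfs_time_to_unix_sec(sle64 time)
{
        /* 100ns ticks since 1601 -> whole seconds since 1970. */
        return (sle64_to_cpu(time) - NTFS_TIME_OFFSET) / 10000000;
}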
*/ - } __attribute__ ((__packed__)) v1; - /* sizeof() = 48 bytes */ - /* NTFS 3.x */ - struct { -/* - * If a volume has been upgraded from a previous NTFS version, then these - * fields are present only if the file has been accessed since the upgrade. - * Recognize the difference by comparing the length of the resident attribute - * value. If it is 48, then the following fields are missing. If it is 72 then - * the fields are present. Maybe just check like this: - * if (resident.ValueLength < sizeof(STANDARD_INFORMATION)) { - * Assume NTFS 1.2- format. - * If (volume version is 3.x) - * Upgrade attribute to NTFS 3.x format. - * else - * Use NTFS 1.2- format for access. - * } else - * Use NTFS 3.x format for access. - * Only problem is that it might be legal to set the length of the value to - * arbitrarily large values thus spoiling this check. - But chkdsk probably - * views that as a corruption, assuming that it behaves like this for all - * attributes. - */ - /* 36*/ le32 maximum_versions; /* Maximum allowed versions for - file. Zero if version numbering is disabled. */ - /* 40*/ le32 version_number; /* This file's version (if any). - Set to zero if maximum_versions is zero. */ - /* 44*/ le32 class_id; /* Class id from bidirectional - class id index (?). */ - /* 48*/ le32 owner_id; /* Owner_id of the user owning - the file. Translate via $Q index in FILE_Extend - /$Quota to the quota control entry for the user - owning the file. Zero if quotas are disabled. */ - /* 52*/ le32 security_id; /* Security_id for the file. - Translate via $SII index and $SDS data stream - in FILE_Secure to the security descriptor. */ - /* 56*/ le64 quota_charged; /* Byte size of the charge to - the quota for all streams of the file. Note: Is - zero if quotas are disabled. */ - /* 64*/ leUSN usn; /* Last update sequence number - of the file. This is a direct index into the - transaction log file ($UsnJrnl). It is zero if - the usn journal is disabled or this file has - not been subject to logging yet. See usnjrnl.h - for details. */ - } __attribute__ ((__packed__)) v3; - /* sizeof() = 72 bytes (NTFS 3.x) */ - } __attribute__ ((__packed__)) ver; -} __attribute__ ((__packed__)) STANDARD_INFORMATION; - -/* - * Attribute: Attribute list (0x20). - * - * - Can be either resident or non-resident. - * - Value consists of a sequence of variable length, 8-byte aligned, - * ATTR_LIST_ENTRY records. - * - The list is not terminated by anything at all! The only way to know when - * the end is reached is to keep track of the current offset and compare it to - * the attribute value size. - * - The attribute list attribute contains one entry for each attribute of - * the file in which the list is located, except for the list attribute - * itself. The list is sorted: first by attribute type, second by attribute - * name (if present), third by instance number. The extents of one - * non-resident attribute (if present) immediately follow after the initial - * extent. They are ordered by lowest_vcn and have their instace set to zero. - * It is not allowed to have two attributes with all sorting keys equal. - * - Further restrictions: - * - If not resident, the vcn to lcn mapping array has to fit inside the - * base mft record. - * - The attribute list attribute value has a maximum size of 256kb. This - * is imposed by the Windows cache manager. 
- * - Attribute lists are only used when the attributes of mft record do not - * fit inside the mft record despite all attributes (that can be made - * non-resident) having been made non-resident. This can happen e.g. when: - * - File has a large number of hard links (lots of file name - * attributes present). - * - The mapping pairs array of some non-resident attribute becomes so - * large due to fragmentation that it overflows the mft record. - * - The security descriptor is very complex (not applicable to - * NTFS 3.0 volumes). - * - There are many named streams. - */ -typedef struct { -/*Ofs*/ -/* 0*/ ATTR_TYPE type; /* Type of referenced attribute. */ -/* 4*/ le16 length; /* Byte size of this entry (8-byte aligned). */ -/* 6*/ u8 name_length; /* Size in Unicode chars of the name of the - attribute or 0 if unnamed. */ -/* 7*/ u8 name_offset; /* Byte offset to beginning of attribute name - (always set this to where the name would - start even if unnamed). */ -/* 8*/ leVCN lowest_vcn; /* Lowest virtual cluster number of this portion - of the attribute value. This is usually 0. It - is non-zero for the case where one attribute - does not fit into one mft record and thus - several mft records are allocated to hold - this attribute. In the latter case, each mft - record holds one extent of the attribute and - there is one attribute list entry for each - extent. NOTE: This is DEFINITELY a signed - value! The windows driver uses cmp, followed - by jg when comparing this, thus it treats it - as signed. */ -/* 16*/ leMFT_REF mft_reference;/* The reference of the mft record holding - the ATTR_RECORD for this portion of the - attribute value. */ -/* 24*/ le16 instance; /* If lowest_vcn = 0, the instance of the - attribute being referenced; otherwise 0. */ -/* 26*/ ntfschar name[0]; /* Use when creating only. When reading use - name_offset to determine the location of the - name. */ -/* sizeof() = 26 + (attribute_name_length * 2) bytes */ -} __attribute__ ((__packed__)) ATTR_LIST_ENTRY; - -/* - * The maximum allowed length for a file name. - */ -#define MAXIMUM_FILE_NAME_LENGTH 255 - -/* - * Possible namespaces for filenames in ntfs (8-bit). - */ -enum { - FILE_NAME_POSIX = 0x00, - /* This is the largest namespace. It is case sensitive and allows all - Unicode characters except for: '\0' and '/'. Beware that in - WinNT/2k/2003 by default files which eg have the same name except - for their case will not be distinguished by the standard utilities - and thus a "del filename" will delete both "filename" and "fileName" - without warning. However if for example Services For Unix (SFU) are - installed and the case sensitive option was enabled at installation - time, then you can create/access/delete such files. - Note that even SFU places restrictions on the filenames beyond the - '\0' and '/' and in particular the following set of characters is - not allowed: '"', '/', '<', '>', '\'. All other characters, - including the ones no allowed in WIN32 namespace are allowed. - Tested with SFU 3.5 (this is now free) running on Windows XP. */ - FILE_NAME_WIN32 = 0x01, - /* The standard WinNT/2k NTFS long filenames. Case insensitive. All - Unicode chars except: '\0', '"', '*', '/', ':', '<', '>', '?', '\', - and '|'. Further, names cannot end with a '.' or a space. */ - FILE_NAME_DOS = 0x02, - /* The standard DOS filenames (8.3 format). Uppercase only. All 8-bit - characters greater space, except: '"', '*', '+', ',', '/', ':', ';', - '<', '=', '>', '?', and '\'. 
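Because the attribute list value has no terminator, iteration is driven purely by the running offset against the value size; a hedged sketch (for_each_attr_list_entry() is hypothetical):

static void for_each_attr_list_entry(u8 *al, unsigned long al_len)
{
        u8 *p = al;

        while (p < al + al_len) {
                ATTR_LIST_ENTRY *e = (ATTR_LIST_ENTRY *)p;

                /* ... inspect e->type, MREF_LE(e->mft_reference), ... */
                /* Real code must validate e->length (non-zero, in
                   bounds) or a corrupt list would loop forever. */
                p += le16_to_cpu(e->length);    /* 8-byte aligned size */
        }
}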
*/ - FILE_NAME_WIN32_AND_DOS = 0x03, - /* 3 means that both the Win32 and the DOS filenames are identical and - hence have been saved in this single filename record. */ -} __attribute__ ((__packed__)); - -typedef u8 FILE_NAME_TYPE_FLAGS; - -/* - * Attribute: Filename (0x30). - * - * NOTE: Always resident. - * NOTE: All fields, except the parent_directory, are only updated when the - * filename is changed. Until then, they just become out of sync with - * reality and the more up to date values are present in the standard - * information attribute. - * NOTE: There is conflicting information about the meaning of each of the time - * fields but the meaning as defined below has been verified to be - * correct by practical experimentation on Windows NT4 SP6a and is hence - * assumed to be the one and only correct interpretation. - */ -typedef struct { -/*hex ofs*/ -/* 0*/ leMFT_REF parent_directory; /* Directory this filename is - referenced from. */ -/* 8*/ sle64 creation_time; /* Time file was created. */ -/* 10*/ sle64 last_data_change_time; /* Time the data attribute was last - modified. */ -/* 18*/ sle64 last_mft_change_time; /* Time this mft record was last - modified. */ -/* 20*/ sle64 last_access_time; /* Time this mft record was last - accessed. */ -/* 28*/ sle64 allocated_size; /* Byte size of on-disk allocated space - for the unnamed data attribute. So - for normal $DATA, this is the - allocated_size from the unnamed - $DATA attribute and for compressed - and/or sparse $DATA, this is the - compressed_size from the unnamed - $DATA attribute. For a directory or - other inode without an unnamed $DATA - attribute, this is always 0. NOTE: - This is a multiple of the cluster - size. */ -/* 30*/ sle64 data_size; /* Byte size of actual data in unnamed - data attribute. For a directory or - other inode without an unnamed $DATA - attribute, this is always 0. */ -/* 38*/ FILE_ATTR_FLAGS file_attributes; /* Flags describing the file. */ -/* 3c*/ union { - /* 3c*/ struct { - /* 3c*/ le16 packed_ea_size; /* Size of the buffer needed to - pack the extended attributes - (EAs), if such are present.*/ - /* 3e*/ le16 reserved; /* Reserved for alignment. */ - } __attribute__ ((__packed__)) ea; - /* 3c*/ struct { - /* 3c*/ le32 reparse_point_tag; /* Type of reparse point, - present only in reparse - points and only if there are - no EAs. */ - } __attribute__ ((__packed__)) rp; - } __attribute__ ((__packed__)) type; -/* 40*/ u8 file_name_length; /* Length of file name in - (Unicode) characters. */ -/* 41*/ FILE_NAME_TYPE_FLAGS file_name_type; /* Namespace of the file name.*/ -/* 42*/ ntfschar file_name[0]; /* File name in Unicode. */ -} __attribute__ ((__packed__)) FILE_NAME_ATTR; - -/* - * GUID structures store globally unique identifiers (GUID). A GUID is a - * 128-bit value consisting of one group of eight hexadecimal digits, followed - * by three groups of four hexadecimal digits each, followed by one group of - * twelve hexadecimal digits. GUIDs are Microsoft's implementation of the - * distributed computing environment (DCE) universally unique identifier (UUID). - * Example of a GUID: - * 1F010768-5A73-BC91-0010A52216A7 - */ -typedef struct { - le32 data1; /* The first eight hexadecimal digits of the GUID. */ - le16 data2; /* The first group of four hexadecimal digits. */ - le16 data3; /* The second group of four hexadecimal digits. */ - u8 data4[8]; /* The first two bytes are the third group of four - hexadecimal digits. The remaining six bytes are the - final 12 hexadecimal digits. 
*/ -} __attribute__ ((__packed__)) GUID; - -/* - * FILE_Extend/$ObjId contains an index named $O. This index contains all - * object_ids present on the volume as the index keys and the corresponding - * mft_record numbers as the index entry data parts. The data part (defined - * below) also contains three other object_ids: - * birth_volume_id - object_id of FILE_Volume on which the file was first - * created. Optional (i.e. can be zero). - * birth_object_id - object_id of file when it was first created. Usually - * equals the object_id. Optional (i.e. can be zero). - * domain_id - Reserved (always zero). - */ -typedef struct { - leMFT_REF mft_reference;/* Mft record containing the object_id in - the index entry key. */ - union { - struct { - GUID birth_volume_id; - GUID birth_object_id; - GUID domain_id; - } __attribute__ ((__packed__)) origin; - u8 extended_info[48]; - } __attribute__ ((__packed__)) opt; -} __attribute__ ((__packed__)) OBJ_ID_INDEX_DATA; - -/* - * Attribute: Object id (NTFS 3.0+) (0x40). - * - * NOTE: Always resident. - */ -typedef struct { - GUID object_id; /* Unique id assigned to the - file.*/ - /* The following fields are optional. The attribute value size is 16 - bytes, i.e. sizeof(GUID), if these are not present at all. Note, - the entries can be present but one or more (or all) can be zero - meaning that that particular value(s) is(are) not defined. */ - union { - struct { - GUID birth_volume_id; /* Unique id of volume on which - the file was first created.*/ - GUID birth_object_id; /* Unique id of file when it was - first created. */ - GUID domain_id; /* Reserved, zero. */ - } __attribute__ ((__packed__)) origin; - u8 extended_info[48]; - } __attribute__ ((__packed__)) opt; -} __attribute__ ((__packed__)) OBJECT_ID_ATTR; - -/* - * The pre-defined IDENTIFIER_AUTHORITIES used as SID_IDENTIFIER_AUTHORITY in - * the SID structure (see below). - */ -//typedef enum { /* SID string prefix. */ -// SECURITY_NULL_SID_AUTHORITY = {0, 0, 0, 0, 0, 0}, /* S-1-0 */ -// SECURITY_WORLD_SID_AUTHORITY = {0, 0, 0, 0, 0, 1}, /* S-1-1 */ -// SECURITY_LOCAL_SID_AUTHORITY = {0, 0, 0, 0, 0, 2}, /* S-1-2 */ -// SECURITY_CREATOR_SID_AUTHORITY = {0, 0, 0, 0, 0, 3}, /* S-1-3 */ -// SECURITY_NON_UNIQUE_AUTHORITY = {0, 0, 0, 0, 0, 4}, /* S-1-4 */ -// SECURITY_NT_SID_AUTHORITY = {0, 0, 0, 0, 0, 5}, /* S-1-5 */ -//} IDENTIFIER_AUTHORITIES; - -/* - * These relative identifiers (RIDs) are used with the above identifier - * authorities to make up universal well-known SIDs. - * - * Note: The relative identifier (RID) refers to the portion of a SID, which - * identifies a user or group in relation to the authority that issued the SID. - * For example, the universal well-known SID Creator Owner ID (S-1-3-0) is - * made up of the identifier authority SECURITY_CREATOR_SID_AUTHORITY (3) and - * the relative identifier SECURITY_CREATOR_OWNER_RID (0). - */ -typedef enum { /* Identifier authority. 
*/ - SECURITY_NULL_RID = 0, /* S-1-0 */ - SECURITY_WORLD_RID = 0, /* S-1-1 */ - SECURITY_LOCAL_RID = 0, /* S-1-2 */ - - SECURITY_CREATOR_OWNER_RID = 0, /* S-1-3 */ - SECURITY_CREATOR_GROUP_RID = 1, /* S-1-3 */ - - SECURITY_CREATOR_OWNER_SERVER_RID = 2, /* S-1-3 */ - SECURITY_CREATOR_GROUP_SERVER_RID = 3, /* S-1-3 */ - - SECURITY_DIALUP_RID = 1, - SECURITY_NETWORK_RID = 2, - SECURITY_BATCH_RID = 3, - SECURITY_INTERACTIVE_RID = 4, - SECURITY_SERVICE_RID = 6, - SECURITY_ANONYMOUS_LOGON_RID = 7, - SECURITY_PROXY_RID = 8, - SECURITY_ENTERPRISE_CONTROLLERS_RID=9, - SECURITY_SERVER_LOGON_RID = 9, - SECURITY_PRINCIPAL_SELF_RID = 0xa, - SECURITY_AUTHENTICATED_USER_RID = 0xb, - SECURITY_RESTRICTED_CODE_RID = 0xc, - SECURITY_TERMINAL_SERVER_RID = 0xd, - - SECURITY_LOGON_IDS_RID = 5, - SECURITY_LOGON_IDS_RID_COUNT = 3, - - SECURITY_LOCAL_SYSTEM_RID = 0x12, - - SECURITY_NT_NON_UNIQUE = 0x15, - - SECURITY_BUILTIN_DOMAIN_RID = 0x20, - - /* - * Well-known domain relative sub-authority values (RIDs). - */ - - /* Users. */ - DOMAIN_USER_RID_ADMIN = 0x1f4, - DOMAIN_USER_RID_GUEST = 0x1f5, - DOMAIN_USER_RID_KRBTGT = 0x1f6, - - /* Groups. */ - DOMAIN_GROUP_RID_ADMINS = 0x200, - DOMAIN_GROUP_RID_USERS = 0x201, - DOMAIN_GROUP_RID_GUESTS = 0x202, - DOMAIN_GROUP_RID_COMPUTERS = 0x203, - DOMAIN_GROUP_RID_CONTROLLERS = 0x204, - DOMAIN_GROUP_RID_CERT_ADMINS = 0x205, - DOMAIN_GROUP_RID_SCHEMA_ADMINS = 0x206, - DOMAIN_GROUP_RID_ENTERPRISE_ADMINS= 0x207, - DOMAIN_GROUP_RID_POLICY_ADMINS = 0x208, - - /* Aliases. */ - DOMAIN_ALIAS_RID_ADMINS = 0x220, - DOMAIN_ALIAS_RID_USERS = 0x221, - DOMAIN_ALIAS_RID_GUESTS = 0x222, - DOMAIN_ALIAS_RID_POWER_USERS = 0x223, - - DOMAIN_ALIAS_RID_ACCOUNT_OPS = 0x224, - DOMAIN_ALIAS_RID_SYSTEM_OPS = 0x225, - DOMAIN_ALIAS_RID_PRINT_OPS = 0x226, - DOMAIN_ALIAS_RID_BACKUP_OPS = 0x227, - - DOMAIN_ALIAS_RID_REPLICATOR = 0x228, - DOMAIN_ALIAS_RID_RAS_SERVERS = 0x229, - DOMAIN_ALIAS_RID_PREW2KCOMPACCESS = 0x22a, -} RELATIVE_IDENTIFIERS; - -/* - * The universal well-known SIDs: - * - * NULL_SID S-1-0-0 - * WORLD_SID S-1-1-0 - * LOCAL_SID S-1-2-0 - * CREATOR_OWNER_SID S-1-3-0 - * CREATOR_GROUP_SID S-1-3-1 - * CREATOR_OWNER_SERVER_SID S-1-3-2 - * CREATOR_GROUP_SERVER_SID S-1-3-3 - * - * (Non-unique IDs) S-1-4 - * - * NT well-known SIDs: - * - * NT_AUTHORITY_SID S-1-5 - * DIALUP_SID S-1-5-1 - * - * NETWORD_SID S-1-5-2 - * BATCH_SID S-1-5-3 - * INTERACTIVE_SID S-1-5-4 - * SERVICE_SID S-1-5-6 - * ANONYMOUS_LOGON_SID S-1-5-7 (aka null logon session) - * PROXY_SID S-1-5-8 - * SERVER_LOGON_SID S-1-5-9 (aka domain controller account) - * SELF_SID S-1-5-10 (self RID) - * AUTHENTICATED_USER_SID S-1-5-11 - * RESTRICTED_CODE_SID S-1-5-12 (running restricted code) - * TERMINAL_SERVER_SID S-1-5-13 (running on terminal server) - * - * (Logon IDs) S-1-5-5-X-Y - * - * (NT non-unique IDs) S-1-5-0x15-... - * - * (Built-in domain) S-1-5-0x20 - */ - -/* - * The SID_IDENTIFIER_AUTHORITY is a 48-bit value used in the SID structure. - * - * NOTE: This is stored as a big endian number, hence the high_part comes - * before the low_part. - */ -typedef union { - struct { - u16 high_part; /* High 16-bits. */ - u32 low_part; /* Low 32-bits. */ - } __attribute__ ((__packed__)) parts; - u8 value[6]; /* Value as individual bytes. */ -} __attribute__ ((__packed__)) SID_IDENTIFIER_AUTHORITY; - -/* - * The SID structure is a variable-length structure used to uniquely identify - * users or groups. SID stands for security identifier. - * - * The standard textual representation of the SID is of the form: - * S-R-I-S-S... 
- * Where: - * - The first "S" is the literal character 'S' identifying the following - * digits as a SID. - * - R is the revision level of the SID expressed as a sequence of digits - * either in decimal or hexadecimal (if the later, prefixed by "0x"). - * - I is the 48-bit identifier_authority, expressed as digits as R above. - * - S... is one or more sub_authority values, expressed as digits as above. - * - * Example SID; the domain-relative SID of the local Administrators group on - * Windows NT/2k: - * S-1-5-32-544 - * This translates to a SID with: - * revision = 1, - * sub_authority_count = 2, - * identifier_authority = {0,0,0,0,0,5}, // SECURITY_NT_AUTHORITY - * sub_authority[0] = 32, // SECURITY_BUILTIN_DOMAIN_RID - * sub_authority[1] = 544 // DOMAIN_ALIAS_RID_ADMINS - */ -typedef struct { - u8 revision; - u8 sub_authority_count; - SID_IDENTIFIER_AUTHORITY identifier_authority; - le32 sub_authority[1]; /* At least one sub_authority. */ -} __attribute__ ((__packed__)) SID; - -/* - * Current constants for SIDs. - */ -typedef enum { - SID_REVISION = 1, /* Current revision level. */ - SID_MAX_SUB_AUTHORITIES = 15, /* Maximum number of those. */ - SID_RECOMMENDED_SUB_AUTHORITIES = 1, /* Will change to around 6 in - a future revision. */ -} SID_CONSTANTS; - -/* - * The predefined ACE types (8-bit, see below). - */ -enum { - ACCESS_MIN_MS_ACE_TYPE = 0, - ACCESS_ALLOWED_ACE_TYPE = 0, - ACCESS_DENIED_ACE_TYPE = 1, - SYSTEM_AUDIT_ACE_TYPE = 2, - SYSTEM_ALARM_ACE_TYPE = 3, /* Not implemented as of Win2k. */ - ACCESS_MAX_MS_V2_ACE_TYPE = 3, - - ACCESS_ALLOWED_COMPOUND_ACE_TYPE= 4, - ACCESS_MAX_MS_V3_ACE_TYPE = 4, - - /* The following are Win2k only. */ - ACCESS_MIN_MS_OBJECT_ACE_TYPE = 5, - ACCESS_ALLOWED_OBJECT_ACE_TYPE = 5, - ACCESS_DENIED_OBJECT_ACE_TYPE = 6, - SYSTEM_AUDIT_OBJECT_ACE_TYPE = 7, - SYSTEM_ALARM_OBJECT_ACE_TYPE = 8, - ACCESS_MAX_MS_OBJECT_ACE_TYPE = 8, - - ACCESS_MAX_MS_V4_ACE_TYPE = 8, - - /* This one is for WinNT/2k. */ - ACCESS_MAX_MS_ACE_TYPE = 8, -} __attribute__ ((__packed__)); - -typedef u8 ACE_TYPES; - -/* - * The ACE flags (8-bit) for audit and inheritance (see below). - * - * SUCCESSFUL_ACCESS_ACE_FLAG is only used with system audit and alarm ACE - * types to indicate that a message is generated (in Windows!) for successful - * accesses. - * - * FAILED_ACCESS_ACE_FLAG is only used with system audit and alarm ACE types - * to indicate that a message is generated (in Windows!) for failed accesses. - */ -enum { - /* The inheritance flags. */ - OBJECT_INHERIT_ACE = 0x01, - CONTAINER_INHERIT_ACE = 0x02, - NO_PROPAGATE_INHERIT_ACE = 0x04, - INHERIT_ONLY_ACE = 0x08, - INHERITED_ACE = 0x10, /* Win2k only. */ - VALID_INHERIT_FLAGS = 0x1f, - - /* The audit flags. */ - SUCCESSFUL_ACCESS_ACE_FLAG = 0x40, - FAILED_ACCESS_ACE_FLAG = 0x80, -} __attribute__ ((__packed__)); - -typedef u8 ACE_FLAGS; - -/* - * An ACE is an access-control entry in an access-control list (ACL). - * An ACE defines access to an object for a specific user or group or defines - * the types of access that generate system-administration messages or alarms - * for a specific user or group. The user or group is identified by a security - * identifier (SID). - * - * Each ACE starts with an ACE_HEADER structure (aligned on 4-byte boundary), - * which specifies the type and size of the ACE. The format of the subsequent - * data depends on the ACE type. - */ -typedef struct { -/*Ofs*/ -/* 0*/ ACE_TYPES type; /* Type of the ACE. */ -/* 1*/ ACE_FLAGS flags; /* Flags describing the ACE. 
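Since a SID is variable length, its on-disk size follows directly from sub_authority_count; for the S-1-5-32-544 example above that is 8 + 2 * 4 = 16 bytes. A sketch (sid_size() is a hypothetical helper):

static inline unsigned int sid_size(const SID *sid)
{
        /* 1 + 1 + 6 fixed bytes, then one le32 per sub-authority. */
        return offsetof(SID, sub_authority) +
                        sid->sub_authority_count * sizeof(le32);
}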
*/ -/* 2*/ le16 size; /* Size in bytes of the ACE. */ -} __attribute__ ((__packed__)) ACE_HEADER; - -/* - * The access mask (32-bit). Defines the access rights. - * - * The specific rights (bits 0 to 15). These depend on the type of the object - * being secured by the ACE. - */ -enum { - /* Specific rights for files and directories are as follows: */ - - /* Right to read data from the file. (FILE) */ - FILE_READ_DATA = cpu_to_le32(0x00000001), - /* Right to list contents of a directory. (DIRECTORY) */ - FILE_LIST_DIRECTORY = cpu_to_le32(0x00000001), - - /* Right to write data to the file. (FILE) */ - FILE_WRITE_DATA = cpu_to_le32(0x00000002), - /* Right to create a file in the directory. (DIRECTORY) */ - FILE_ADD_FILE = cpu_to_le32(0x00000002), - - /* Right to append data to the file. (FILE) */ - FILE_APPEND_DATA = cpu_to_le32(0x00000004), - /* Right to create a subdirectory. (DIRECTORY) */ - FILE_ADD_SUBDIRECTORY = cpu_to_le32(0x00000004), - - /* Right to read extended attributes. (FILE/DIRECTORY) */ - FILE_READ_EA = cpu_to_le32(0x00000008), - - /* Right to write extended attributes. (FILE/DIRECTORY) */ - FILE_WRITE_EA = cpu_to_le32(0x00000010), - - /* Right to execute a file. (FILE) */ - FILE_EXECUTE = cpu_to_le32(0x00000020), - /* Right to traverse the directory. (DIRECTORY) */ - FILE_TRAVERSE = cpu_to_le32(0x00000020), - - /* - * Right to delete a directory and all the files it contains (its - * children), even if the files are read-only. (DIRECTORY) - */ - FILE_DELETE_CHILD = cpu_to_le32(0x00000040), - - /* Right to read file attributes. (FILE/DIRECTORY) */ - FILE_READ_ATTRIBUTES = cpu_to_le32(0x00000080), - - /* Right to change file attributes. (FILE/DIRECTORY) */ - FILE_WRITE_ATTRIBUTES = cpu_to_le32(0x00000100), - - /* - * The standard rights (bits 16 to 23). These are independent of the - * type of object being secured. - */ - - /* Right to delete the object. */ - DELETE = cpu_to_le32(0x00010000), - - /* - * Right to read the information in the object's security descriptor, - * not including the information in the SACL, i.e. right to read the - * security descriptor and owner. - */ - READ_CONTROL = cpu_to_le32(0x00020000), - - /* Right to modify the DACL in the object's security descriptor. */ - WRITE_DAC = cpu_to_le32(0x00040000), - - /* Right to change the owner in the object's security descriptor. */ - WRITE_OWNER = cpu_to_le32(0x00080000), - - /* - * Right to use the object for synchronization. Enables a process to - * wait until the object is in the signalled state. Some object types - * do not support this access right. - */ - SYNCHRONIZE = cpu_to_le32(0x00100000), - - /* - * The following STANDARD_RIGHTS_* are combinations of the above for - * convenience and are defined by the Win32 API. - */ - - /* These are currently defined to READ_CONTROL. */ - STANDARD_RIGHTS_READ = cpu_to_le32(0x00020000), - STANDARD_RIGHTS_WRITE = cpu_to_le32(0x00020000), - STANDARD_RIGHTS_EXECUTE = cpu_to_le32(0x00020000), - - /* Combines DELETE, READ_CONTROL, WRITE_DAC, and WRITE_OWNER access. */ - STANDARD_RIGHTS_REQUIRED = cpu_to_le32(0x000f0000), - - /* - * Combines DELETE, READ_CONTROL, WRITE_DAC, WRITE_OWNER, and - * SYNCHRONIZE access. - */ - STANDARD_RIGHTS_ALL = cpu_to_le32(0x001f0000), - - /* - * The access system ACL and maximum allowed access types (bits 24 to - * 25, bits 26 to 27 are reserved). - */ - ACCESS_SYSTEM_SECURITY = cpu_to_le32(0x01000000), - MAXIMUM_ALLOWED = cpu_to_le32(0x02000000), - - /* - * The generic rights (bits 28 to 31). 
These map onto the standard and
- * specific rights.
- */
-
- /* Read, write, and execute access. */
- GENERIC_ALL = cpu_to_le32(0x10000000),
-
- /* Execute access. */
- GENERIC_EXECUTE = cpu_to_le32(0x20000000),
-
- /*
- * Write access. For files, this maps onto:
- * FILE_APPEND_DATA | FILE_WRITE_ATTRIBUTES | FILE_WRITE_DATA |
- * FILE_WRITE_EA | STANDARD_RIGHTS_WRITE | SYNCHRONIZE
- * For directories, the mapping has the same numerical value. See
- * above for the descriptions of the rights granted.
- */
- GENERIC_WRITE = cpu_to_le32(0x40000000),
-
- /*
- * Read access. For files, this maps onto:
- * FILE_READ_ATTRIBUTES | FILE_READ_DATA | FILE_READ_EA |
- * STANDARD_RIGHTS_READ | SYNCHRONIZE
- * For directories, the mapping has the same numerical value. See
- * above for the descriptions of the rights granted.
- */
- GENERIC_READ = cpu_to_le32(0x80000000),
-};
-
-typedef le32 ACCESS_MASK;
-
-/*
- * The generic mapping array. Used to denote the mapping of each generic
- * access right to a specific access mask.
- *
- * FIXME: What exactly is this and what is it for? (AIA)
- */
-typedef struct {
- ACCESS_MASK generic_read;
- ACCESS_MASK generic_write;
- ACCESS_MASK generic_execute;
- ACCESS_MASK generic_all;
-} __attribute__ ((__packed__)) GENERIC_MAPPING;
-
-/*
- * The predefined ACE type structures are as defined below.
- */
-
-/*
- * ACCESS_ALLOWED_ACE, ACCESS_DENIED_ACE, SYSTEM_AUDIT_ACE, SYSTEM_ALARM_ACE
- */
-typedef struct {
-/* 0 ACE_HEADER; -- Unfolded here as gcc doesn't like unnamed structs. */
- ACE_TYPES type; /* Type of the ACE. */
- ACE_FLAGS flags; /* Flags describing the ACE. */
- le16 size; /* Size in bytes of the ACE. */
-/* 4*/ ACCESS_MASK mask; /* Access mask associated with the ACE. */
-
-/* 8*/ SID sid; /* The SID associated with the ACE. */
-} __attribute__ ((__packed__)) ACCESS_ALLOWED_ACE, ACCESS_DENIED_ACE,
- SYSTEM_AUDIT_ACE, SYSTEM_ALARM_ACE;
-
-/*
- * The object ACE flags (32-bit).
- */
-enum {
- ACE_OBJECT_TYPE_PRESENT = cpu_to_le32(1),
- ACE_INHERITED_OBJECT_TYPE_PRESENT = cpu_to_le32(2),
-};
-
-typedef le32 OBJECT_ACE_FLAGS;
-
-typedef struct {
-/* 0 ACE_HEADER; -- Unfolded here as gcc doesn't like unnamed structs. */
- ACE_TYPES type; /* Type of the ACE. */
- ACE_FLAGS flags; /* Flags describing the ACE. */
- le16 size; /* Size in bytes of the ACE. */
-/* 4*/ ACCESS_MASK mask; /* Access mask associated with the ACE. */
-
-/* 8*/ OBJECT_ACE_FLAGS object_flags; /* Flags describing the object ACE. */
-/* 12*/ GUID object_type;
-/* 28*/ GUID inherited_object_type;
-
-/* 44*/ SID sid; /* The SID associated with the ACE. */
-} __attribute__ ((__packed__)) ACCESS_ALLOWED_OBJECT_ACE,
- ACCESS_DENIED_OBJECT_ACE,
- SYSTEM_AUDIT_OBJECT_ACE,
- SYSTEM_ALARM_OBJECT_ACE;
-
-/*
- * An ACL is an access-control list (ACL).
- * An ACL starts with an ACL header structure, which specifies the size of
- * the ACL and the number of ACEs it contains. The ACL header is followed by
- * zero or more access control entries (ACEs). The ACL as well as each ACE
- * are aligned on 4-byte boundaries.
- */
-typedef struct {
- u8 revision; /* Revision of this ACL. */
- u8 alignment1;
- le16 size; /* Allocated space in bytes for ACL. Includes this
- header, the ACEs and the remaining free space. */
- le16 ace_count; /* Number of ACEs in the ACL. */
- le16 alignment2;
-/* sizeof() = 8 bytes */
-} __attribute__ ((__packed__)) ACL;
-
-/*
- * Current constants for ACLs.
- */
-typedef enum {
- /* Current revision. */
- ACL_REVISION = 2,
- ACL_REVISION_DS = 4,
-
- /* History of revisions.
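 *
 * Editor's illustration (not from the original header): a GENERIC_MAPPING
 * for files, filled from the generic-to-specific mappings documented
 * above. Only the read and write mappings are spelled out in the file
 * rights comments above, so the execute and all slots are left unset
 * here rather than guessed at; the variable name is hypothetical.
 *
 *	static const GENERIC_MAPPING ntfs_file_generic_mapping = {
 *		.generic_read = FILE_READ_ATTRIBUTES | FILE_READ_DATA |
 *				FILE_READ_EA | STANDARD_RIGHTS_READ |
 *				SYNCHRONIZE,
 *		.generic_write = FILE_APPEND_DATA | FILE_WRITE_ATTRIBUTES |
 *				FILE_WRITE_DATA | FILE_WRITE_EA |
 *				STANDARD_RIGHTS_WRITE | SYNCHRONIZE,
 *	};
 *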
*/ - ACL_REVISION1 = 1, - MIN_ACL_REVISION = 2, - ACL_REVISION2 = 2, - ACL_REVISION3 = 3, - ACL_REVISION4 = 4, - MAX_ACL_REVISION = 4, -} ACL_CONSTANTS; - -/* - * The security descriptor control flags (16-bit). - * - * SE_OWNER_DEFAULTED - This boolean flag, when set, indicates that the SID - * pointed to by the Owner field was provided by a defaulting mechanism - * rather than explicitly provided by the original provider of the - * security descriptor. This may affect the treatment of the SID with - * respect to inheritance of an owner. - * - * SE_GROUP_DEFAULTED - This boolean flag, when set, indicates that the SID in - * the Group field was provided by a defaulting mechanism rather than - * explicitly provided by the original provider of the security - * descriptor. This may affect the treatment of the SID with respect to - * inheritance of a primary group. - * - * SE_DACL_PRESENT - This boolean flag, when set, indicates that the security - * descriptor contains a discretionary ACL. If this flag is set and the - * Dacl field of the SECURITY_DESCRIPTOR is null, then a null ACL is - * explicitly being specified. - * - * SE_DACL_DEFAULTED - This boolean flag, when set, indicates that the ACL - * pointed to by the Dacl field was provided by a defaulting mechanism - * rather than explicitly provided by the original provider of the - * security descriptor. This may affect the treatment of the ACL with - * respect to inheritance of an ACL. This flag is ignored if the - * DaclPresent flag is not set. - * - * SE_SACL_PRESENT - This boolean flag, when set, indicates that the security - * descriptor contains a system ACL pointed to by the Sacl field. If this - * flag is set and the Sacl field of the SECURITY_DESCRIPTOR is null, then - * an empty (but present) ACL is being specified. - * - * SE_SACL_DEFAULTED - This boolean flag, when set, indicates that the ACL - * pointed to by the Sacl field was provided by a defaulting mechanism - * rather than explicitly provided by the original provider of the - * security descriptor. This may affect the treatment of the ACL with - * respect to inheritance of an ACL. This flag is ignored if the - * SaclPresent flag is not set. - * - * SE_SELF_RELATIVE - This boolean flag, when set, indicates that the security - * descriptor is in self-relative form. In this form, all fields of the - * security descriptor are contiguous in memory and all pointer fields are - * expressed as offsets from the beginning of the security descriptor. - */ -enum { - SE_OWNER_DEFAULTED = cpu_to_le16(0x0001), - SE_GROUP_DEFAULTED = cpu_to_le16(0x0002), - SE_DACL_PRESENT = cpu_to_le16(0x0004), - SE_DACL_DEFAULTED = cpu_to_le16(0x0008), - - SE_SACL_PRESENT = cpu_to_le16(0x0010), - SE_SACL_DEFAULTED = cpu_to_le16(0x0020), - - SE_DACL_AUTO_INHERIT_REQ = cpu_to_le16(0x0100), - SE_SACL_AUTO_INHERIT_REQ = cpu_to_le16(0x0200), - SE_DACL_AUTO_INHERITED = cpu_to_le16(0x0400), - SE_SACL_AUTO_INHERITED = cpu_to_le16(0x0800), - - SE_DACL_PROTECTED = cpu_to_le16(0x1000), - SE_SACL_PROTECTED = cpu_to_le16(0x2000), - SE_RM_CONTROL_VALID = cpu_to_le16(0x4000), - SE_SELF_RELATIVE = cpu_to_le16(0x8000) -} __attribute__ ((__packed__)); - -typedef le16 SECURITY_DESCRIPTOR_CONTROL; - -/* - * Self-relative security descriptor. Contains the owner and group SIDs as well - * as the sacl and dacl ACLs inside the security descriptor itself. - */ -typedef struct { - u8 revision; /* Revision level of the security descriptor. 
*/
- u8 alignment;
- SECURITY_DESCRIPTOR_CONTROL control; /* Flags qualifying the type of
- the descriptor as well as the following fields. */
- le32 owner; /* Byte offset to a SID representing an object's
- owner. If this is NULL, no owner SID is present in
- the descriptor. */
- le32 group; /* Byte offset to a SID representing an object's
- primary group. If this is NULL, no primary group
- SID is present in the descriptor. */
- le32 sacl; /* Byte offset to a system ACL. Only valid if
- SE_SACL_PRESENT is set in the control field. If
- SE_SACL_PRESENT is set but sacl is NULL, a NULL ACL
- is specified. */
- le32 dacl; /* Byte offset to a discretionary ACL. Only valid if
- SE_DACL_PRESENT is set in the control field. If
- SE_DACL_PRESENT is set but dacl is NULL, a NULL ACL
- (unconditionally granting access) is specified. */
-/* sizeof() = 0x14 bytes */
-} __attribute__ ((__packed__)) SECURITY_DESCRIPTOR_RELATIVE;
-
-/*
- * Absolute security descriptor. Does not contain the owner and group SIDs, nor
- * the sacl and dacl ACLs inside the security descriptor. Instead, it contains
- * pointers to these structures in memory. Obviously, absolute security
- * descriptors are only useful for in-memory representations of security
- * descriptors. On disk, a self-relative security descriptor is used.
- */
-typedef struct {
- u8 revision; /* Revision level of the security descriptor. */
- u8 alignment;
- SECURITY_DESCRIPTOR_CONTROL control; /* Flags qualifying the type of
- the descriptor as well as the following fields. */
- SID *owner; /* Points to a SID representing an object's owner. If
- this is NULL, no owner SID is present in the
- descriptor. */
- SID *group; /* Points to a SID representing an object's primary
- group. If this is NULL, no primary group SID is
- present in the descriptor. */
- ACL *sacl; /* Points to a system ACL. Only valid if
- SE_SACL_PRESENT is set in the control field. If
- SE_SACL_PRESENT is set but sacl is NULL, a NULL ACL
- is specified. */
- ACL *dacl; /* Points to a discretionary ACL. Only valid if
- SE_DACL_PRESENT is set in the control field. If
- SE_DACL_PRESENT is set but dacl is NULL, a NULL ACL
- (unconditionally granting access) is specified. */
-} __attribute__ ((__packed__)) SECURITY_DESCRIPTOR;
-
-/*
- * Current constants for security descriptors.
- */
-typedef enum {
- /* Current revision. */
- SECURITY_DESCRIPTOR_REVISION = 1,
- SECURITY_DESCRIPTOR_REVISION1 = 1,
-
- /* The sizes of the absolute and relative security descriptors are the
- same, as pointers are 32-bit, at least on the ia32 architecture. */
- SECURITY_DESCRIPTOR_MIN_LENGTH = sizeof(SECURITY_DESCRIPTOR),
-} SECURITY_DESCRIPTOR_CONSTANTS;
-
-/*
- * Attribute: Security descriptor (0x50). A standard self-relative security
- * descriptor.
- *
- * NOTE: Can be resident or non-resident.
- * NOTE: Not used in NTFS 3.0+, as security descriptors are stored centrally
- * in FILE_Secure and the correct descriptor is found using the security_id
- * from the standard information attribute.
- */
-typedef SECURITY_DESCRIPTOR_RELATIVE SECURITY_DESCRIPTOR_ATTR;
-
-/*
- * On NTFS 3.0+, all security descriptors are stored in FILE_Secure. Only one
- * referenced instance of each unique security descriptor is stored.
- *
- * FILE_Secure contains no unnamed data attribute, i.e. it has zero length. It
- * does, however, contain two indexes ($SDH and $SII) as well as a named data
- * stream ($SDS).
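 *
 * Editor's illustration (not from the original header): because all
 * fields of a self-relative security descriptor are byte offsets from
 * the start of the descriptor, locating a part of it is simple pointer
 * arithmetic. The helper name is hypothetical; a real implementation
 * would also bounds-check the offsets against the attribute size.
 *
 *	static const ACL *ntfs_sd_get_dacl(
 *			const SECURITY_DESCRIPTOR_RELATIVE *sd)
 *	{
 *		// No DACL unless SE_DACL_PRESENT is set.
 *		if (!(sd->control & SE_DACL_PRESENT))
 *			return NULL;
 *		// Present but NULL DACL: unconditionally grants access.
 *		if (!sd->dacl)
 *			return NULL;
 *		return (const ACL*)((const u8*)sd + le32_to_cpu(sd->dacl));
 *	}
 *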
- *
- * Every unique security descriptor is assigned a unique security identifier
- * (security_id, not to be confused with a SID). The security_id is unique for
- * the NTFS volume and is used as an index into the $SII index, which maps
- * security_ids to the security descriptor's storage location within the $SDS
- * data attribute. The $SII index is sorted by ascending security_id.
- *
- * A simple hash is computed from each security descriptor. This hash is used
- * as an index into the $SDH index, which maps security descriptor hashes to
- * the security descriptor's storage location within the $SDS data attribute.
- * The $SDH index is sorted by security descriptor hash and is stored in a B+
- * tree. When searching $SDH (with the intent of determining whether or not a
- * new security descriptor is already present in the $SDS data stream), if a
- * matching hash is found, but the security descriptors do not match, the
- * search in the $SDH index is continued, searching for a next matching hash.
- *
- * When a precise match is found, the security_id corresponding to the security
- * descriptor in the $SDS attribute is read from the found $SDH index entry and
- * is stored in the $STANDARD_INFORMATION attribute of the file/directory to
- * which the security descriptor is being applied. The $STANDARD_INFORMATION
- * attribute is present in all base mft records (i.e. in all files and
- * directories).
- *
- * If a match is not found, the security descriptor is assigned a new unique
- * security_id and is added to the $SDS data attribute. Then, entries
- * referencing this security descriptor in the $SDS data attribute are
- * added to the $SDH and $SII indexes.
- *
- * Note: Entries are never deleted from FILE_Secure, even if nothing
- * references an entry any more.
- */
-
-/*
- * This header precedes each security descriptor in the $SDS data stream.
- * This is also the index entry data part of both the $SII and $SDH indexes.
- */
-typedef struct {
- le32 hash; /* Hash of the security descriptor. */
- le32 security_id; /* The security_id assigned to the descriptor. */
- le64 offset; /* Byte offset of this entry in the $SDS stream. */
- le32 length; /* Size in bytes of this entry in $SDS stream. */
-} __attribute__ ((__packed__)) SECURITY_DESCRIPTOR_HEADER;
-
-/*
- * The $SDS data stream contains the security descriptors, aligned on 16-byte
- * boundaries, sorted by security_id in a B+ tree. Security descriptors cannot
- * cross 256kib boundaries (this restriction is imposed by the Windows cache
- * manager). Each security descriptor is contained in a SDS_ENTRY structure.
- * Also, each security descriptor is stored twice in the $SDS stream with a
- * fixed offset of 0x40000 bytes (256kib, the Windows cache manager's max size)
- * between them; i.e. if a SDS_ENTRY specifies an offset of 0x51d0, then the
- * first copy of the security descriptor will be at offset 0x51d0 in the
- * $SDS data stream and the second copy will be at offset 0x451d0.
- */
-typedef struct {
-/*Ofs*/
-/* 0 SECURITY_DESCRIPTOR_HEADER; -- Unfolded here as gcc doesn't like
- unnamed structs. */
- le32 hash; /* Hash of the security descriptor. */
- le32 security_id; /* The security_id assigned to the descriptor. */
- le64 offset; /* Byte offset of this entry in the $SDS stream. */
- le32 length; /* Size in bytes of this entry in $SDS stream. */
-/* 20*/ SECURITY_DESCRIPTOR_RELATIVE sid; /* The self-relative security
- descriptor.
*/ -} __attribute__ ((__packed__)) SDS_ENTRY; - -/* - * The index entry key used in the $SII index. The collation type is - * COLLATION_NTOFS_ULONG. - */ -typedef struct { - le32 security_id; /* The security_id assigned to the descriptor. */ -} __attribute__ ((__packed__)) SII_INDEX_KEY; - -/* - * The index entry key used in the $SDH index. The keys are sorted first by - * hash and then by security_id. The collation rule is - * COLLATION_NTOFS_SECURITY_HASH. - */ -typedef struct { - le32 hash; /* Hash of the security descriptor. */ - le32 security_id; /* The security_id assigned to the descriptor. */ -} __attribute__ ((__packed__)) SDH_INDEX_KEY; - -/* - * Attribute: Volume name (0x60). - * - * NOTE: Always resident. - * NOTE: Present only in FILE_Volume. - */ -typedef struct { - ntfschar name[0]; /* The name of the volume in Unicode. */ -} __attribute__ ((__packed__)) VOLUME_NAME; - -/* - * Possible flags for the volume (16-bit). - */ -enum { - VOLUME_IS_DIRTY = cpu_to_le16(0x0001), - VOLUME_RESIZE_LOG_FILE = cpu_to_le16(0x0002), - VOLUME_UPGRADE_ON_MOUNT = cpu_to_le16(0x0004), - VOLUME_MOUNTED_ON_NT4 = cpu_to_le16(0x0008), - - VOLUME_DELETE_USN_UNDERWAY = cpu_to_le16(0x0010), - VOLUME_REPAIR_OBJECT_ID = cpu_to_le16(0x0020), - - VOLUME_CHKDSK_UNDERWAY = cpu_to_le16(0x4000), - VOLUME_MODIFIED_BY_CHKDSK = cpu_to_le16(0x8000), - - VOLUME_FLAGS_MASK = cpu_to_le16(0xc03f), - - /* To make our life easier when checking if we must mount read-only. */ - VOLUME_MUST_MOUNT_RO_MASK = cpu_to_le16(0xc027), -} __attribute__ ((__packed__)); - -typedef le16 VOLUME_FLAGS; - -/* - * Attribute: Volume information (0x70). - * - * NOTE: Always resident. - * NOTE: Present only in FILE_Volume. - * NOTE: Windows 2000 uses NTFS 3.0 while Windows NT4 service pack 6a uses - * NTFS 1.2. I haven't personally seen other values yet. - */ -typedef struct { - le64 reserved; /* Not used (yet?). */ - u8 major_ver; /* Major version of the ntfs format. */ - u8 minor_ver; /* Minor version of the ntfs format. */ - VOLUME_FLAGS flags; /* Bit array of VOLUME_* flags. */ -} __attribute__ ((__packed__)) VOLUME_INFORMATION; - -/* - * Attribute: Data attribute (0x80). - * - * NOTE: Can be resident or non-resident. - * - * Data contents of a file (i.e. the unnamed stream) or of a named stream. - */ -typedef struct { - u8 data[0]; /* The file's data contents. */ -} __attribute__ ((__packed__)) DATA_ATTR; - -/* - * Index header flags (8-bit). - */ -enum { - /* - * When index header is in an index root attribute: - */ - SMALL_INDEX = 0, /* The index is small enough to fit inside the index - root attribute and there is no index allocation - attribute present. */ - LARGE_INDEX = 1, /* The index is too large to fit in the index root - attribute and/or an index allocation attribute is - present. */ - /* - * When index header is in an index block, i.e. is part of index - * allocation attribute: - */ - LEAF_NODE = 0, /* This is a leaf node, i.e. there are no more nodes - branching off it. */ - INDEX_NODE = 1, /* This node indexes other nodes, i.e. it is not a leaf - node. */ - NODE_MASK = 1, /* Mask for accessing the *_NODE bits. */ -} __attribute__ ((__packed__)); - -typedef u8 INDEX_HEADER_FLAGS; - -/* - * This is the header for indexes, describing the INDEX_ENTRY records, which - * follow the INDEX_HEADER. Together the index header and the index entries - * make up a complete index. 
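 *
 * Editor's illustration (not from the original header), referring back to
 * the volume flags above: VOLUME_MUST_MOUNT_RO_MASK collects the dirty,
 * resize-log-file, upgrade-on-mount, repair-object-id and chkdsk flags,
 * so a driver could decide on a read-only mount roughly like this (the
 * helper name is hypothetical):
 *
 *	static inline bool ntfs_volume_must_mount_ro(
 *			const VOLUME_INFORMATION *vi)
 *	{
 *		return (vi->flags & VOLUME_MUST_MOUNT_RO_MASK) != 0;
 *	}
 *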
- * - * IMPORTANT NOTE: The offset, length and size structure members are counted - * relative to the start of the index header structure and not relative to the - * start of the index root or index allocation structures themselves. - */ -typedef struct { - le32 entries_offset; /* Byte offset to first INDEX_ENTRY - aligned to 8-byte boundary. */ - le32 index_length; /* Data size of the index in bytes, - i.e. bytes used from allocated - size, aligned to 8-byte boundary. */ - le32 allocated_size; /* Byte size of this index (block), - multiple of 8 bytes. */ - /* NOTE: For the index root attribute, the above two numbers are always - equal, as the attribute is resident and it is resized as needed. In - the case of the index allocation attribute the attribute is not - resident and hence the allocated_size is a fixed value and must - equal the index_block_size specified by the INDEX_ROOT attribute - corresponding to the INDEX_ALLOCATION attribute this INDEX_BLOCK - belongs to. */ - INDEX_HEADER_FLAGS flags; /* Bit field of INDEX_HEADER_FLAGS. */ - u8 reserved[3]; /* Reserved/align to 8-byte boundary. */ -} __attribute__ ((__packed__)) INDEX_HEADER; - -/* - * Attribute: Index root (0x90). - * - * NOTE: Always resident. - * - * This is followed by a sequence of index entries (INDEX_ENTRY structures) - * as described by the index header. - * - * When a directory is small enough to fit inside the index root then this - * is the only attribute describing the directory. When the directory is too - * large to fit in the index root, on the other hand, two additional attributes - * are present: an index allocation attribute, containing sub-nodes of the B+ - * directory tree (see below), and a bitmap attribute, describing which virtual - * cluster numbers (vcns) in the index allocation attribute are in use by an - * index block. - * - * NOTE: The root directory (FILE_root) contains an entry for itself. Other - * directories do not contain entries for themselves, though. - */ -typedef struct { - ATTR_TYPE type; /* Type of the indexed attribute. Is - $FILE_NAME for directories, zero - for view indexes. No other values - allowed. */ - COLLATION_RULE collation_rule; /* Collation rule used to sort the - index entries. If type is $FILE_NAME, - this must be COLLATION_FILE_NAME. */ - le32 index_block_size; /* Size of each index block in bytes (in - the index allocation attribute). */ - u8 clusters_per_index_block; /* Cluster size of each index block (in - the index allocation attribute), when - an index block is >= than a cluster, - otherwise this will be the log of - the size (like how the encoding of - the mft record size and the index - record size found in the boot sector - work). Has to be a power of 2. */ - u8 reserved[3]; /* Reserved/align to 8-byte boundary. */ - INDEX_HEADER index; /* Index header describing the - following index entries. */ -} __attribute__ ((__packed__)) INDEX_ROOT; - -/* - * Attribute: Index allocation (0xa0). - * - * NOTE: Always non-resident (doesn't make sense to be resident anyway!). - * - * This is an array of index blocks. Each index block starts with an - * INDEX_BLOCK structure containing an index header, followed by a sequence of - * index entries (INDEX_ENTRY structures), as described by the INDEX_HEADER. - */ -typedef struct { -/* 0 NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. */ - NTFS_RECORD_TYPE magic; /* Magic is "INDX". */ - le16 usa_ofs; /* See NTFS_RECORD definition. */ - le16 usa_count; /* See NTFS_RECORD definition. 
*/ - -/* 8*/ sle64 lsn; /* $LogFile sequence number of the last - modification of this index block. */ -/* 16*/ leVCN index_block_vcn; /* Virtual cluster number of the index block. - If the cluster_size on the volume is <= the - index_block_size of the directory, - index_block_vcn counts in units of clusters, - and in units of sectors otherwise. */ -/* 24*/ INDEX_HEADER index; /* Describes the following index entries. */ -/* sizeof()= 40 (0x28) bytes */ -/* - * When creating the index block, we place the update sequence array at this - * offset, i.e. before we start with the index entries. This also makes sense, - * otherwise we could run into problems with the update sequence array - * containing in itself the last two bytes of a sector which would mean that - * multi sector transfer protection wouldn't work. As you can't protect data - * by overwriting it since you then can't get it back... - * When reading use the data from the ntfs record header. - */ -} __attribute__ ((__packed__)) INDEX_BLOCK; - -typedef INDEX_BLOCK INDEX_ALLOCATION; - -/* - * The system file FILE_Extend/$Reparse contains an index named $R listing - * all reparse points on the volume. The index entry keys are as defined - * below. Note, that there is no index data associated with the index entries. - * - * The index entries are sorted by the index key file_id. The collation rule is - * COLLATION_NTOFS_ULONGS. FIXME: Verify whether the reparse_tag is not the - * primary key / is not a key at all. (AIA) - */ -typedef struct { - le32 reparse_tag; /* Reparse point type (inc. flags). */ - leMFT_REF file_id; /* Mft record of the file containing the - reparse point attribute. */ -} __attribute__ ((__packed__)) REPARSE_INDEX_KEY; - -/* - * Quota flags (32-bit). - * - * The user quota flags. Names explain meaning. - */ -enum { - QUOTA_FLAG_DEFAULT_LIMITS = cpu_to_le32(0x00000001), - QUOTA_FLAG_LIMIT_REACHED = cpu_to_le32(0x00000002), - QUOTA_FLAG_ID_DELETED = cpu_to_le32(0x00000004), - - QUOTA_FLAG_USER_MASK = cpu_to_le32(0x00000007), - /* This is a bit mask for the user quota flags. */ - - /* - * These flags are only present in the quota defaults index entry, i.e. - * in the entry where owner_id = QUOTA_DEFAULTS_ID. - */ - QUOTA_FLAG_TRACKING_ENABLED = cpu_to_le32(0x00000010), - QUOTA_FLAG_ENFORCEMENT_ENABLED = cpu_to_le32(0x00000020), - QUOTA_FLAG_TRACKING_REQUESTED = cpu_to_le32(0x00000040), - QUOTA_FLAG_LOG_THRESHOLD = cpu_to_le32(0x00000080), - - QUOTA_FLAG_LOG_LIMIT = cpu_to_le32(0x00000100), - QUOTA_FLAG_OUT_OF_DATE = cpu_to_le32(0x00000200), - QUOTA_FLAG_CORRUPT = cpu_to_le32(0x00000400), - QUOTA_FLAG_PENDING_DELETES = cpu_to_le32(0x00000800), -}; - -typedef le32 QUOTA_FLAGS; - -/* - * The system file FILE_Extend/$Quota contains two indexes $O and $Q. Quotas - * are on a per volume and per user basis. - * - * The $Q index contains one entry for each existing user_id on the volume. The - * index key is the user_id of the user/group owning this quota control entry, - * i.e. the key is the owner_id. The user_id of the owner of a file, i.e. the - * owner_id, is found in the standard information attribute. The collation rule - * for $Q is COLLATION_NTOFS_ULONG. - * - * The $O index contains one entry for each user/group who has been assigned - * a quota on that volume. The index key holds the SID of the user_id the - * entry belongs to, i.e. the owner_id. The collation rule for $O is - * COLLATION_NTOFS_SID. - * - * The $O index entry data is the user_id of the user corresponding to the SID. 
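 *
 * Editor's illustration (not from the original header): the two-step
 * quota lookup described above, as pseudo-code. The index lookup helper
 * named here is hypothetical; only the shape of the lookup is the point.
 *
 *	// 1. $O maps the owner's SID to its owner_id.
 *	owner_id = ntfs_index_lookup(quota_o, &sid, sid_size);
 *	// 2. $Q maps the owner_id to its quota control entry.
 *	qce = ntfs_index_lookup(quota_q, &owner_id, sizeof(owner_id));
 *	// 3. In the entry, a threshold or limit of -1 means "not limited".
 *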
- * This user_id is used as an index into $Q to find the quota control entry
- * associated with the SID.
- *
- * The $Q index entry data is the quota control entry and is defined below.
- */
-typedef struct {
- le32 version; /* Currently equals 2. */
- QUOTA_FLAGS flags; /* Flags describing this quota entry. */
- le64 bytes_used; /* How many bytes of the quota are in use. */
- sle64 change_time; /* Last time this quota entry was changed. */
- sle64 threshold; /* Soft quota (-1 if not limited). */
- sle64 limit; /* Hard quota (-1 if not limited). */
- sle64 exceeded_time; /* How long the soft quota has been exceeded. */
- SID sid; /* The SID of the user/object associated with
- this quota entry. Equals zero for the quota
- defaults entry (and in fact on a WinXP
- volume, it is not present at all). */
-} __attribute__ ((__packed__)) QUOTA_CONTROL_ENTRY;
-
-/*
- * Predefined owner_id values (32-bit).
- */
-enum {
- QUOTA_INVALID_ID = cpu_to_le32(0x00000000),
- QUOTA_DEFAULTS_ID = cpu_to_le32(0x00000001),
- QUOTA_FIRST_USER_ID = cpu_to_le32(0x00000100),
-};
-
-/*
- * Current constants for quota control entries.
- */
-typedef enum {
- /* Current version. */
- QUOTA_VERSION = 2,
-} QUOTA_CONTROL_ENTRY_CONSTANTS;
-
-/*
- * Index entry flags (16-bit).
- */
-enum {
- INDEX_ENTRY_NODE = cpu_to_le16(1), /* This entry contains a
- sub-node, i.e. a reference to an index block in form of
- a virtual cluster number (see below). */
- INDEX_ENTRY_END = cpu_to_le16(2), /* This signifies the last
- entry in an index block. The index entry does not
- represent a file but it can point to a sub-node. */
-
- INDEX_ENTRY_SPACE_FILLER = cpu_to_le16(0xffff), /* gcc: Force
- enum bit width to 16-bit. */
-} __attribute__ ((__packed__));
-
-typedef le16 INDEX_ENTRY_FLAGS;
-
-/*
- * This is the index entry header (see below).
- */
-typedef struct {
-/* 0*/ union {
- struct { /* Only valid when INDEX_ENTRY_END is not set. */
- leMFT_REF indexed_file; /* The mft reference of the file
- described by this index
- entry. Used for directory
- indexes. */
- } __attribute__ ((__packed__)) dir;
- struct { /* Used for views/indexes to find the entry's data. */
- le16 data_offset; /* Data byte offset from this
- INDEX_ENTRY. Follows the
- index key. */
- le16 data_length; /* Data length in bytes. */
- le32 reservedV; /* Reserved (zero). */
- } __attribute__ ((__packed__)) vi;
- } __attribute__ ((__packed__)) data;
-/* 8*/ le16 length; /* Byte size of this index entry, multiple of
- 8-bytes. */
-/* 10*/ le16 key_length; /* Byte size of the key value, which is in the
- index entry. It follows field reserved. Not
- multiple of 8-bytes. */
-/* 12*/ INDEX_ENTRY_FLAGS flags; /* Bit field of INDEX_ENTRY_* flags. */
-/* 14*/ le16 reserved; /* Reserved/align to 8-byte boundary. */
-/* sizeof() = 16 bytes */
-} __attribute__ ((__packed__)) INDEX_ENTRY_HEADER;
-
-/*
- * This is an index entry. A sequence of such entries follows each INDEX_HEADER
- * structure. Together they make up a complete index. The index follows either
- * an index root attribute or an index allocation attribute.
- *
- * NOTE: Before NTFS 3.0 only filename attributes were indexed.
- */
-typedef struct {
-/*Ofs*/
-/* 0 INDEX_ENTRY_HEADER; -- Unfolded here as gcc dislikes unnamed structs. */
- union {
- struct { /* Only valid when INDEX_ENTRY_END is not set. */
- leMFT_REF indexed_file; /* The mft reference of the file
- described by this index
- entry. Used for directory
- indexes.
*/
- } __attribute__ ((__packed__)) dir;
- struct { /* Used for views/indexes to find the entry's data. */
- le16 data_offset; /* Data byte offset from this
- INDEX_ENTRY. Follows the
- index key. */
- le16 data_length; /* Data length in bytes. */
- le32 reservedV; /* Reserved (zero). */
- } __attribute__ ((__packed__)) vi;
- } __attribute__ ((__packed__)) data;
- le16 length; /* Byte size of this index entry, multiple of
- 8-bytes. */
- le16 key_length; /* Byte size of the key value, which is in the
- index entry. It follows field reserved. Not
- multiple of 8-bytes. */
- INDEX_ENTRY_FLAGS flags; /* Bit field of INDEX_ENTRY_* flags. */
- le16 reserved; /* Reserved/align to 8-byte boundary. */
-
-/* 16*/ union { /* The key of the indexed attribute. NOTE: Only present
- if INDEX_ENTRY_END bit in flags is not set. NOTE: On
- NTFS versions before 3.0 the only valid key is the
- FILE_NAME_ATTR. On NTFS 3.0+ the following
- additional index keys are defined: */
- FILE_NAME_ATTR file_name;/* $I30 index in directories. */
- SII_INDEX_KEY sii; /* $SII index in $Secure. */
- SDH_INDEX_KEY sdh; /* $SDH index in $Secure. */
- GUID object_id; /* $O index in FILE_Extend/$ObjId: The
- object_id of the mft record found in
- the data part of the index. */
- REPARSE_INDEX_KEY reparse; /* $R index in
- FILE_Extend/$Reparse. */
- SID sid; /* $O index in FILE_Extend/$Quota:
- SID of the owner of the user_id. */
- le32 owner_id; /* $Q index in FILE_Extend/$Quota:
- user_id of the owner of the quota
- control entry in the data part of
- the index. */
- } __attribute__ ((__packed__)) key;
- /* The (optional) index data is inserted here when creating. */
- // leVCN vcn; /* If INDEX_ENTRY_NODE bit in flags is set, the last
- // eight bytes of this index entry contain the virtual
- // cluster number of the index block that holds the
- // entries immediately preceding the current entry (the
- // vcn references the corresponding cluster in the data
- // of the non-resident index allocation attribute). If
- // the key_length is zero, then the vcn immediately
- // follows the INDEX_ENTRY_HEADER. Regardless of
- // key_length, the address of the 8-byte boundary
- // aligned vcn of INDEX_ENTRY{_HEADER} *ie is given by
- // (char*)ie + le16_to_cpu(ie->length) - sizeof(VCN),
- // where sizeof(VCN) can be hardcoded as 8 if wanted. */
-} __attribute__ ((__packed__)) INDEX_ENTRY;
-
-/*
- * Attribute: Bitmap (0xb0).
- *
- * Contains an array of bits (aka a bitfield).
- *
- * When used in conjunction with the index allocation attribute, each bit
- * corresponds to one index block within the index allocation attribute. Thus
- * the number of bits in the bitmap * index block size / cluster size is the
- * number of clusters in the index allocation attribute.
- */
-typedef struct {
- u8 bitmap[0]; /* Array of bits. */
-} __attribute__ ((__packed__)) BITMAP_ATTR;
-
-/*
- * The reparse point tag defines the type of the reparse point. It also
- * includes several flags, which further describe the reparse point.
- *
- * The reparse point tag is an unsigned 32-bit value divided in three parts:
- *
- * 1. The least significant 16 bits (i.e. bits 0 to 15) specify the type of
- * the reparse point.
- * 2. The 13 bits after this (i.e. bits 16 to 28) are reserved for future use.
- * 3. The most significant three bits are flags describing the reparse point.
- * They are defined as follows:
- * bit 29: Name surrogate bit. If set, the filename is an alias for
- * another object in the system.
- * bit 30: High-latency bit.
If set, accessing the first byte of data will
- * be slow. (E.g. the data is stored on a tape drive.)
- * bit 31: Microsoft bit. If set, the tag is owned by Microsoft. User
- * defined tags have to use zero here.
- *
- * These are the predefined reparse point tags:
- */
-enum {
- IO_REPARSE_TAG_IS_ALIAS = cpu_to_le32(0x20000000),
- IO_REPARSE_TAG_IS_HIGH_LATENCY = cpu_to_le32(0x40000000),
- IO_REPARSE_TAG_IS_MICROSOFT = cpu_to_le32(0x80000000),
-
- IO_REPARSE_TAG_RESERVED_ZERO = cpu_to_le32(0x00000000),
- IO_REPARSE_TAG_RESERVED_ONE = cpu_to_le32(0x00000001),
- IO_REPARSE_TAG_RESERVED_RANGE = cpu_to_le32(0x00000001),
-
- IO_REPARSE_TAG_NSS = cpu_to_le32(0x68000005),
- IO_REPARSE_TAG_NSS_RECOVER = cpu_to_le32(0x68000006),
- IO_REPARSE_TAG_SIS = cpu_to_le32(0x68000007),
- IO_REPARSE_TAG_DFS = cpu_to_le32(0x68000008),
-
- IO_REPARSE_TAG_MOUNT_POINT = cpu_to_le32(0x88000003),
-
- IO_REPARSE_TAG_HSM = cpu_to_le32(0xa8000004),
-
- IO_REPARSE_TAG_SYMBOLIC_LINK = cpu_to_le32(0xe8000000),
-
- IO_REPARSE_TAG_VALID_VALUES = cpu_to_le32(0xe000ffff),
-};
-
-/*
- * Attribute: Reparse point (0xc0).
- *
- * NOTE: Can be resident or non-resident.
- */
-typedef struct {
- le32 reparse_tag; /* Reparse point type (inc. flags). */
- le16 reparse_data_length; /* Byte size of reparse data. */
- le16 reserved; /* Align to 8-byte boundary. */
- u8 reparse_data[0]; /* Meaning depends on reparse_tag. */
-} __attribute__ ((__packed__)) REPARSE_POINT;
-
-/*
- * Attribute: Extended attribute (EA) information (0xd0).
- *
- * NOTE: Always resident. (Is this true???)
- */
-typedef struct {
- le16 ea_length; /* Byte size of the packed extended
- attributes. */
- le16 need_ea_count; /* The number of extended attributes which have
- the NEED_EA bit set. */
- le32 ea_query_length; /* Byte size of the buffer required to query
- the extended attributes when calling
- ZwQueryEaFile() in Windows NT/2k. I.e. the
- byte size of the unpacked extended
- attributes. */
-} __attribute__ ((__packed__)) EA_INFORMATION;
-
-/*
- * Extended attribute flags (8-bit).
- */
-enum {
- NEED_EA = 0x80 /* If set the file to which the EA belongs
- cannot be interpreted without understanding
- the associated extended attributes. */
-} __attribute__ ((__packed__));
-
-typedef u8 EA_FLAGS;
-
-/*
- * Attribute: Extended attribute (EA) (0xe0).
- *
- * NOTE: Can be resident or non-resident.
- *
- * Like the attribute list and the index buffer list, the EA attribute value is
- * a sequence of EA_ATTR variable length records.
- */
-typedef struct {
- le32 next_entry_offset; /* Offset to the next EA_ATTR. */
- EA_FLAGS flags; /* Flags describing the EA. */
- u8 ea_name_length; /* Length of the name of the EA in bytes
- excluding the '\0' byte terminator. */
- le16 ea_value_length; /* Byte size of the EA's value. */
- u8 ea_name[0]; /* Name of the EA. Note this is ASCII, not
- Unicode and it is zero terminated. */
- u8 ea_value[0]; /* The value of the EA. Immediately follows
- the name. */
-} __attribute__ ((__packed__)) EA_ATTR;
-
-/*
- * Attribute: Property set (0xf0).
- *
- * Intended to support Native Structure Storage (NSS) - a feature removed from
- * NTFS 3.0 during beta testing.
- */
-typedef struct {
- /* Irrelevant as feature unused. */
-} __attribute__ ((__packed__)) PROPERTY_SET;
-
-/*
- * Attribute: Logged utility stream (0x100).
- *
- * NOTE: Can be resident or non-resident.
- *
- * Operations on this attribute are logged to the journal ($LogFile) like
- * normal metadata changes.
- *
- * Used by the Encrypting File System (EFS).
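 *
 * Editor's illustration (not from the original header), referring back to
 * the reparse point tag layout above: the type lives in bits 0-15 and the
 * flag bits can be tested directly against the IO_REPARSE_TAG_IS_*
 * constants. The helper names are hypothetical.
 *
 *	static inline u16 ntfs_reparse_tag_type(le32 tag)
 *	{
 *		return (u16)(le32_to_cpu(tag) & 0xffff);
 *	}
 *
 *	static inline bool ntfs_reparse_tag_is_microsoft(le32 tag)
 *	{
 *		return (tag & IO_REPARSE_TAG_IS_MICROSOFT) != 0;
 *	}
 *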
All encrypted files have this - * attribute with the name $EFS. - */ -typedef struct { - /* Can be anything the creator chooses. */ - /* EFS uses it as follows: */ - // FIXME: Type this info, verifying it along the way. (AIA) -} __attribute__ ((__packed__)) LOGGED_UTILITY_STREAM, EFS_ATTR; - -#endif /* _LINUX_NTFS_LAYOUT_H */ diff --git a/fs/ntfs/lcnalloc.c b/fs/ntfs/lcnalloc.c deleted file mode 100644 index eda9972e61599..0000000000000 --- a/fs/ntfs/lcnalloc.c +++ /dev/null @@ -1,1000 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * lcnalloc.c - Cluster (de)allocation code. Part of the Linux-NTFS project. - * - * Copyright (c) 2004-2005 Anton Altaparmakov - */ - -#ifdef NTFS_RW - -#include - -#include "lcnalloc.h" -#include "debug.h" -#include "bitmap.h" -#include "inode.h" -#include "volume.h" -#include "attrib.h" -#include "malloc.h" -#include "aops.h" -#include "ntfs.h" - -/** - * ntfs_cluster_free_from_rl_nolock - free clusters from runlist - * @vol: mounted ntfs volume on which to free the clusters - * @rl: runlist describing the clusters to free - * - * Free all the clusters described by the runlist @rl on the volume @vol. In - * the case of an error being returned, at least some of the clusters were not - * freed. - * - * Return 0 on success and -errno on error. - * - * Locking: - The volume lcn bitmap must be locked for writing on entry and is - * left locked on return. - */ -int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol, - const runlist_element *rl) -{ - struct inode *lcnbmp_vi = vol->lcnbmp_ino; - int ret = 0; - - ntfs_debug("Entering."); - if (!rl) - return 0; - for (; rl->length; rl++) { - int err; - - if (rl->lcn < 0) - continue; - err = ntfs_bitmap_clear_run(lcnbmp_vi, rl->lcn, rl->length); - if (unlikely(err && (!ret || ret == -ENOMEM) && ret != err)) - ret = err; - } - ntfs_debug("Done."); - return ret; -} - -/** - * ntfs_cluster_alloc - allocate clusters on an ntfs volume - * @vol: mounted ntfs volume on which to allocate the clusters - * @start_vcn: vcn to use for the first allocated cluster - * @count: number of clusters to allocate - * @start_lcn: starting lcn at which to allocate the clusters (or -1 if none) - * @zone: zone from which to allocate the clusters - * @is_extension: if 'true', this is an attribute extension - * - * Allocate @count clusters preferably starting at cluster @start_lcn or at the - * current allocator position if @start_lcn is -1, on the mounted ntfs volume - * @vol. @zone is either DATA_ZONE for allocation of normal clusters or - * MFT_ZONE for allocation of clusters for the master file table, i.e. the - * $MFT/$DATA attribute. - * - * @start_vcn specifies the vcn of the first allocated cluster. This makes - * merging the resulting runlist with the old runlist easier. - * - * If @is_extension is 'true', the caller is allocating clusters to extend an - * attribute and if it is 'false', the caller is allocating clusters to fill a - * hole in an attribute. Practically the difference is that if @is_extension - * is 'true' the returned runlist will be terminated with LCN_ENOENT and if - * @is_extension is 'false' the runlist will be terminated with - * LCN_RL_NOT_MAPPED. - * - * You need to check the return value with IS_ERR(). If this is false, the - * function was successful and the return value is a runlist describing the - * allocated cluster(s). If IS_ERR() is true, the function failed and - * PTR_ERR() gives you the error code. 
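 *
 * Editor's illustration (not from the original file): a minimal,
 * hypothetical call sequence showing the IS_ERR()/PTR_ERR() convention
 * described above, allocating @nr clusters at the allocator's current
 * position in the data zone for an attribute extension:
 *
 *	runlist_element *rl;
 *
 *	rl = ntfs_cluster_alloc(vol, 0, nr, -1, DATA_ZONE, true);
 *	if (IS_ERR(rl))
 *		return PTR_ERR(rl);
 *	// ... merge @rl into the attribute's runlist; the last element
 *	// is the LCN_ENOENT terminator because @is_extension was true.
 *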
- * - * Notes on the allocation algorithm - * ================================= - * - * There are two data zones. First is the area between the end of the mft zone - * and the end of the volume, and second is the area between the start of the - * volume and the start of the mft zone. On unmodified/standard NTFS 1.x - * volumes, the second data zone does not exist due to the mft zone being - * expanded to cover the start of the volume in order to reserve space for the - * mft bitmap attribute. - * - * This is not the prettiest function but the complexity stems from the need of - * implementing the mft vs data zoned approach and from the fact that we have - * access to the lcn bitmap in portions of up to 8192 bytes at a time, so we - * need to cope with crossing over boundaries of two buffers. Further, the - * fact that the allocator allows for caller supplied hints as to the location - * of where allocation should begin and the fact that the allocator keeps track - * of where in the data zones the next natural allocation should occur, - * contribute to the complexity of the function. But it should all be - * worthwhile, because this allocator should: 1) be a full implementation of - * the MFT zone approach used by Windows NT, 2) cause reduction in - * fragmentation, and 3) be speedy in allocations (the code is not optimized - * for speed, but the algorithm is, so further speed improvements are probably - * possible). - * - * FIXME: We should be monitoring cluster allocation and increment the MFT zone - * size dynamically but this is something for the future. We will just cause - * heavier fragmentation by not doing it and I am not even sure Windows would - * grow the MFT zone dynamically, so it might even be correct not to do this. - * The overhead in doing dynamic MFT zone expansion would be very large and - * unlikely worth the effort. (AIA) - * - * TODO: I have added in double the required zone position pointer wrap around - * logic which can be optimized to having only one of the two logic sets. - * However, having the double logic will work fine, but if we have only one of - * the sets and we get it wrong somewhere, then we get into trouble, so - * removing the duplicate logic requires _very_ careful consideration of _all_ - * possible code paths. So at least for now, I am leaving the double logic - - * better safe than sorry... (AIA) - * - * Locking: - The volume lcn bitmap must be unlocked on entry and is unlocked - * on return. - * - This function takes the volume lcn bitmap lock for writing and - * modifies the bitmap contents. - */ -runlist_element *ntfs_cluster_alloc(ntfs_volume *vol, const VCN start_vcn, - const s64 count, const LCN start_lcn, - const NTFS_CLUSTER_ALLOCATION_ZONES zone, - const bool is_extension) -{ - LCN zone_start, zone_end, bmp_pos, bmp_initial_pos, last_read_pos, lcn; - LCN prev_lcn = 0, prev_run_len = 0, mft_zone_size; - s64 clusters; - loff_t i_size; - struct inode *lcnbmp_vi; - runlist_element *rl = NULL; - struct address_space *mapping; - struct page *page = NULL; - u8 *buf, *byte; - int err = 0, rlpos, rlsize, buf_size; - u8 pass, done_zones, search_zone, need_writeback = 0, bit; - - ntfs_debug("Entering for start_vcn 0x%llx, count 0x%llx, start_lcn " - "0x%llx, zone %s_ZONE.", (unsigned long long)start_vcn, - (unsigned long long)count, - (unsigned long long)start_lcn, - zone == MFT_ZONE ? 
"MFT" : "DATA"); - BUG_ON(!vol); - lcnbmp_vi = vol->lcnbmp_ino; - BUG_ON(!lcnbmp_vi); - BUG_ON(start_vcn < 0); - BUG_ON(count < 0); - BUG_ON(start_lcn < -1); - BUG_ON(zone < FIRST_ZONE); - BUG_ON(zone > LAST_ZONE); - - /* Return NULL if @count is zero. */ - if (!count) - return NULL; - /* Take the lcnbmp lock for writing. */ - down_write(&vol->lcnbmp_lock); - /* - * If no specific @start_lcn was requested, use the current data zone - * position, otherwise use the requested @start_lcn but make sure it - * lies outside the mft zone. Also set done_zones to 0 (no zones done) - * and pass depending on whether we are starting inside a zone (1) or - * at the beginning of a zone (2). If requesting from the MFT_ZONE, - * we either start at the current position within the mft zone or at - * the specified position. If the latter is out of bounds then we start - * at the beginning of the MFT_ZONE. - */ - done_zones = 0; - pass = 1; - /* - * zone_start and zone_end are the current search range. search_zone - * is 1 for mft zone, 2 for data zone 1 (end of mft zone till end of - * volume) and 4 for data zone 2 (start of volume till start of mft - * zone). - */ - zone_start = start_lcn; - if (zone_start < 0) { - if (zone == DATA_ZONE) - zone_start = vol->data1_zone_pos; - else - zone_start = vol->mft_zone_pos; - if (!zone_start) { - /* - * Zone starts at beginning of volume which means a - * single pass is sufficient. - */ - pass = 2; - } - } else if (zone == DATA_ZONE && zone_start >= vol->mft_zone_start && - zone_start < vol->mft_zone_end) { - zone_start = vol->mft_zone_end; - /* - * Starting at beginning of data1_zone which means a single - * pass in this zone is sufficient. - */ - pass = 2; - } else if (zone == MFT_ZONE && (zone_start < vol->mft_zone_start || - zone_start >= vol->mft_zone_end)) { - zone_start = vol->mft_lcn; - if (!vol->mft_zone_end) - zone_start = 0; - /* - * Starting at beginning of volume which means a single pass - * is sufficient. - */ - pass = 2; - } - if (zone == MFT_ZONE) { - zone_end = vol->mft_zone_end; - search_zone = 1; - } else /* if (zone == DATA_ZONE) */ { - /* Skip searching the mft zone. */ - done_zones |= 1; - if (zone_start >= vol->mft_zone_end) { - zone_end = vol->nr_clusters; - search_zone = 2; - } else { - zone_end = vol->mft_zone_start; - search_zone = 4; - } - } - /* - * bmp_pos is the current bit position inside the bitmap. We use - * bmp_initial_pos to determine whether or not to do a zone switch. - */ - bmp_pos = bmp_initial_pos = zone_start; - - /* Loop until all clusters are allocated, i.e. clusters == 0. */ - clusters = count; - rlpos = rlsize = 0; - mapping = lcnbmp_vi->i_mapping; - i_size = i_size_read(lcnbmp_vi); - while (1) { - ntfs_debug("Start of outer while loop: done_zones 0x%x, " - "search_zone %i, pass %i, zone_start 0x%llx, " - "zone_end 0x%llx, bmp_initial_pos 0x%llx, " - "bmp_pos 0x%llx, rlpos %i, rlsize %i.", - done_zones, search_zone, pass, - (unsigned long long)zone_start, - (unsigned long long)zone_end, - (unsigned long long)bmp_initial_pos, - (unsigned long long)bmp_pos, rlpos, rlsize); - /* Loop until we run out of free clusters. */ - last_read_pos = bmp_pos >> 3; - ntfs_debug("last_read_pos 0x%llx.", - (unsigned long long)last_read_pos); - if (last_read_pos > i_size) { - ntfs_debug("End of attribute reached. 
" - "Skipping to zone_pass_done."); - goto zone_pass_done; - } - if (likely(page)) { - if (need_writeback) { - ntfs_debug("Marking page dirty."); - flush_dcache_page(page); - set_page_dirty(page); - need_writeback = 0; - } - ntfs_unmap_page(page); - } - page = ntfs_map_page(mapping, last_read_pos >> - PAGE_SHIFT); - if (IS_ERR(page)) { - err = PTR_ERR(page); - ntfs_error(vol->sb, "Failed to map page."); - goto out; - } - buf_size = last_read_pos & ~PAGE_MASK; - buf = page_address(page) + buf_size; - buf_size = PAGE_SIZE - buf_size; - if (unlikely(last_read_pos + buf_size > i_size)) - buf_size = i_size - last_read_pos; - buf_size <<= 3; - lcn = bmp_pos & 7; - bmp_pos &= ~(LCN)7; - ntfs_debug("Before inner while loop: buf_size %i, lcn 0x%llx, " - "bmp_pos 0x%llx, need_writeback %i.", buf_size, - (unsigned long long)lcn, - (unsigned long long)bmp_pos, need_writeback); - while (lcn < buf_size && lcn + bmp_pos < zone_end) { - byte = buf + (lcn >> 3); - ntfs_debug("In inner while loop: buf_size %i, " - "lcn 0x%llx, bmp_pos 0x%llx, " - "need_writeback %i, byte ofs 0x%x, " - "*byte 0x%x.", buf_size, - (unsigned long long)lcn, - (unsigned long long)bmp_pos, - need_writeback, - (unsigned int)(lcn >> 3), - (unsigned int)*byte); - /* Skip full bytes. */ - if (*byte == 0xff) { - lcn = (lcn + 8) & ~(LCN)7; - ntfs_debug("Continuing while loop 1."); - continue; - } - bit = 1 << (lcn & 7); - ntfs_debug("bit 0x%x.", bit); - /* If the bit is already set, go onto the next one. */ - if (*byte & bit) { - lcn++; - ntfs_debug("Continuing while loop 2."); - continue; - } - /* - * Allocate more memory if needed, including space for - * the terminator element. - * ntfs_malloc_nofs() operates on whole pages only. - */ - if ((rlpos + 2) * sizeof(*rl) > rlsize) { - runlist_element *rl2; - - ntfs_debug("Reallocating memory."); - if (!rl) - ntfs_debug("First free bit is at LCN " - "0x%llx.", - (unsigned long long) - (lcn + bmp_pos)); - rl2 = ntfs_malloc_nofs(rlsize + (int)PAGE_SIZE); - if (unlikely(!rl2)) { - err = -ENOMEM; - ntfs_error(vol->sb, "Failed to " - "allocate memory."); - goto out; - } - memcpy(rl2, rl, rlsize); - ntfs_free(rl); - rl = rl2; - rlsize += PAGE_SIZE; - ntfs_debug("Reallocated memory, rlsize 0x%x.", - rlsize); - } - /* Allocate the bitmap bit. */ - *byte |= bit; - /* We need to write this bitmap page to disk. */ - need_writeback = 1; - ntfs_debug("*byte 0x%x, need_writeback is set.", - (unsigned int)*byte); - /* - * Coalesce with previous run if adjacent LCNs. - * Otherwise, append a new run. 
- */ - ntfs_debug("Adding run (lcn 0x%llx, len 0x%llx), " - "prev_lcn 0x%llx, lcn 0x%llx, " - "bmp_pos 0x%llx, prev_run_len 0x%llx, " - "rlpos %i.", - (unsigned long long)(lcn + bmp_pos), - 1ULL, (unsigned long long)prev_lcn, - (unsigned long long)lcn, - (unsigned long long)bmp_pos, - (unsigned long long)prev_run_len, - rlpos); - if (prev_lcn == lcn + bmp_pos - prev_run_len && rlpos) { - ntfs_debug("Coalescing to run (lcn 0x%llx, " - "len 0x%llx).", - (unsigned long long) - rl[rlpos - 1].lcn, - (unsigned long long) - rl[rlpos - 1].length); - rl[rlpos - 1].length = ++prev_run_len; - ntfs_debug("Run now (lcn 0x%llx, len 0x%llx), " - "prev_run_len 0x%llx.", - (unsigned long long) - rl[rlpos - 1].lcn, - (unsigned long long) - rl[rlpos - 1].length, - (unsigned long long) - prev_run_len); - } else { - if (likely(rlpos)) { - ntfs_debug("Adding new run, (previous " - "run lcn 0x%llx, " - "len 0x%llx).", - (unsigned long long) - rl[rlpos - 1].lcn, - (unsigned long long) - rl[rlpos - 1].length); - rl[rlpos].vcn = rl[rlpos - 1].vcn + - prev_run_len; - } else { - ntfs_debug("Adding new run, is first " - "run."); - rl[rlpos].vcn = start_vcn; - } - rl[rlpos].lcn = prev_lcn = lcn + bmp_pos; - rl[rlpos].length = prev_run_len = 1; - rlpos++; - } - /* Done? */ - if (!--clusters) { - LCN tc; - /* - * Update the current zone position. Positions - * of already scanned zones have been updated - * during the respective zone switches. - */ - tc = lcn + bmp_pos + 1; - ntfs_debug("Done. Updating current zone " - "position, tc 0x%llx, " - "search_zone %i.", - (unsigned long long)tc, - search_zone); - switch (search_zone) { - case 1: - ntfs_debug("Before checks, " - "vol->mft_zone_pos " - "0x%llx.", - (unsigned long long) - vol->mft_zone_pos); - if (tc >= vol->mft_zone_end) { - vol->mft_zone_pos = - vol->mft_lcn; - if (!vol->mft_zone_end) - vol->mft_zone_pos = 0; - } else if ((bmp_initial_pos >= - vol->mft_zone_pos || - tc > vol->mft_zone_pos) - && tc >= vol->mft_lcn) - vol->mft_zone_pos = tc; - ntfs_debug("After checks, " - "vol->mft_zone_pos " - "0x%llx.", - (unsigned long long) - vol->mft_zone_pos); - break; - case 2: - ntfs_debug("Before checks, " - "vol->data1_zone_pos " - "0x%llx.", - (unsigned long long) - vol->data1_zone_pos); - if (tc >= vol->nr_clusters) - vol->data1_zone_pos = - vol->mft_zone_end; - else if ((bmp_initial_pos >= - vol->data1_zone_pos || - tc > vol->data1_zone_pos) - && tc >= vol->mft_zone_end) - vol->data1_zone_pos = tc; - ntfs_debug("After checks, " - "vol->data1_zone_pos " - "0x%llx.", - (unsigned long long) - vol->data1_zone_pos); - break; - case 4: - ntfs_debug("Before checks, " - "vol->data2_zone_pos " - "0x%llx.", - (unsigned long long) - vol->data2_zone_pos); - if (tc >= vol->mft_zone_start) - vol->data2_zone_pos = 0; - else if (bmp_initial_pos >= - vol->data2_zone_pos || - tc > vol->data2_zone_pos) - vol->data2_zone_pos = tc; - ntfs_debug("After checks, " - "vol->data2_zone_pos " - "0x%llx.", - (unsigned long long) - vol->data2_zone_pos); - break; - default: - BUG(); - } - ntfs_debug("Finished. 
Going to out."); - goto out; - } - lcn++; - } - bmp_pos += buf_size; - ntfs_debug("After inner while loop: buf_size 0x%x, lcn " - "0x%llx, bmp_pos 0x%llx, need_writeback %i.", - buf_size, (unsigned long long)lcn, - (unsigned long long)bmp_pos, need_writeback); - if (bmp_pos < zone_end) { - ntfs_debug("Continuing outer while loop, " - "bmp_pos 0x%llx, zone_end 0x%llx.", - (unsigned long long)bmp_pos, - (unsigned long long)zone_end); - continue; - } -zone_pass_done: /* Finished with the current zone pass. */ - ntfs_debug("At zone_pass_done, pass %i.", pass); - if (pass == 1) { - /* - * Now do pass 2, scanning the first part of the zone - * we omitted in pass 1. - */ - pass = 2; - zone_end = zone_start; - switch (search_zone) { - case 1: /* mft_zone */ - zone_start = vol->mft_zone_start; - break; - case 2: /* data1_zone */ - zone_start = vol->mft_zone_end; - break; - case 4: /* data2_zone */ - zone_start = 0; - break; - default: - BUG(); - } - /* Sanity check. */ - if (zone_end < zone_start) - zone_end = zone_start; - bmp_pos = zone_start; - ntfs_debug("Continuing outer while loop, pass 2, " - "zone_start 0x%llx, zone_end 0x%llx, " - "bmp_pos 0x%llx.", - (unsigned long long)zone_start, - (unsigned long long)zone_end, - (unsigned long long)bmp_pos); - continue; - } /* pass == 2 */ -done_zones_check: - ntfs_debug("At done_zones_check, search_zone %i, done_zones " - "before 0x%x, done_zones after 0x%x.", - search_zone, done_zones, - done_zones | search_zone); - done_zones |= search_zone; - if (done_zones < 7) { - ntfs_debug("Switching zone."); - /* Now switch to the next zone we haven't done yet. */ - pass = 1; - switch (search_zone) { - case 1: - ntfs_debug("Switching from mft zone to data1 " - "zone."); - /* Update mft zone position. */ - if (rlpos) { - LCN tc; - - ntfs_debug("Before checks, " - "vol->mft_zone_pos " - "0x%llx.", - (unsigned long long) - vol->mft_zone_pos); - tc = rl[rlpos - 1].lcn + - rl[rlpos - 1].length; - if (tc >= vol->mft_zone_end) { - vol->mft_zone_pos = - vol->mft_lcn; - if (!vol->mft_zone_end) - vol->mft_zone_pos = 0; - } else if ((bmp_initial_pos >= - vol->mft_zone_pos || - tc > vol->mft_zone_pos) - && tc >= vol->mft_lcn) - vol->mft_zone_pos = tc; - ntfs_debug("After checks, " - "vol->mft_zone_pos " - "0x%llx.", - (unsigned long long) - vol->mft_zone_pos); - } - /* Switch from mft zone to data1 zone. */ -switch_to_data1_zone: search_zone = 2; - zone_start = bmp_initial_pos = - vol->data1_zone_pos; - zone_end = vol->nr_clusters; - if (zone_start == vol->mft_zone_end) - pass = 2; - if (zone_start >= zone_end) { - vol->data1_zone_pos = zone_start = - vol->mft_zone_end; - pass = 2; - } - break; - case 2: - ntfs_debug("Switching from data1 zone to " - "data2 zone."); - /* Update data1 zone position. */ - if (rlpos) { - LCN tc; - - ntfs_debug("Before checks, " - "vol->data1_zone_pos " - "0x%llx.", - (unsigned long long) - vol->data1_zone_pos); - tc = rl[rlpos - 1].lcn + - rl[rlpos - 1].length; - if (tc >= vol->nr_clusters) - vol->data1_zone_pos = - vol->mft_zone_end; - else if ((bmp_initial_pos >= - vol->data1_zone_pos || - tc > vol->data1_zone_pos) - && tc >= vol->mft_zone_end) - vol->data1_zone_pos = tc; - ntfs_debug("After checks, " - "vol->data1_zone_pos " - "0x%llx.", - (unsigned long long) - vol->data1_zone_pos); - } - /* Switch from data1 zone to data2 zone. 
*/
-			search_zone = 4;
-			zone_start = bmp_initial_pos =
-					vol->data2_zone_pos;
-			zone_end = vol->mft_zone_start;
-			if (!zone_start)
-				pass = 2;
-			if (zone_start >= zone_end) {
-				vol->data2_zone_pos = zone_start =
-						bmp_initial_pos = 0;
-				pass = 2;
-			}
-			break;
-		case 4:
-			ntfs_debug("Switching from data2 zone to "
-					"data1 zone.");
-			/* Update data2 zone position. */
-			if (rlpos) {
-				LCN tc;
-
-				ntfs_debug("Before checks, "
-						"vol->data2_zone_pos "
-						"0x%llx.",
-						(unsigned long long)
-						vol->data2_zone_pos);
-				tc = rl[rlpos - 1].lcn +
-						rl[rlpos - 1].length;
-				if (tc >= vol->mft_zone_start)
-					vol->data2_zone_pos = 0;
-				else if (bmp_initial_pos >=
-						vol->data2_zone_pos ||
-						tc > vol->data2_zone_pos)
-					vol->data2_zone_pos = tc;
-				ntfs_debug("After checks, "
-						"vol->data2_zone_pos "
-						"0x%llx.",
-						(unsigned long long)
-						vol->data2_zone_pos);
-			}
-			/* Switch from data2 zone to data1 zone. */
-			goto switch_to_data1_zone;
-		default:
-			BUG();
-		}
-		ntfs_debug("After zone switch, search_zone %i, "
-				"pass %i, bmp_initial_pos 0x%llx, "
-				"zone_start 0x%llx, zone_end 0x%llx.",
-				search_zone, pass,
-				(unsigned long long)bmp_initial_pos,
-				(unsigned long long)zone_start,
-				(unsigned long long)zone_end);
-		bmp_pos = zone_start;
-		if (zone_start == zone_end) {
-			ntfs_debug("Empty zone, going to "
-					"done_zones_check.");
-			/* Empty zone. Don't bother searching it. */
-			goto done_zones_check;
-		}
-		ntfs_debug("Continuing outer while loop.");
-		continue;
-	} /* done_zones == 7 */
-	ntfs_debug("All zones are finished.");
-	/*
-	 * All zones are finished! If DATA_ZONE, shrink mft zone. If
-	 * MFT_ZONE, we have really run out of space.
-	 */
-	mft_zone_size = vol->mft_zone_end - vol->mft_zone_start;
-	ntfs_debug("vol->mft_zone_start 0x%llx, vol->mft_zone_end "
-			"0x%llx, mft_zone_size 0x%llx.",
-			(unsigned long long)vol->mft_zone_start,
-			(unsigned long long)vol->mft_zone_end,
-			(unsigned long long)mft_zone_size);
-	if (zone == MFT_ZONE || mft_zone_size <= 0) {
-		ntfs_debug("No free clusters left, going to out.");
-		/* Really no more space left on device. */
-		err = -ENOSPC;
-		goto out;
-	} /* zone == DATA_ZONE && mft_zone_size > 0 */
-	ntfs_debug("Shrinking mft zone.");
-	zone_end = vol->mft_zone_end;
-	mft_zone_size >>= 1;
-	if (mft_zone_size > 0)
-		vol->mft_zone_end = vol->mft_zone_start + mft_zone_size;
-	else /* mft zone and data2 zone no longer exist. */
-		vol->data2_zone_pos = vol->mft_zone_start =
-				vol->mft_zone_end = 0;
-	if (vol->mft_zone_pos >= vol->mft_zone_end) {
-		vol->mft_zone_pos = vol->mft_lcn;
-		if (!vol->mft_zone_end)
-			vol->mft_zone_pos = 0;
-	}
-	bmp_pos = zone_start = bmp_initial_pos =
-			vol->data1_zone_pos = vol->mft_zone_end;
-	search_zone = 2;
-	pass = 2;
-	done_zones &= ~2;
-	ntfs_debug("After shrinking mft zone, mft_zone_size 0x%llx, "
-			"vol->mft_zone_start 0x%llx, "
-			"vol->mft_zone_end 0x%llx, "
-			"vol->mft_zone_pos 0x%llx, search_zone 2, "
-			"pass 2, done_zones 0x%x, zone_start 0x%llx, "
-			"zone_end 0x%llx, vol->data1_zone_pos 0x%llx, "
-			"continuing outer while loop.",
-			(unsigned long long)mft_zone_size,
-			(unsigned long long)vol->mft_zone_start,
-			(unsigned long long)vol->mft_zone_end,
-			(unsigned long long)vol->mft_zone_pos,
-			done_zones, (unsigned long long)zone_start,
-			(unsigned long long)zone_end,
-			(unsigned long long)vol->data1_zone_pos);
-	}
-	ntfs_debug("After outer while loop.");
-out:
-	ntfs_debug("At out.");
-	/* Add runlist terminator element. */
-	if (likely(rl)) {
-		rl[rlpos].vcn = rl[rlpos - 1].vcn + rl[rlpos - 1].length;
-		rl[rlpos].lcn = is_extension ?
LCN_ENOENT : LCN_RL_NOT_MAPPED; - rl[rlpos].length = 0; - } - if (likely(page && !IS_ERR(page))) { - if (need_writeback) { - ntfs_debug("Marking page dirty."); - flush_dcache_page(page); - set_page_dirty(page); - need_writeback = 0; - } - ntfs_unmap_page(page); - } - if (likely(!err)) { - up_write(&vol->lcnbmp_lock); - ntfs_debug("Done."); - return rl; - } - ntfs_error(vol->sb, "Failed to allocate clusters, aborting " - "(error %i).", err); - if (rl) { - int err2; - - if (err == -ENOSPC) - ntfs_debug("Not enough space to complete allocation, " - "err -ENOSPC, first free lcn 0x%llx, " - "could allocate up to 0x%llx " - "clusters.", - (unsigned long long)rl[0].lcn, - (unsigned long long)(count - clusters)); - /* Deallocate all allocated clusters. */ - ntfs_debug("Attempting rollback..."); - err2 = ntfs_cluster_free_from_rl_nolock(vol, rl); - if (err2) { - ntfs_error(vol->sb, "Failed to rollback (error %i). " - "Leaving inconsistent metadata! " - "Unmount and run chkdsk.", err2); - NVolSetErrors(vol); - } - /* Free the runlist. */ - ntfs_free(rl); - } else if (err == -ENOSPC) - ntfs_debug("No space left at all, err = -ENOSPC, first free " - "lcn = 0x%llx.", - (long long)vol->data1_zone_pos); - up_write(&vol->lcnbmp_lock); - return ERR_PTR(err); -} - -/** - * __ntfs_cluster_free - free clusters on an ntfs volume - * @ni: ntfs inode whose runlist describes the clusters to free - * @start_vcn: vcn in the runlist of @ni at which to start freeing clusters - * @count: number of clusters to free or -1 for all clusters - * @ctx: active attribute search context if present or NULL if not - * @is_rollback: true if this is a rollback operation - * - * Free @count clusters starting at the cluster @start_vcn in the runlist - * described by the vfs inode @ni. - * - * If @count is -1, all clusters from @start_vcn to the end of the runlist are - * deallocated. Thus, to completely free all clusters in a runlist, use - * @start_vcn = 0 and @count = -1. - * - * If @ctx is specified, it is an active search context of @ni and its base mft - * record. This is needed when __ntfs_cluster_free() encounters unmapped - * runlist fragments and allows their mapping. If you do not have the mft - * record mapped, you can specify @ctx as NULL and __ntfs_cluster_free() will - * perform the necessary mapping and unmapping. - * - * Note, __ntfs_cluster_free() saves the state of @ctx on entry and restores it - * before returning. Thus, @ctx will be left pointing to the same attribute on - * return as on entry. However, the actual pointers in @ctx may point to - * different memory locations on return, so you must remember to reset any - * cached pointers from the @ctx, i.e. after the call to __ntfs_cluster_free(), - * you will probably want to do: - * m = ctx->mrec; - * a = ctx->attr; - * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that - * you cache ctx->mrec in a variable @m of type MFT_RECORD *. - * - * @is_rollback should always be 'false', it is for internal use to rollback - * errors. You probably want to use ntfs_cluster_free() instead. - * - * Note, __ntfs_cluster_free() does not modify the runlist, so you have to - * remove from the runlist or mark sparse the freed runs later. - * - * Return the number of deallocated clusters (not counting sparse ones) on - * success and -errno on error. - * - * WARNING: If @ctx is supplied, regardless of whether success or failure is - * returned, you need to check IS_ERR(@ctx->mrec) and if 'true' the @ctx - * is no longer valid, i.e. 
you need to either call
- *	    ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it.
- *	    In that case PTR_ERR(@ctx->mrec) will give you the error code for
- *	    why the mapping of the old inode failed.
- *
- * Locking: - The runlist described by @ni must be locked for writing on entry
- *	      and is locked on return. Note the runlist may be modified when
- *	      needed runlist fragments need to be mapped.
- *	    - The volume lcn bitmap must be unlocked on entry and is unlocked
- *	      on return.
- *	    - This function takes the volume lcn bitmap lock for writing and
- *	      modifies the bitmap contents.
- *	    - If @ctx is NULL, the base mft record of @ni must not be mapped on
- *	      entry and it will be left unmapped on return.
- *	    - If @ctx is not NULL, the base mft record must be mapped on entry
- *	      and it will be left mapped on return.
- */
-s64 __ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn, s64 count,
-		ntfs_attr_search_ctx *ctx, const bool is_rollback)
-{
-	s64 delta, to_free, total_freed, real_freed;
-	ntfs_volume *vol;
-	struct inode *lcnbmp_vi;
-	runlist_element *rl;
-	int err;
-
-	BUG_ON(!ni);
-	ntfs_debug("Entering for i_ino 0x%lx, start_vcn 0x%llx, count "
-			"0x%llx.%s", ni->mft_no, (unsigned long long)start_vcn,
-			(unsigned long long)count,
-			is_rollback ? " (rollback)" : "");
-	vol = ni->vol;
-	lcnbmp_vi = vol->lcnbmp_ino;
-	BUG_ON(!lcnbmp_vi);
-	BUG_ON(start_vcn < 0);
-	BUG_ON(count < -1);
-	/*
-	 * Lock the lcn bitmap for writing but only if not rolling back. We
-	 * must hold the lock all the way including through rollback otherwise
-	 * rollback is not possible because once we have cleared a bit and
-	 * dropped the lock, anyone could have set the bit again, thus
-	 * allocating the cluster for another use.
-	 */
-	if (likely(!is_rollback))
-		down_write(&vol->lcnbmp_lock);
-
-	total_freed = real_freed = 0;
-
-	rl = ntfs_attr_find_vcn_nolock(ni, start_vcn, ctx);
-	if (IS_ERR(rl)) {
-		if (!is_rollback)
-			ntfs_error(vol->sb, "Failed to find first runlist "
-					"element (error %li), aborting.",
-					PTR_ERR(rl));
-		err = PTR_ERR(rl);
-		goto err_out;
-	}
-	if (unlikely(rl->lcn < LCN_HOLE)) {
-		if (!is_rollback)
-			ntfs_error(vol->sb, "First runlist element has "
-					"invalid lcn, aborting.");
-		err = -EIO;
-		goto err_out;
-	}
-	/* Find the starting cluster inside the run that needs freeing. */
-	delta = start_vcn - rl->vcn;
-
-	/* The number of clusters in this run that need freeing. */
-	to_free = rl->length - delta;
-	if (count >= 0 && to_free > count)
-		to_free = count;
-
-	if (likely(rl->lcn >= 0)) {
-		/* Do the actual freeing of the clusters in this run. */
-		err = ntfs_bitmap_set_bits_in_run(lcnbmp_vi, rl->lcn + delta,
-				to_free, likely(!is_rollback) ? 0 : 1);
-		if (unlikely(err)) {
-			if (!is_rollback)
-				ntfs_error(vol->sb, "Failed to clear first run "
-						"(error %i), aborting.", err);
-			goto err_out;
-		}
-		/* We have freed @to_free real clusters. */
-		real_freed = to_free;
-	}
-	/* Go to the next run and adjust the number of clusters left to free. */
-	++rl;
-	if (count >= 0)
-		count -= to_free;
-
-	/* Keep track of the total "freed" clusters, including sparse ones. */
-	total_freed = to_free;
-	/*
-	 * Loop over the remaining runs, using @count as a capping value, and
-	 * free them.
-	 */
-	for (; rl->length && count != 0; ++rl) {
-		if (unlikely(rl->lcn < LCN_HOLE)) {
-			VCN vcn;
-
-			/* Attempt to map runlist.
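-			   This fragment of the runlist has not been mapped
-			   in yet; map it now so the lcns of the remaining
-			   runs can be examined.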
*/ - vcn = rl->vcn; - rl = ntfs_attr_find_vcn_nolock(ni, vcn, ctx); - if (IS_ERR(rl)) { - err = PTR_ERR(rl); - if (!is_rollback) - ntfs_error(vol->sb, "Failed to map " - "runlist fragment or " - "failed to find " - "subsequent runlist " - "element."); - goto err_out; - } - if (unlikely(rl->lcn < LCN_HOLE)) { - if (!is_rollback) - ntfs_error(vol->sb, "Runlist element " - "has invalid lcn " - "(0x%llx).", - (unsigned long long) - rl->lcn); - err = -EIO; - goto err_out; - } - } - /* The number of clusters in this run that need freeing. */ - to_free = rl->length; - if (count >= 0 && to_free > count) - to_free = count; - - if (likely(rl->lcn >= 0)) { - /* Do the actual freeing of the clusters in the run. */ - err = ntfs_bitmap_set_bits_in_run(lcnbmp_vi, rl->lcn, - to_free, likely(!is_rollback) ? 0 : 1); - if (unlikely(err)) { - if (!is_rollback) - ntfs_error(vol->sb, "Failed to clear " - "subsequent run."); - goto err_out; - } - /* We have freed @to_free real clusters. */ - real_freed += to_free; - } - /* Adjust the number of clusters left to free. */ - if (count >= 0) - count -= to_free; - - /* Update the total done clusters. */ - total_freed += to_free; - } - if (likely(!is_rollback)) - up_write(&vol->lcnbmp_lock); - - BUG_ON(count > 0); - - /* We are done. Return the number of actually freed clusters. */ - ntfs_debug("Done."); - return real_freed; -err_out: - if (is_rollback) - return err; - /* If no real clusters were freed, no need to rollback. */ - if (!real_freed) { - up_write(&vol->lcnbmp_lock); - return err; - } - /* - * Attempt to rollback and if that succeeds just return the error code. - * If rollback fails, set the volume errors flag, emit an error - * message, and return the error code. - */ - delta = __ntfs_cluster_free(ni, start_vcn, total_freed, ctx, true); - if (delta < 0) { - ntfs_error(vol->sb, "Failed to rollback (error %i). Leaving " - "inconsistent metadata! Unmount and run " - "chkdsk.", (int)delta); - NVolSetErrors(vol); - } - up_write(&vol->lcnbmp_lock); - ntfs_error(vol->sb, "Aborting (error %i).", err); - return err; -} - -#endif /* NTFS_RW */ diff --git a/fs/ntfs/lcnalloc.h b/fs/ntfs/lcnalloc.h deleted file mode 100644 index 1589a6d8434b0..0000000000000 --- a/fs/ntfs/lcnalloc.h +++ /dev/null @@ -1,131 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * lcnalloc.h - Exports for NTFS kernel cluster (de)allocation. Part of the - * Linux-NTFS project. - * - * Copyright (c) 2004-2005 Anton Altaparmakov - */ - -#ifndef _LINUX_NTFS_LCNALLOC_H -#define _LINUX_NTFS_LCNALLOC_H - -#ifdef NTFS_RW - -#include - -#include "attrib.h" -#include "types.h" -#include "inode.h" -#include "runlist.h" -#include "volume.h" - -typedef enum { - FIRST_ZONE = 0, /* For sanity checking. */ - MFT_ZONE = 0, /* Allocate from $MFT zone. */ - DATA_ZONE = 1, /* Allocate from $DATA zone. */ - LAST_ZONE = 1, /* For sanity checking. 
*/ -} NTFS_CLUSTER_ALLOCATION_ZONES; - -extern runlist_element *ntfs_cluster_alloc(ntfs_volume *vol, - const VCN start_vcn, const s64 count, const LCN start_lcn, - const NTFS_CLUSTER_ALLOCATION_ZONES zone, - const bool is_extension); - -extern s64 __ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn, - s64 count, ntfs_attr_search_ctx *ctx, const bool is_rollback); - -/** - * ntfs_cluster_free - free clusters on an ntfs volume - * @ni: ntfs inode whose runlist describes the clusters to free - * @start_vcn: vcn in the runlist of @ni at which to start freeing clusters - * @count: number of clusters to free or -1 for all clusters - * @ctx: active attribute search context if present or NULL if not - * - * Free @count clusters starting at the cluster @start_vcn in the runlist - * described by the ntfs inode @ni. - * - * If @count is -1, all clusters from @start_vcn to the end of the runlist are - * deallocated. Thus, to completely free all clusters in a runlist, use - * @start_vcn = 0 and @count = -1. - * - * If @ctx is specified, it is an active search context of @ni and its base mft - * record. This is needed when ntfs_cluster_free() encounters unmapped runlist - * fragments and allows their mapping. If you do not have the mft record - * mapped, you can specify @ctx as NULL and ntfs_cluster_free() will perform - * the necessary mapping and unmapping. - * - * Note, ntfs_cluster_free() saves the state of @ctx on entry and restores it - * before returning. Thus, @ctx will be left pointing to the same attribute on - * return as on entry. However, the actual pointers in @ctx may point to - * different memory locations on return, so you must remember to reset any - * cached pointers from the @ctx, i.e. after the call to ntfs_cluster_free(), - * you will probably want to do: - * m = ctx->mrec; - * a = ctx->attr; - * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that - * you cache ctx->mrec in a variable @m of type MFT_RECORD *. - * - * Note, ntfs_cluster_free() does not modify the runlist, so you have to remove - * from the runlist or mark sparse the freed runs later. - * - * Return the number of deallocated clusters (not counting sparse ones) on - * success and -errno on error. - * - * WARNING: If @ctx is supplied, regardless of whether success or failure is - * returned, you need to check IS_ERR(@ctx->mrec) and if 'true' the @ctx - * is no longer valid, i.e. you need to either call - * ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it. - * In that case PTR_ERR(@ctx->mrec) will give you the error code for - * why the mapping of the old inode failed. - * - * Locking: - The runlist described by @ni must be locked for writing on entry - * and is locked on return. Note the runlist may be modified when - * needed runlist fragments need to be mapped. - * - The volume lcn bitmap must be unlocked on entry and is unlocked - * on return. - * - This function takes the volume lcn bitmap lock for writing and - * modifies the bitmap contents. - * - If @ctx is NULL, the base mft record of @ni must not be mapped on - * entry and it will be left unmapped on return. - * - If @ctx is not NULL, the base mft record must be mapped on entry - * and it will be left mapped on return. 
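- *
- * For example, to deallocate every cluster in the runlist without an
- * active search context:
- *	ntfs_cluster_free(ni, 0, -1, NULL);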
- */ -static inline s64 ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn, - s64 count, ntfs_attr_search_ctx *ctx) -{ - return __ntfs_cluster_free(ni, start_vcn, count, ctx, false); -} - -extern int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol, - const runlist_element *rl); - -/** - * ntfs_cluster_free_from_rl - free clusters from runlist - * @vol: mounted ntfs volume on which to free the clusters - * @rl: runlist describing the clusters to free - * - * Free all the clusters described by the runlist @rl on the volume @vol. In - * the case of an error being returned, at least some of the clusters were not - * freed. - * - * Return 0 on success and -errno on error. - * - * Locking: - This function takes the volume lcn bitmap lock for writing and - * modifies the bitmap contents. - * - The caller must have locked the runlist @rl for reading or - * writing. - */ -static inline int ntfs_cluster_free_from_rl(ntfs_volume *vol, - const runlist_element *rl) -{ - int ret; - - down_write(&vol->lcnbmp_lock); - ret = ntfs_cluster_free_from_rl_nolock(vol, rl); - up_write(&vol->lcnbmp_lock); - return ret; -} - -#endif /* NTFS_RW */ - -#endif /* defined _LINUX_NTFS_LCNALLOC_H */ diff --git a/fs/ntfs/logfile.c b/fs/ntfs/logfile.c deleted file mode 100644 index 6ce60ffc6ac04..0000000000000 --- a/fs/ntfs/logfile.c +++ /dev/null @@ -1,849 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * logfile.c - NTFS kernel journal handling. Part of the Linux-NTFS project. - * - * Copyright (c) 2002-2007 Anton Altaparmakov - */ - -#ifdef NTFS_RW - -#include -#include -#include -#include -#include -#include -#include - -#include "attrib.h" -#include "aops.h" -#include "debug.h" -#include "logfile.h" -#include "malloc.h" -#include "volume.h" -#include "ntfs.h" - -/** - * ntfs_check_restart_page_header - check the page header for consistency - * @vi: $LogFile inode to which the restart page header belongs - * @rp: restart page header to check - * @pos: position in @vi at which the restart page header resides - * - * Check the restart page header @rp for consistency and return 'true' if it is - * consistent and 'false' otherwise. - * - * This function only needs NTFS_BLOCK_SIZE bytes in @rp, i.e. it does not - * require the full restart page. - */ -static bool ntfs_check_restart_page_header(struct inode *vi, - RESTART_PAGE_HEADER *rp, s64 pos) -{ - u32 logfile_system_page_size, logfile_log_page_size; - u16 ra_ofs, usa_count, usa_ofs, usa_end = 0; - bool have_usa = true; - - ntfs_debug("Entering."); - /* - * If the system or log page sizes are smaller than the ntfs block size - * or either is not a power of 2 we cannot handle this log file. - */ - logfile_system_page_size = le32_to_cpu(rp->system_page_size); - logfile_log_page_size = le32_to_cpu(rp->log_page_size); - if (logfile_system_page_size < NTFS_BLOCK_SIZE || - logfile_log_page_size < NTFS_BLOCK_SIZE || - logfile_system_page_size & - (logfile_system_page_size - 1) || - !is_power_of_2(logfile_log_page_size)) { - ntfs_error(vi->i_sb, "$LogFile uses unsupported page size."); - return false; - } - /* - * We must be either at !pos (1st restart page) or at pos = system page - * size (2nd restart page). - */ - if (pos && pos != logfile_system_page_size) { - ntfs_error(vi->i_sb, "Found restart area in incorrect " - "position in $LogFile."); - return false; - } - /* We only know how to handle version 1.1. */ - if (sle16_to_cpu(rp->major_ver) != 1 || - sle16_to_cpu(rp->minor_ver) != 1) { - ntfs_error(vi->i_sb, "$LogFile version %i.%i is not " - "supported. 
(This driver supports version " - "1.1 only.)", (int)sle16_to_cpu(rp->major_ver), - (int)sle16_to_cpu(rp->minor_ver)); - return false; - } - /* - * If chkdsk has been run the restart page may not be protected by an - * update sequence array. - */ - if (ntfs_is_chkd_record(rp->magic) && !le16_to_cpu(rp->usa_count)) { - have_usa = false; - goto skip_usa_checks; - } - /* Verify the size of the update sequence array. */ - usa_count = 1 + (logfile_system_page_size >> NTFS_BLOCK_SIZE_BITS); - if (usa_count != le16_to_cpu(rp->usa_count)) { - ntfs_error(vi->i_sb, "$LogFile restart page specifies " - "inconsistent update sequence array count."); - return false; - } - /* Verify the position of the update sequence array. */ - usa_ofs = le16_to_cpu(rp->usa_ofs); - usa_end = usa_ofs + usa_count * sizeof(u16); - if (usa_ofs < sizeof(RESTART_PAGE_HEADER) || - usa_end > NTFS_BLOCK_SIZE - sizeof(u16)) { - ntfs_error(vi->i_sb, "$LogFile restart page specifies " - "inconsistent update sequence array offset."); - return false; - } -skip_usa_checks: - /* - * Verify the position of the restart area. It must be: - * - aligned to 8-byte boundary, - * - after the update sequence array, and - * - within the system page size. - */ - ra_ofs = le16_to_cpu(rp->restart_area_offset); - if (ra_ofs & 7 || (have_usa ? ra_ofs < usa_end : - ra_ofs < sizeof(RESTART_PAGE_HEADER)) || - ra_ofs > logfile_system_page_size) { - ntfs_error(vi->i_sb, "$LogFile restart page specifies " - "inconsistent restart area offset."); - return false; - } - /* - * Only restart pages modified by chkdsk are allowed to have chkdsk_lsn - * set. - */ - if (!ntfs_is_chkd_record(rp->magic) && sle64_to_cpu(rp->chkdsk_lsn)) { - ntfs_error(vi->i_sb, "$LogFile restart page is not modified " - "by chkdsk but a chkdsk LSN is specified."); - return false; - } - ntfs_debug("Done."); - return true; -} - -/** - * ntfs_check_restart_area - check the restart area for consistency - * @vi: $LogFile inode to which the restart page belongs - * @rp: restart page whose restart area to check - * - * Check the restart area of the restart page @rp for consistency and return - * 'true' if it is consistent and 'false' otherwise. - * - * This function assumes that the restart page header has already been - * consistency checked. - * - * This function only needs NTFS_BLOCK_SIZE bytes in @rp, i.e. it does not - * require the full restart page. - */ -static bool ntfs_check_restart_area(struct inode *vi, RESTART_PAGE_HEADER *rp) -{ - u64 file_size; - RESTART_AREA *ra; - u16 ra_ofs, ra_len, ca_ofs; - u8 fs_bits; - - ntfs_debug("Entering."); - ra_ofs = le16_to_cpu(rp->restart_area_offset); - ra = (RESTART_AREA*)((u8*)rp + ra_ofs); - /* - * Everything before ra->file_size must be before the first word - * protected by an update sequence number. This ensures that it is - * safe to access ra->client_array_offset. - */ - if (ra_ofs + offsetof(RESTART_AREA, file_size) > - NTFS_BLOCK_SIZE - sizeof(u16)) { - ntfs_error(vi->i_sb, "$LogFile restart area specifies " - "inconsistent file offset."); - return false; - } - /* - * Now that we can access ra->client_array_offset, make sure everything - * up to the log client array is before the first word protected by an - * update sequence number. This ensures we can access all of the - * restart area elements safely. Also, the client array offset must be - * aligned to an 8-byte boundary. 
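-	 * (Rounding via ((x + 7) & ~7) yields the next multiple of 8, so
-	 * comparing the rounded value with the original detects any offset
-	 * that is not 8-byte aligned.)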
- */
-	ca_ofs = le16_to_cpu(ra->client_array_offset);
-	if (((ca_ofs + 7) & ~7) != ca_ofs ||
-			ra_ofs + ca_ofs > NTFS_BLOCK_SIZE - sizeof(u16)) {
-		ntfs_error(vi->i_sb, "$LogFile restart area specifies "
-				"inconsistent client array offset.");
-		return false;
-	}
-	/*
-	 * The restart area must end within the system page size both when
-	 * calculated manually and as specified by ra->restart_area_length.
-	 * Also, the calculated length must not exceed the specified length.
-	 */
-	ra_len = ca_ofs + le16_to_cpu(ra->log_clients) *
-			sizeof(LOG_CLIENT_RECORD);
-	if (ra_ofs + ra_len > le32_to_cpu(rp->system_page_size) ||
-			ra_ofs + le16_to_cpu(ra->restart_area_length) >
-			le32_to_cpu(rp->system_page_size) ||
-			ra_len > le16_to_cpu(ra->restart_area_length)) {
-		ntfs_error(vi->i_sb, "$LogFile restart area is out of bounds "
-				"of the system page size specified by the "
-				"restart page header and/or the specified "
-				"restart area length is inconsistent.");
-		return false;
-	}
-	/*
-	 * The ra->client_free_list and ra->client_in_use_list must be either
-	 * LOGFILE_NO_CLIENT or less than ra->log_clients or they are
-	 * overflowing the client array.
-	 */
-	if ((ra->client_free_list != LOGFILE_NO_CLIENT &&
-			le16_to_cpu(ra->client_free_list) >=
-			le16_to_cpu(ra->log_clients)) ||
-			(ra->client_in_use_list != LOGFILE_NO_CLIENT &&
-			le16_to_cpu(ra->client_in_use_list) >=
-			le16_to_cpu(ra->log_clients))) {
-		ntfs_error(vi->i_sb, "$LogFile restart area specifies "
-				"overflowing client free and/or in use lists.");
-		return false;
-	}
-	/*
-	 * Check ra->seq_number_bits against ra->file_size for consistency.
-	 * We cannot just use ffs() because the file size is not a power of 2.
-	 */
-	file_size = (u64)sle64_to_cpu(ra->file_size);
-	fs_bits = 0;
-	while (file_size) {
-		file_size >>= 1;
-		fs_bits++;
-	}
-	if (le32_to_cpu(ra->seq_number_bits) != 67 - fs_bits) {
-		ntfs_error(vi->i_sb, "$LogFile restart area specifies "
-				"inconsistent sequence number bits.");
-		return false;
-	}
-	/* The log record header length must be a multiple of 8. */
-	if (((le16_to_cpu(ra->log_record_header_length) + 7) & ~7) !=
-			le16_to_cpu(ra->log_record_header_length)) {
-		ntfs_error(vi->i_sb, "$LogFile restart area specifies "
-				"inconsistent log record header length.");
-		return false;
-	}
-	/* Ditto for the log page data offset. */
-	if (((le16_to_cpu(ra->log_page_data_offset) + 7) & ~7) !=
-			le16_to_cpu(ra->log_page_data_offset)) {
-		ntfs_error(vi->i_sb, "$LogFile restart area specifies "
-				"inconsistent log page data offset.");
-		return false;
-	}
-	ntfs_debug("Done.");
-	return true;
-}
-
-/**
- * ntfs_check_log_client_array - check the log client array for consistency
- * @vi:		$LogFile inode to which the restart page belongs
- * @rp:		restart page whose log client array to check
- *
- * Check the log client array of the restart page @rp for consistency and
- * return 'true' if it is consistent and 'false' otherwise.
- *
- * This function assumes that the restart page header and the restart area have
- * already been consistency checked.
- *
- * Unlike ntfs_check_restart_page_header() and ntfs_check_restart_area(), this
- * function needs @rp->system_page_size bytes in @rp, i.e. it requires the full
- * restart page and the page must be multi sector transfer deprotected.
- */
-static bool ntfs_check_log_client_array(struct inode *vi,
-		RESTART_PAGE_HEADER *rp)
-{
-	RESTART_AREA *ra;
-	LOG_CLIENT_RECORD *ca, *cr;
-	u16 nr_clients, idx;
-	bool in_free_list, idx_is_first;
-
-	ntfs_debug("Entering.");
-	ra = (RESTART_AREA*)((u8*)rp + le16_to_cpu(rp->restart_area_offset));
-	ca = (LOG_CLIENT_RECORD*)((u8*)ra +
-			le16_to_cpu(ra->client_array_offset));
-	/*
-	 * Check the ra->client_free_list first and then check the
-	 * ra->client_in_use_list. Check each of the log client records in
-	 * each of the lists and check that the array does not overflow the
-	 * ra->log_clients value. Also keep track of the number of records
-	 * visited as there cannot be more than ra->log_clients records and
-	 * that way we detect eventual loops within a list.
-	 */
-	nr_clients = le16_to_cpu(ra->log_clients);
-	idx = le16_to_cpu(ra->client_free_list);
-	in_free_list = true;
-check_list:
-	for (idx_is_first = true; idx != LOGFILE_NO_CLIENT_CPU; nr_clients--,
-			idx = le16_to_cpu(cr->next_client)) {
-		if (!nr_clients || idx >= le16_to_cpu(ra->log_clients))
-			goto err_out;
-		/* Set @cr to the current log client record. */
-		cr = ca + idx;
-		/* The first log client record must not have a prev_client. */
-		if (idx_is_first) {
-			if (cr->prev_client != LOGFILE_NO_CLIENT)
-				goto err_out;
-			idx_is_first = false;
-		}
-	}
-	/* Switch to and check the in use list if we just did the free list. */
-	if (in_free_list) {
-		in_free_list = false;
-		idx = le16_to_cpu(ra->client_in_use_list);
-		goto check_list;
-	}
-	ntfs_debug("Done.");
-	return true;
-err_out:
-	ntfs_error(vi->i_sb, "$LogFile log client array is corrupt.");
-	return false;
-}
-
-/**
- * ntfs_check_and_load_restart_page - check the restart page for consistency
- * @vi:		$LogFile inode to which the restart page belongs
- * @rp:		restart page to check
- * @pos:	position in @vi at which the restart page resides
- * @wrp:	[OUT] copy of the multi sector transfer deprotected restart page
- * @lsn:	[OUT] set to the current logfile lsn on success
- *
- * Check the restart page @rp for consistency and return 0 if it is consistent
- * and -errno otherwise. The restart page may have been modified by chkdsk in
- * which case its magic is CHKD instead of RSTR.
- *
- * This function only needs NTFS_BLOCK_SIZE bytes in @rp, i.e. it does not
- * require the full restart page.
- *
- * If @wrp is not NULL, on success, *@wrp will point to a buffer containing a
- * copy of the complete multi sector transfer deprotected page. On failure,
- * *@wrp is undefined.
- *
- * Similarly, if @lsn is not NULL, on success *@lsn will be set to the current
- * logfile lsn according to this restart page. On failure, *@lsn is undefined.
- *
- * The following error codes are defined:
- *	-EINVAL - The restart page is inconsistent.
- *	-ENOMEM - Not enough memory to load the restart page.
- *	-EIO	- Failed to read from $LogFile.
- */
-static int ntfs_check_and_load_restart_page(struct inode *vi,
-		RESTART_PAGE_HEADER *rp, s64 pos, RESTART_PAGE_HEADER **wrp,
-		LSN *lsn)
-{
-	RESTART_AREA *ra;
-	RESTART_PAGE_HEADER *trp;
-	int size, err;
-
-	ntfs_debug("Entering.");
-	/* Check the restart page header for consistency. */
-	if (!ntfs_check_restart_page_header(vi, rp, pos)) {
-		/* Error output already done inside the function. */
-		return -EINVAL;
-	}
-	/* Check the restart area for consistency. */
-	if (!ntfs_check_restart_area(vi, rp)) {
-		/* Error output already done inside the function.
 */
-		return -EINVAL;
-	}
-	ra = (RESTART_AREA*)((u8*)rp + le16_to_cpu(rp->restart_area_offset));
-	/*
-	 * Allocate a buffer to store the whole restart page so we can multi
-	 * sector transfer deprotect it.
-	 */
-	trp = ntfs_malloc_nofs(le32_to_cpu(rp->system_page_size));
-	if (!trp) {
-		ntfs_error(vi->i_sb, "Failed to allocate memory for $LogFile "
-				"restart page buffer.");
-		return -ENOMEM;
-	}
-	/*
-	 * Read the whole of the restart page into the buffer. If it fits
-	 * completely inside @rp, just copy it from there. Otherwise map all
-	 * the required pages and copy the data from them.
-	 */
-	size = PAGE_SIZE - (pos & ~PAGE_MASK);
-	if (size >= le32_to_cpu(rp->system_page_size)) {
-		memcpy(trp, rp, le32_to_cpu(rp->system_page_size));
-	} else {
-		pgoff_t idx;
-		struct page *page;
-		int have_read, to_read;
-
-		/* First copy what we already have in @rp. */
-		memcpy(trp, rp, size);
-		/* Copy the remaining data one page at a time. */
-		have_read = size;
-		to_read = le32_to_cpu(rp->system_page_size) - size;
-		idx = (pos + size) >> PAGE_SHIFT;
-		BUG_ON((pos + size) & ~PAGE_MASK);
-		do {
-			page = ntfs_map_page(vi->i_mapping, idx);
-			if (IS_ERR(page)) {
-				ntfs_error(vi->i_sb, "Error mapping $LogFile "
-						"page (index %lu).", idx);
-				err = PTR_ERR(page);
-				if (err != -EIO && err != -ENOMEM)
-					err = -EIO;
-				goto err_out;
-			}
-			size = min_t(int, to_read, PAGE_SIZE);
-			memcpy((u8*)trp + have_read, page_address(page), size);
-			ntfs_unmap_page(page);
-			have_read += size;
-			to_read -= size;
-			idx++;
-		} while (to_read > 0);
-	}
-	/*
-	 * Perform the multi sector transfer deprotection on the buffer if the
-	 * restart page is protected.
-	 */
-	if ((!ntfs_is_chkd_record(trp->magic) || le16_to_cpu(trp->usa_count))
-			&& post_read_mst_fixup((NTFS_RECORD*)trp,
-			le32_to_cpu(rp->system_page_size))) {
-		/*
-		 * A multi sector transfer error was detected. We only need to
-		 * abort if the restart page contents exceed the multi sector
-		 * transfer fixup of the first sector.
-		 */
-		if (le16_to_cpu(rp->restart_area_offset) +
-				le16_to_cpu(ra->restart_area_length) >
-				NTFS_BLOCK_SIZE - sizeof(u16)) {
-			ntfs_error(vi->i_sb, "Multi sector transfer error "
-					"detected in $LogFile restart page.");
-			err = -EINVAL;
-			goto err_out;
-		}
-	}
-	/*
-	 * If the restart page is modified by chkdsk or there are no active
-	 * logfile clients, the logfile is consistent. Otherwise, need to
-	 * check the log client records for consistency, too.
-	 */
-	err = 0;
-	if (ntfs_is_rstr_record(rp->magic) &&
-			ra->client_in_use_list != LOGFILE_NO_CLIENT) {
-		if (!ntfs_check_log_client_array(vi, trp)) {
-			err = -EINVAL;
-			goto err_out;
-		}
-	}
-	if (lsn) {
-		if (ntfs_is_rstr_record(rp->magic))
-			*lsn = sle64_to_cpu(ra->current_lsn);
-		else /* if (ntfs_is_chkd_record(rp->magic)) */
-			*lsn = sle64_to_cpu(rp->chkdsk_lsn);
-	}
-	ntfs_debug("Done.");
-	if (wrp)
-		*wrp = trp;
-	else {
-err_out:
-		ntfs_free(trp);
-	}
-	return err;
-}
-
-/**
- * ntfs_check_logfile - check the journal for consistency
- * @log_vi:	struct inode of loaded journal $LogFile to check
- * @rp:		[OUT] on success this is a copy of the current restart page
- *
- * Check the $LogFile journal for consistency and return 'true' if it is
- * consistent and 'false' if not. On success, the current restart page is
- * returned in *@rp. Caller must call ntfs_free(*@rp) when finished with it.
- *
- * At present we only check the two restart pages and ignore the log record
- * pages.
- *
- * Note that the MstProtected flag is not set on the $LogFile inode and hence
- * when reading pages they are not deprotected. This is because we do not know
- * if the $LogFile was created on a system with a different page size to ours
- * yet and mst deprotection would fail if our page size is smaller.
- */
-bool ntfs_check_logfile(struct inode *log_vi, RESTART_PAGE_HEADER **rp)
-{
-	s64 size, pos;
-	LSN rstr1_lsn, rstr2_lsn;
-	ntfs_volume *vol = NTFS_SB(log_vi->i_sb);
-	struct address_space *mapping = log_vi->i_mapping;
-	struct page *page = NULL;
-	u8 *kaddr = NULL;
-	RESTART_PAGE_HEADER *rstr1_ph = NULL;
-	RESTART_PAGE_HEADER *rstr2_ph = NULL;
-	int log_page_size, err;
-	bool logfile_is_empty = true;
-	u8 log_page_bits;
-
-	ntfs_debug("Entering.");
-	/* An empty $LogFile must have been clean before it got emptied. */
-	if (NVolLogFileEmpty(vol))
-		goto is_empty;
-	size = i_size_read(log_vi);
-	/* Make sure the file doesn't exceed the maximum allowed size. */
-	if (size > MaxLogFileSize)
-		size = MaxLogFileSize;
-	/*
-	 * Truncate size to a multiple of the page cache size or the default
-	 * log page size if the page cache size is between the default log page
-	 * size and twice that.
-	 */
-	if (PAGE_SIZE >= DefaultLogPageSize && PAGE_SIZE <=
-			DefaultLogPageSize * 2)
-		log_page_size = DefaultLogPageSize;
-	else
-		log_page_size = PAGE_SIZE;
-	/*
-	 * Use ntfs_ffs() instead of ffs() to enable the compiler to
-	 * optimize log_page_size and log_page_bits into constants.
-	 */
-	log_page_bits = ntfs_ffs(log_page_size) - 1;
-	size &= ~(s64)(log_page_size - 1);
-	/*
-	 * Ensure the log file is big enough to store at least the two restart
-	 * pages and the minimum number of log record pages.
-	 */
-	if (size < log_page_size * 2 || (size - log_page_size * 2) >>
-			log_page_bits < MinLogRecordPages) {
-		ntfs_error(vol->sb, "$LogFile is too small.");
-		return false;
-	}
-	/*
-	 * Read through the file looking for a restart page. Since the restart
-	 * page header is at the beginning of a page we only need to search at
-	 * what could be the beginning of a page (for each page size) rather
-	 * than scanning the whole file byte by byte. If all potential places
-	 * contain empty and uninitialized records, the log file can be assumed
-	 * to be empty.
-	 */
-	for (pos = 0; pos < size; pos <<= 1) {
-		pgoff_t idx = pos >> PAGE_SHIFT;
-		if (!page || page->index != idx) {
-			if (page)
-				ntfs_unmap_page(page);
-			page = ntfs_map_page(mapping, idx);
-			if (IS_ERR(page)) {
-				ntfs_error(vol->sb, "Error mapping $LogFile "
-						"page (index %lu).", idx);
-				goto err_out;
-			}
-		}
-		kaddr = (u8*)page_address(page) + (pos & ~PAGE_MASK);
-		/*
-		 * A non-empty block means the logfile is not empty while an
-		 * empty block after a non-empty block has been encountered
-		 * means we are done.
-		 */
-		if (!ntfs_is_empty_recordp((le32*)kaddr))
-			logfile_is_empty = false;
-		else if (!logfile_is_empty)
-			break;
-		/*
-		 * A log record page means there cannot be a restart page after
-		 * this so no need to continue searching.
-		 */
-		if (ntfs_is_rcrd_recordp((le32*)kaddr))
-			break;
-		/* If not a (modified by chkdsk) restart page, continue. */
-		if (!ntfs_is_rstr_recordp((le32*)kaddr) &&
-				!ntfs_is_chkd_recordp((le32*)kaddr)) {
-			if (!pos)
-				pos = NTFS_BLOCK_SIZE >> 1;
-			continue;
-		}
-		/*
-		 * Check the (modified by chkdsk) restart page for consistency
-		 * and get a copy of the complete multi sector transfer
-		 * deprotected restart page.
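-		 * We keep looking even after a valid restart page has been
-		 * found since the second restart page, if also valid and
-		 * more recent, supersedes the first one (see the
-		 * rstr2_lsn > rstr1_lsn comparison below).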
- */ - err = ntfs_check_and_load_restart_page(log_vi, - (RESTART_PAGE_HEADER*)kaddr, pos, - !rstr1_ph ? &rstr1_ph : &rstr2_ph, - !rstr1_ph ? &rstr1_lsn : &rstr2_lsn); - if (!err) { - /* - * If we have now found the first (modified by chkdsk) - * restart page, continue looking for the second one. - */ - if (!pos) { - pos = NTFS_BLOCK_SIZE >> 1; - continue; - } - /* - * We have now found the second (modified by chkdsk) - * restart page, so we can stop looking. - */ - break; - } - /* - * Error output already done inside the function. Note, we do - * not abort if the restart page was invalid as we might still - * find a valid one further in the file. - */ - if (err != -EINVAL) { - ntfs_unmap_page(page); - goto err_out; - } - /* Continue looking. */ - if (!pos) - pos = NTFS_BLOCK_SIZE >> 1; - } - if (page) - ntfs_unmap_page(page); - if (logfile_is_empty) { - NVolSetLogFileEmpty(vol); -is_empty: - ntfs_debug("Done. ($LogFile is empty.)"); - return true; - } - if (!rstr1_ph) { - BUG_ON(rstr2_ph); - ntfs_error(vol->sb, "Did not find any restart pages in " - "$LogFile and it was not empty."); - return false; - } - /* If both restart pages were found, use the more recent one. */ - if (rstr2_ph) { - /* - * If the second restart area is more recent, switch to it. - * Otherwise just throw it away. - */ - if (rstr2_lsn > rstr1_lsn) { - ntfs_debug("Using second restart page as it is more " - "recent."); - ntfs_free(rstr1_ph); - rstr1_ph = rstr2_ph; - /* rstr1_lsn = rstr2_lsn; */ - } else { - ntfs_debug("Using first restart page as it is more " - "recent."); - ntfs_free(rstr2_ph); - } - rstr2_ph = NULL; - } - /* All consistency checks passed. */ - if (rp) - *rp = rstr1_ph; - else - ntfs_free(rstr1_ph); - ntfs_debug("Done."); - return true; -err_out: - if (rstr1_ph) - ntfs_free(rstr1_ph); - return false; -} - -/** - * ntfs_is_logfile_clean - check in the journal if the volume is clean - * @log_vi: struct inode of loaded journal $LogFile to check - * @rp: copy of the current restart page - * - * Analyze the $LogFile journal and return 'true' if it indicates the volume was - * shutdown cleanly and 'false' if not. - * - * At present we only look at the two restart pages and ignore the log record - * pages. This is a little bit crude in that there will be a very small number - * of cases where we think that a volume is dirty when in fact it is clean. - * This should only affect volumes that have not been shutdown cleanly but did - * not have any pending, non-check-pointed i/o, i.e. they were completely idle - * at least for the five seconds preceding the unclean shutdown. - * - * This function assumes that the $LogFile journal has already been consistency - * checked by a call to ntfs_check_logfile() and in particular if the $LogFile - * is empty this function requires that NVolLogFileEmpty() is true otherwise an - * empty volume will be reported as dirty. - */ -bool ntfs_is_logfile_clean(struct inode *log_vi, const RESTART_PAGE_HEADER *rp) -{ - ntfs_volume *vol = NTFS_SB(log_vi->i_sb); - RESTART_AREA *ra; - - ntfs_debug("Entering."); - /* An empty $LogFile must have been clean before it got emptied. */ - if (NVolLogFileEmpty(vol)) { - ntfs_debug("Done. ($LogFile is empty.)"); - return true; - } - BUG_ON(!rp); - if (!ntfs_is_rstr_record(rp->magic) && - !ntfs_is_chkd_record(rp->magic)) { - ntfs_error(vol->sb, "Restart page buffer is invalid. 
This is " - "probably a bug in that the $LogFile should " - "have been consistency checked before calling " - "this function."); - return false; - } - ra = (RESTART_AREA*)((u8*)rp + le16_to_cpu(rp->restart_area_offset)); - /* - * If the $LogFile has active clients, i.e. it is open, and we do not - * have the RESTART_VOLUME_IS_CLEAN bit set in the restart area flags, - * we assume there was an unclean shutdown. - */ - if (ra->client_in_use_list != LOGFILE_NO_CLIENT && - !(ra->flags & RESTART_VOLUME_IS_CLEAN)) { - ntfs_debug("Done. $LogFile indicates a dirty shutdown."); - return false; - } - /* $LogFile indicates a clean shutdown. */ - ntfs_debug("Done. $LogFile indicates a clean shutdown."); - return true; -} - -/** - * ntfs_empty_logfile - empty the contents of the $LogFile journal - * @log_vi: struct inode of loaded journal $LogFile to empty - * - * Empty the contents of the $LogFile journal @log_vi and return 'true' on - * success and 'false' on error. - * - * This function assumes that the $LogFile journal has already been consistency - * checked by a call to ntfs_check_logfile() and that ntfs_is_logfile_clean() - * has been used to ensure that the $LogFile is clean. - */ -bool ntfs_empty_logfile(struct inode *log_vi) -{ - VCN vcn, end_vcn; - ntfs_inode *log_ni = NTFS_I(log_vi); - ntfs_volume *vol = log_ni->vol; - struct super_block *sb = vol->sb; - runlist_element *rl; - unsigned long flags; - unsigned block_size, block_size_bits; - int err; - bool should_wait = true; - - ntfs_debug("Entering."); - if (NVolLogFileEmpty(vol)) { - ntfs_debug("Done."); - return true; - } - /* - * We cannot use ntfs_attr_set() because we may be still in the middle - * of a mount operation. Thus we do the emptying by hand by first - * zapping the page cache pages for the $LogFile/$DATA attribute and - * then emptying each of the buffers in each of the clusters specified - * by the runlist by hand. - */ - block_size = sb->s_blocksize; - block_size_bits = sb->s_blocksize_bits; - vcn = 0; - read_lock_irqsave(&log_ni->size_lock, flags); - end_vcn = (log_ni->initialized_size + vol->cluster_size_mask) >> - vol->cluster_size_bits; - read_unlock_irqrestore(&log_ni->size_lock, flags); - truncate_inode_pages(log_vi->i_mapping, 0); - down_write(&log_ni->runlist.lock); - rl = log_ni->runlist.rl; - if (unlikely(!rl || vcn < rl->vcn || !rl->length)) { -map_vcn: - err = ntfs_map_runlist_nolock(log_ni, vcn, NULL); - if (err) { - ntfs_error(sb, "Failed to map runlist fragment (error " - "%d).", -err); - goto err; - } - rl = log_ni->runlist.rl; - BUG_ON(!rl || vcn < rl->vcn || !rl->length); - } - /* Seek to the runlist element containing @vcn. */ - while (rl->length && vcn >= rl[1].vcn) - rl++; - do { - LCN lcn; - sector_t block, end_block; - s64 len; - - /* - * If this run is not mapped map it now and start again as the - * runlist will have been updated. - */ - lcn = rl->lcn; - if (unlikely(lcn == LCN_RL_NOT_MAPPED)) { - vcn = rl->vcn; - goto map_vcn; - } - /* If this run is not valid abort with an error. */ - if (unlikely(!rl->length || lcn < LCN_HOLE)) - goto rl_err; - /* Skip holes. */ - if (lcn == LCN_HOLE) - continue; - block = lcn << vol->cluster_size_bits >> block_size_bits; - len = rl->length; - if (rl[1].vcn > end_vcn) - len = end_vcn - rl->vcn; - end_block = (lcn + len) << vol->cluster_size_bits >> - block_size_bits; - /* Iterate over the blocks in the run and empty them. */ - do { - struct buffer_head *bh; - - /* Obtain the buffer, possibly not uptodate. 
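-			   sb_getblk() does not read the block from disk,
-			   which is fine here as the whole buffer is
-			   overwritten with 0xff bytes below.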
*/ - bh = sb_getblk(sb, block); - BUG_ON(!bh); - /* Setup buffer i/o submission. */ - lock_buffer(bh); - bh->b_end_io = end_buffer_write_sync; - get_bh(bh); - /* Set the entire contents of the buffer to 0xff. */ - memset(bh->b_data, -1, block_size); - if (!buffer_uptodate(bh)) - set_buffer_uptodate(bh); - if (buffer_dirty(bh)) - clear_buffer_dirty(bh); - /* - * Submit the buffer and wait for i/o to complete but - * only for the first buffer so we do not miss really - * serious i/o errors. Once the first buffer has - * completed ignore errors afterwards as we can assume - * that if one buffer worked all of them will work. - */ - submit_bh(REQ_OP_WRITE, bh); - if (should_wait) { - should_wait = false; - wait_on_buffer(bh); - if (unlikely(!buffer_uptodate(bh))) - goto io_err; - } - brelse(bh); - } while (++block < end_block); - } while ((++rl)->vcn < end_vcn); - up_write(&log_ni->runlist.lock); - /* - * Zap the pages again just in case any got instantiated whilst we were - * emptying the blocks by hand. FIXME: We may not have completed - * writing to all the buffer heads yet so this may happen too early. - * We really should use a kernel thread to do the emptying - * asynchronously and then we can also set the volume dirty and output - * an error message if emptying should fail. - */ - truncate_inode_pages(log_vi->i_mapping, 0); - /* Set the flag so we do not have to do it again on remount. */ - NVolSetLogFileEmpty(vol); - ntfs_debug("Done."); - return true; -io_err: - ntfs_error(sb, "Failed to write buffer. Unmount and run chkdsk."); - goto dirty_err; -rl_err: - ntfs_error(sb, "Runlist is corrupt. Unmount and run chkdsk."); -dirty_err: - NVolSetErrors(vol); - err = -EIO; -err: - up_write(&log_ni->runlist.lock); - ntfs_error(sb, "Failed to fill $LogFile with 0xff bytes (error %d).", - -err); - return false; -} - -#endif /* NTFS_RW */ diff --git a/fs/ntfs/logfile.h b/fs/ntfs/logfile.h deleted file mode 100644 index 429d4909cc722..0000000000000 --- a/fs/ntfs/logfile.h +++ /dev/null @@ -1,295 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * logfile.h - Defines for NTFS kernel journal ($LogFile) handling. Part of - * the Linux-NTFS project. - * - * Copyright (c) 2000-2005 Anton Altaparmakov - */ - -#ifndef _LINUX_NTFS_LOGFILE_H -#define _LINUX_NTFS_LOGFILE_H - -#ifdef NTFS_RW - -#include - -#include "types.h" -#include "endian.h" -#include "layout.h" - -/* - * Journal ($LogFile) organization: - * - * Two restart areas present in the first two pages (restart pages, one restart - * area in each page). When the volume is dismounted they should be identical, - * except for the update sequence array which usually has a different update - * sequence number. - * - * These are followed by log records organized in pages headed by a log record - * header going up to log file size. Not all pages contain log records when a - * volume is first formatted, but as the volume ages, all records will be used. - * When the log file fills up, the records at the beginning are purged (by - * modifying the oldest_lsn to a higher value presumably) and writing begins - * at the beginning of the file. Effectively, the log file is viewed as a - * circular entity. - * - * NOTE: Windows NT, 2000, and XP all use log file version 1.1 but they accept - * versions <= 1.x, including 0.-1. (Yes, that is a minus one in there!) We - * probably only want to support 1.1 as this seems to be the current version - * and we don't know how that differs from the older versions. 
The only
- * exception is if the journal is clean as marked by the two restart pages
- * then it doesn't matter whether we are on an earlier version. We can just
- * reinitialize the logfile and start again with version 1.1.
- */
-
-/* Some $LogFile related constants. */
-#define MaxLogFileSize		0x100000000ULL
-#define DefaultLogPageSize	4096
-#define MinLogRecordPages	48
-
-/*
- * Log file restart page header (begins the restart area).
- */
-typedef struct {
-/*Ofs*/
-/*  0	NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. */
-/*  0*/	NTFS_RECORD_TYPE magic;	/* The magic is "RSTR". */
-/*  4*/	le16 usa_ofs;		/* See NTFS_RECORD definition in layout.h.
-				   When creating, set this to be immediately
-				   after this header structure (without any
-				   alignment). */
-/*  6*/	le16 usa_count;		/* See NTFS_RECORD definition in layout.h. */
-
-/*  8*/	leLSN chkdsk_lsn;	/* The last log file sequence number found by
-				   chkdsk. Only used when the magic is changed
-				   to "CHKD". Otherwise this is zero. */
-/* 16*/	le32 system_page_size;	/* Byte size of system pages when the log file
-				   was created, has to be >= 512 and a power of
-				   2. Use this to calculate the required size
-				   of the usa (usa_count) and add it to usa_ofs.
-				   Then verify that the result is less than the
-				   value of the restart_area_offset. */
-/* 20*/	le32 log_page_size;	/* Byte size of log file pages, has to be >=
-				   512 and a power of 2. The default is 4096
-				   and is used when the system page size is
-				   between 4096 and 8192. Otherwise this is
-				   set to the system page size instead. */
-/* 24*/	le16 restart_area_offset;/* Byte offset from the start of this header to
-				   the RESTART_AREA. Value has to be aligned
-				   to 8-byte boundary. When creating, set this
-				   to be after the usa. */
-/* 26*/	sle16 minor_ver;	/* Log file minor version. Only check if major
-				   version is 1. */
-/* 28*/	sle16 major_ver;	/* Log file major version. We only support
-				   version 1.1. */
-/* sizeof() = 30 (0x1e) bytes */
-} __attribute__ ((__packed__)) RESTART_PAGE_HEADER;
-
-/*
- * Constant for the log client indices meaning that there are no client records
- * in this particular client array. Also inside the client records themselves,
- * this means that there are no client records preceding or following this one.
- */
-#define LOGFILE_NO_CLIENT	cpu_to_le16(0xffff)
-#define LOGFILE_NO_CLIENT_CPU	0xffff
-
-/*
- * These are the so far known RESTART_AREA_* flags (16-bit) which contain
- * information about the log file in which they are present.
- */
-enum {
-	RESTART_VOLUME_IS_CLEAN	= cpu_to_le16(0x0002),
-	RESTART_SPACE_FILLER	= cpu_to_le16(0xffff), /* gcc: Force enum bit width to 16. */
-} __attribute__ ((__packed__));
-
-typedef le16 RESTART_AREA_FLAGS;
-
-/*
- * Log file restart area record. The offset of this record is found by adding
- * the offset of the RESTART_PAGE_HEADER to the restart_area_offset value found
- * in it. See notes at restart_area_offset above.
- */
-typedef struct {
-/*Ofs*/
-/*  0*/	leLSN current_lsn;	/* The current, i.e. last LSN inside the log
-				   when the restart area was last written.
-				   This happens often but what is the interval?
-				   Is it just fixed time or is it every time a
-				   check point is written or something else?
-				   On create set to 0. */
-/*  8*/	le16 log_clients;	/* Number of log client records in the array of
-				   log client records which follows this
-				   restart area. Must be 1. */
-/* 10*/	le16 client_free_list;	/* The index of the first free log client record
-				   in the array of log client records.
- LOGFILE_NO_CLIENT means that there are no - free log client records in the array. - If != LOGFILE_NO_CLIENT, check that - log_clients > client_free_list. On Win2k - and presumably earlier, on a clean volume - this is != LOGFILE_NO_CLIENT, and it should - be 0, i.e. the first (and only) client - record is free and thus the logfile is - closed and hence clean. A dirty volume - would have left the logfile open and hence - this would be LOGFILE_NO_CLIENT. On WinXP - and presumably later, the logfile is always - open, even on clean shutdown so this should - always be LOGFILE_NO_CLIENT. */ -/* 12*/ le16 client_in_use_list;/* The index of the first in-use log client - record in the array of log client records. - LOGFILE_NO_CLIENT means that there are no - in-use log client records in the array. If - != LOGFILE_NO_CLIENT check that log_clients - > client_in_use_list. On Win2k and - presumably earlier, on a clean volume this - is LOGFILE_NO_CLIENT, i.e. there are no - client records in use and thus the logfile - is closed and hence clean. A dirty volume - would have left the logfile open and hence - this would be != LOGFILE_NO_CLIENT, and it - should be 0, i.e. the first (and only) - client record is in use. On WinXP and - presumably later, the logfile is always - open, even on clean shutdown so this should - always be 0. */ -/* 14*/ RESTART_AREA_FLAGS flags;/* Flags modifying LFS behaviour. On Win2k - and presumably earlier this is always 0. On - WinXP and presumably later, if the logfile - was shutdown cleanly, the second bit, - RESTART_VOLUME_IS_CLEAN, is set. This bit - is cleared when the volume is mounted by - WinXP and set when the volume is dismounted, - thus if the logfile is dirty, this bit is - clear. Thus we don't need to check the - Windows version to determine if the logfile - is clean. Instead if the logfile is closed, - we know it must be clean. If it is open and - this bit is set, we also know it must be - clean. If on the other hand the logfile is - open and this bit is clear, we can be almost - certain that the logfile is dirty. */ -/* 16*/ le32 seq_number_bits; /* How many bits to use for the sequence - number. This is calculated as 67 - the - number of bits required to store the logfile - size in bytes and this can be used in with - the specified file_size as a consistency - check. */ -/* 20*/ le16 restart_area_length;/* Length of the restart area including the - client array. Following checks required if - version matches. Otherwise, skip them. - restart_area_offset + restart_area_length - has to be <= system_page_size. Also, - restart_area_length has to be >= - client_array_offset + (log_clients * - sizeof(log client record)). */ -/* 22*/ le16 client_array_offset;/* Offset from the start of this record to - the first log client record if versions are - matched. When creating, set this to be - after this restart area structure, aligned - to 8-bytes boundary. If the versions do not - match, this is ignored and the offset is - assumed to be (sizeof(RESTART_AREA) + 7) & - ~7, i.e. rounded up to first 8-byte - boundary. Either way, client_array_offset - has to be aligned to an 8-byte boundary. - Also, restart_area_offset + - client_array_offset has to be <= 510. - Finally, client_array_offset + (log_clients - * sizeof(log client record)) has to be <= - system_page_size. On Win2k and presumably - earlier, this is 0x30, i.e. immediately - following this record. On WinXP and - presumably later, this is 0x40, i.e. 
there - are 16 extra bytes between this record and - the client array. This probably means that - the RESTART_AREA record is actually bigger - in WinXP and later. */ -/* 24*/ sle64 file_size; /* Usable byte size of the log file. If the - restart_area_offset + the offset of the - file_size are > 510 then corruption has - occurred. This is the very first check when - starting with the restart_area as if it - fails it means that some of the above values - will be corrupted by the multi sector - transfer protection. The file_size has to - be rounded down to be a multiple of the - log_page_size in the RESTART_PAGE_HEADER and - then it has to be at least big enough to - store the two restart pages and 48 (0x30) - log record pages. */ -/* 32*/ le32 last_lsn_data_length;/* Length of data of last LSN, not including - the log record header. On create set to - 0. */ -/* 36*/ le16 log_record_header_length;/* Byte size of the log record header. - If the version matches then check that the - value of log_record_header_length is a - multiple of 8, i.e. - (log_record_header_length + 7) & ~7 == - log_record_header_length. When creating set - it to sizeof(LOG_RECORD_HEADER), aligned to - 8 bytes. */ -/* 38*/ le16 log_page_data_offset;/* Offset to the start of data in a log record - page. Must be a multiple of 8. On create - set it to immediately after the update - sequence array of the log record page. */ -/* 40*/ le32 restart_log_open_count;/* A counter that gets incremented every - time the logfile is restarted which happens - at mount time when the logfile is opened. - When creating set to a random value. Win2k - sets it to the low 32 bits of the current - system time in NTFS format (see time.h). */ -/* 44*/ le32 reserved; /* Reserved/alignment to 8-byte boundary. */ -/* sizeof() = 48 (0x30) bytes */ -} __attribute__ ((__packed__)) RESTART_AREA; - -/* - * Log client record. The offset of this record is found by adding the offset - * of the RESTART_AREA to the client_array_offset value found in it. - */ -typedef struct { -/*Ofs*/ -/* 0*/ leLSN oldest_lsn; /* Oldest LSN needed by this client. On create - set to 0. */ -/* 8*/ leLSN client_restart_lsn;/* LSN at which this client needs to restart - the volume, i.e. the current position within - the log file. At present, if clean this - should = current_lsn in restart area but it - probably also = current_lsn when dirty most - of the time. At create set to 0. */ -/* 16*/ le16 prev_client; /* The offset to the previous log client record - in the array of log client records. - LOGFILE_NO_CLIENT means there is no previous - client record, i.e. this is the first one. - This is always LOGFILE_NO_CLIENT. */ -/* 18*/ le16 next_client; /* The offset to the next log client record in - the array of log client records. - LOGFILE_NO_CLIENT means there are no next - client records, i.e. this is the last one. - This is always LOGFILE_NO_CLIENT. */ -/* 20*/ le16 seq_number; /* On Win2k and presumably earlier, this is set - to zero every time the logfile is restarted - and it is incremented when the logfile is - closed at dismount time. Thus it is 0 when - dirty and 1 when clean. On WinXP and - presumably later, this is always 0. */ -/* 22*/ u8 reserved[6]; /* Reserved/alignment. */ -/* 28*/ le32 client_name_length;/* Length of client name in bytes. Should - always be 8. */ -/* 32*/ ntfschar client_name[64];/* Name of the client in Unicode. Should - always be "NTFS" with the remaining bytes - set to 0. 
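-				   (A client_name_length of 8 bytes corresponds
-				   to the four Unicode characters of "NTFS".)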
*/ -/* sizeof() = 160 (0xa0) bytes */ -} __attribute__ ((__packed__)) LOG_CLIENT_RECORD; - -extern bool ntfs_check_logfile(struct inode *log_vi, - RESTART_PAGE_HEADER **rp); - -extern bool ntfs_is_logfile_clean(struct inode *log_vi, - const RESTART_PAGE_HEADER *rp); - -extern bool ntfs_empty_logfile(struct inode *log_vi); - -#endif /* NTFS_RW */ - -#endif /* _LINUX_NTFS_LOGFILE_H */ diff --git a/fs/ntfs/malloc.h b/fs/ntfs/malloc.h deleted file mode 100644 index 7068425735f14..0000000000000 --- a/fs/ntfs/malloc.h +++ /dev/null @@ -1,77 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * malloc.h - NTFS kernel memory handling. Part of the Linux-NTFS project. - * - * Copyright (c) 2001-2005 Anton Altaparmakov - */ - -#ifndef _LINUX_NTFS_MALLOC_H -#define _LINUX_NTFS_MALLOC_H - -#include -#include -#include - -/** - * __ntfs_malloc - allocate memory in multiples of pages - * @size: number of bytes to allocate - * @gfp_mask: extra flags for the allocator - * - * Internal function. You probably want ntfs_malloc_nofs()... - * - * Allocates @size bytes of memory, rounded up to multiples of PAGE_SIZE and - * returns a pointer to the allocated memory. - * - * If there was insufficient memory to complete the request, return NULL. - * Depending on @gfp_mask the allocation may be guaranteed to succeed. - */ -static inline void *__ntfs_malloc(unsigned long size, gfp_t gfp_mask) -{ - if (likely(size <= PAGE_SIZE)) { - BUG_ON(!size); - /* kmalloc() has per-CPU caches so is faster for now. */ - return kmalloc(PAGE_SIZE, gfp_mask & ~__GFP_HIGHMEM); - /* return (void *)__get_free_page(gfp_mask); */ - } - if (likely((size >> PAGE_SHIFT) < totalram_pages())) - return __vmalloc(size, gfp_mask); - return NULL; -} - -/** - * ntfs_malloc_nofs - allocate memory in multiples of pages - * @size: number of bytes to allocate - * - * Allocates @size bytes of memory, rounded up to multiples of PAGE_SIZE and - * returns a pointer to the allocated memory. - * - * If there was insufficient memory to complete the request, return NULL. - */ -static inline void *ntfs_malloc_nofs(unsigned long size) -{ - return __ntfs_malloc(size, GFP_NOFS | __GFP_HIGHMEM); -} - -/** - * ntfs_malloc_nofs_nofail - allocate memory in multiples of pages - * @size: number of bytes to allocate - * - * Allocates @size bytes of memory, rounded up to multiples of PAGE_SIZE and - * returns a pointer to the allocated memory. - * - * This function guarantees that the allocation will succeed. It will sleep - * for as long as it takes to complete the allocation. - * - * If there was insufficient memory to complete the request, return NULL. - */ -static inline void *ntfs_malloc_nofs_nofail(unsigned long size) -{ - return __ntfs_malloc(size, GFP_NOFS | __GFP_HIGHMEM | __GFP_NOFAIL); -} - -static inline void ntfs_free(void *addr) -{ - kvfree(addr); -} - -#endif /* _LINUX_NTFS_MALLOC_H */ diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c deleted file mode 100644 index 6fd1dc4b08c81..0000000000000 --- a/fs/ntfs/mft.c +++ /dev/null @@ -1,2907 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * mft.c - NTFS kernel mft record operations. Part of the Linux-NTFS project. - * - * Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc. 
- * Copyright (c) 2002 Richard Russon - */ - -#include -#include -#include -#include - -#include "attrib.h" -#include "aops.h" -#include "bitmap.h" -#include "debug.h" -#include "dir.h" -#include "lcnalloc.h" -#include "malloc.h" -#include "mft.h" -#include "ntfs.h" - -#define MAX_BHS (PAGE_SIZE / NTFS_BLOCK_SIZE) - -/** - * map_mft_record_page - map the page in which a specific mft record resides - * @ni: ntfs inode whose mft record page to map - * - * This maps the page in which the mft record of the ntfs inode @ni is situated - * and returns a pointer to the mft record within the mapped page. - * - * Return value needs to be checked with IS_ERR() and if that is true PTR_ERR() - * contains the negative error code returned. - */ -static inline MFT_RECORD *map_mft_record_page(ntfs_inode *ni) -{ - loff_t i_size; - ntfs_volume *vol = ni->vol; - struct inode *mft_vi = vol->mft_ino; - struct page *page; - unsigned long index, end_index; - unsigned ofs; - - BUG_ON(ni->page); - /* - * The index into the page cache and the offset within the page cache - * page of the wanted mft record. FIXME: We need to check for - * overflowing the unsigned long, but I don't think we would ever get - * here if the volume was that big... - */ - index = (u64)ni->mft_no << vol->mft_record_size_bits >> - PAGE_SHIFT; - ofs = (ni->mft_no << vol->mft_record_size_bits) & ~PAGE_MASK; - - i_size = i_size_read(mft_vi); - /* The maximum valid index into the page cache for $MFT's data. */ - end_index = i_size >> PAGE_SHIFT; - - /* If the wanted index is out of bounds the mft record doesn't exist. */ - if (unlikely(index >= end_index)) { - if (index > end_index || (i_size & ~PAGE_MASK) < ofs + - vol->mft_record_size) { - page = ERR_PTR(-ENOENT); - ntfs_error(vol->sb, "Attempt to read mft record 0x%lx, " - "which is beyond the end of the mft. " - "This is probably a bug in the ntfs " - "driver.", ni->mft_no); - goto err_out; - } - } - /* Read, map, and pin the page. */ - page = ntfs_map_page(mft_vi->i_mapping, index); - if (!IS_ERR(page)) { - /* Catch multi sector transfer fixup errors. */ - if (likely(ntfs_is_mft_recordp((le32*)(page_address(page) + - ofs)))) { - ni->page = page; - ni->page_ofs = ofs; - return page_address(page) + ofs; - } - ntfs_error(vol->sb, "Mft record 0x%lx is corrupt. " - "Run chkdsk.", ni->mft_no); - ntfs_unmap_page(page); - page = ERR_PTR(-EIO); - NVolSetErrors(vol); - } -err_out: - ni->page = NULL; - ni->page_ofs = 0; - return (void*)page; -} - -/** - * map_mft_record - map, pin and lock an mft record - * @ni: ntfs inode whose MFT record to map - * - * First, take the mrec_lock mutex. We might now be sleeping, while waiting - * for the mutex if it was already locked by someone else. - * - * The page of the record is mapped using map_mft_record_page() before being - * returned to the caller. - * - * This in turn uses ntfs_map_page() to get the page containing the wanted mft - * record (it in turn calls read_cache_page() which reads it in from disk if - * necessary, increments the use count on the page so that it cannot disappear - * under us and returns a reference to the page cache page). - * - * If read_cache_page() invokes ntfs_readpage() to load the page from disk, it - * sets PG_locked and clears PG_uptodate on the page. Once I/O has completed - * and the post-read mst fixups on each mft record in the page have been - * performed, the page gets PG_uptodate set and PG_locked cleared (this is done - * in our asynchronous I/O completion handler end_buffer_read_mft_async()). 
- * ntfs_map_page() waits for PG_locked to become clear and checks if - * PG_uptodate is set and returns an error code if not. This provides - * sufficient protection against races when reading/using the page. - * - * However there is the write mapping to think about. Doing the above described - * checking here will be fine, because when initiating the write we will set - * PG_locked and clear PG_uptodate making sure nobody is touching the page - * contents. Doing the locking this way means that the commit to disk code in - * the page cache code paths is automatically sufficiently locked with us as - * we will not touch a page that has been locked or is not uptodate. The only - * locking problem then is them locking the page while we are accessing it. - * - * So that code will end up having to own the mrec_lock of all mft - * records/inodes present in the page before I/O can proceed. In that case we - * wouldn't need to bother with PG_locked and PG_uptodate as nobody will be - * accessing anything without owning the mrec_lock mutex. But we do need to - * use them because of the read_cache_page() invocation and the code becomes so - * much simpler this way that it is well worth it. - * - * The mft record is now ours and we return a pointer to it. You need to check - * the returned pointer with IS_ERR() and if that is true, PTR_ERR() will return - * the error code. - * - * NOTE: Caller is responsible for setting the mft record dirty before calling - * unmap_mft_record(). This is obviously only necessary if the caller really - * modified the mft record... - * Q: Do we want to recycle one of the VFS inode state bits instead? - * A: No, the inode ones mean we want to change the mft record, not we want to - * write it out. - */ -MFT_RECORD *map_mft_record(ntfs_inode *ni) -{ - MFT_RECORD *m; - - ntfs_debug("Entering for mft_no 0x%lx.", ni->mft_no); - - /* Make sure the ntfs inode doesn't go away. */ - atomic_inc(&ni->count); - - /* Serialize access to this mft record. */ - mutex_lock(&ni->mrec_lock); - - m = map_mft_record_page(ni); - if (!IS_ERR(m)) - return m; - - mutex_unlock(&ni->mrec_lock); - atomic_dec(&ni->count); - ntfs_error(ni->vol->sb, "Failed with error code %lu.", -PTR_ERR(m)); - return m; -} - -/** - * unmap_mft_record_page - unmap the page in which a specific mft record resides - * @ni: ntfs inode whose mft record page to unmap - * - * This unmaps the page in which the mft record of the ntfs inode @ni is - * situated and returns. This is a NOOP if highmem is not configured. - * - * The unmap happens via ntfs_unmap_page() which in turn decrements the use - * count on the page thus releasing it from the pinned state. - * - * We do not actually unmap the page from memory of course, as that will be - * done by the page cache code itself when memory pressure increases or - * whatever. - */ -static inline void unmap_mft_record_page(ntfs_inode *ni) -{ - BUG_ON(!ni->page); - - // TODO: If dirty, blah... - ntfs_unmap_page(ni->page); - ni->page = NULL; - ni->page_ofs = 0; - return; -} - -/** - * unmap_mft_record - release a mapped mft record - * @ni: ntfs inode whose MFT record to unmap - * - * We release the page mapping and the mrec_lock mutex which unmaps the mft - * record and releases it for others to get hold of. We also release the ntfs - * inode by decrementing the ntfs inode reference count. - * - * NOTE: If caller has modified the mft record, it is imperative to set the mft - * record dirty BEFORE calling unmap_mft_record(). 
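
Taken together, the rules above imply a fixed calling protocol. A minimal caller sketch follows; the function below is hypothetical, but the three calls are the API described in these comments.

/*
 * Hypothetical caller illustrating the protocol described above: map
 * the record, modify it, mark it dirty while still mapped, then unmap,
 * which drops the page reference and the mrec_lock mutex.
 */
static int example_modify_mft_record(ntfs_inode *ni)
{
	MFT_RECORD *m = map_mft_record(ni);

	if (IS_ERR(m))
		return PTR_ERR(m);
	/* ... modify the mapped mft record here ... */
	mark_mft_record_dirty(ni);	/* must precede unmap_mft_record() */
	unmap_mft_record(ni);
	return 0;
}
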
- */ -void unmap_mft_record(ntfs_inode *ni) -{ - struct page *page = ni->page; - - BUG_ON(!page); - - ntfs_debug("Entering for mft_no 0x%lx.", ni->mft_no); - - unmap_mft_record_page(ni); - mutex_unlock(&ni->mrec_lock); - atomic_dec(&ni->count); - /* - * If pure ntfs_inode, i.e. no vfs inode attached, we leave it to - * ntfs_clear_extent_inode() in the extent inode case, and to the - * caller in the non-extent, yet pure ntfs inode case, to do the actual - * tear down of all structures and freeing of all allocated memory. - */ - return; -} - -/** - * map_extent_mft_record - load an extent inode and attach it to its base - * @base_ni: base ntfs inode - * @mref: mft reference of the extent inode to load - * @ntfs_ino: on successful return, pointer to the ntfs_inode structure - * - * Load the extent mft record @mref and attach it to its base inode @base_ni. - * Return the mapped extent mft record if IS_ERR(result) is false. Otherwise - * PTR_ERR(result) gives the negative error code. - * - * On successful return, @ntfs_ino contains a pointer to the ntfs_inode - * structure of the mapped extent inode. - */ -MFT_RECORD *map_extent_mft_record(ntfs_inode *base_ni, MFT_REF mref, - ntfs_inode **ntfs_ino) -{ - MFT_RECORD *m; - ntfs_inode *ni = NULL; - ntfs_inode **extent_nis = NULL; - int i; - unsigned long mft_no = MREF(mref); - u16 seq_no = MSEQNO(mref); - bool destroy_ni = false; - - ntfs_debug("Mapping extent mft record 0x%lx (base mft record 0x%lx).", - mft_no, base_ni->mft_no); - /* Make sure the base ntfs inode doesn't go away. */ - atomic_inc(&base_ni->count); - /* - * Check if this extent inode has already been added to the base inode, - * in which case just return it. If not found, add it to the base - * inode before returning it. - */ - mutex_lock(&base_ni->extent_lock); - if (base_ni->nr_extents > 0) { - extent_nis = base_ni->ext.extent_ntfs_inos; - for (i = 0; i < base_ni->nr_extents; i++) { - if (mft_no != extent_nis[i]->mft_no) - continue; - ni = extent_nis[i]; - /* Make sure the ntfs inode doesn't go away. */ - atomic_inc(&ni->count); - break; - } - } - if (likely(ni != NULL)) { - mutex_unlock(&base_ni->extent_lock); - atomic_dec(&base_ni->count); - /* We found the record; just have to map and return it. */ - m = map_mft_record(ni); - /* map_mft_record() has incremented this on success. */ - atomic_dec(&ni->count); - if (!IS_ERR(m)) { - /* Verify the sequence number. */ - if (likely(le16_to_cpu(m->sequence_number) == seq_no)) { - ntfs_debug("Done 1."); - *ntfs_ino = ni; - return m; - } - unmap_mft_record(ni); - ntfs_error(base_ni->vol->sb, "Found stale extent mft " - "reference! Corrupt filesystem. " - "Run chkdsk."); - return ERR_PTR(-EIO); - } -map_err_out: - ntfs_error(base_ni->vol->sb, "Failed to map extent " - "mft record, error code %ld.", -PTR_ERR(m)); - return m; - } - /* Record wasn't there. Get a new ntfs inode and initialize it. */ - ni = ntfs_new_extent_inode(base_ni->vol->sb, mft_no); - if (unlikely(!ni)) { - mutex_unlock(&base_ni->extent_lock); - atomic_dec(&base_ni->count); - return ERR_PTR(-ENOMEM); - } - ni->vol = base_ni->vol; - ni->seq_no = seq_no; - ni->nr_extents = -1; - ni->ext.base_ntfs_ino = base_ni; - /* Now map the record. */ - m = map_mft_record(ni); - if (IS_ERR(m)) { - mutex_unlock(&base_ni->extent_lock); - atomic_dec(&base_ni->count); - ntfs_clear_extent_inode(ni); - goto map_err_out; - } - /* Verify the sequence number if it is present. 
*/ - if (seq_no && (le16_to_cpu(m->sequence_number) != seq_no)) { - ntfs_error(base_ni->vol->sb, "Found stale extent mft " - "reference! Corrupt filesystem. Run chkdsk."); - destroy_ni = true; - m = ERR_PTR(-EIO); - goto unm_err_out; - } - /* Attach extent inode to base inode, reallocating memory if needed. */ - if (!(base_ni->nr_extents & 3)) { - ntfs_inode **tmp; - int new_size = (base_ni->nr_extents + 4) * sizeof(ntfs_inode *); - - tmp = kmalloc(new_size, GFP_NOFS); - if (unlikely(!tmp)) { - ntfs_error(base_ni->vol->sb, "Failed to allocate " - "internal buffer."); - destroy_ni = true; - m = ERR_PTR(-ENOMEM); - goto unm_err_out; - } - if (base_ni->nr_extents) { - BUG_ON(!base_ni->ext.extent_ntfs_inos); - memcpy(tmp, base_ni->ext.extent_ntfs_inos, new_size - - 4 * sizeof(ntfs_inode *)); - kfree(base_ni->ext.extent_ntfs_inos); - } - base_ni->ext.extent_ntfs_inos = tmp; - } - base_ni->ext.extent_ntfs_inos[base_ni->nr_extents++] = ni; - mutex_unlock(&base_ni->extent_lock); - atomic_dec(&base_ni->count); - ntfs_debug("Done 2."); - *ntfs_ino = ni; - return m; -unm_err_out: - unmap_mft_record(ni); - mutex_unlock(&base_ni->extent_lock); - atomic_dec(&base_ni->count); - /* - * If the extent inode was not attached to the base inode we need to - * release it or we will leak memory. - */ - if (destroy_ni) - ntfs_clear_extent_inode(ni); - return m; -} - -#ifdef NTFS_RW - -/** - * __mark_mft_record_dirty - set the mft record and the page containing it dirty - * @ni: ntfs inode describing the mapped mft record - * - * Internal function. Users should call mark_mft_record_dirty() instead. - * - * Set the mapped (extent) mft record of the (base or extent) ntfs inode @ni, - * as well as the page containing the mft record, dirty. Also, mark the base - * vfs inode dirty. This ensures that any changes to the mft record are - * written out to disk. - * - * NOTE: We only set I_DIRTY_DATASYNC (and not I_DIRTY_PAGES) - * on the base vfs inode, because even though file data may have been modified, - * it is dirty in the inode meta data rather than the data page cache of the - * inode, and thus there are no data pages that need writing out. Therefore, a - * full mark_inode_dirty() is overkill. A mark_inode_dirty_sync(), on the - * other hand, is not sufficient, because ->write_inode needs to be called even - * in case of fdatasync. This needs to happen or the file data would not - * necessarily hit the device synchronously, even though the vfs inode has the - * O_SYNC flag set. Also, I_DIRTY_DATASYNC simply "feels" better than just - * I_DIRTY_SYNC, since the file data has not actually hit the block device yet, - * which is not what I_DIRTY_SYNC on its own would suggest. - */ -void __mark_mft_record_dirty(ntfs_inode *ni) -{ - ntfs_inode *base_ni; - - ntfs_debug("Entering for inode 0x%lx.", ni->mft_no); - BUG_ON(NInoAttr(ni)); - mark_ntfs_record_dirty(ni->page, ni->page_ofs); - /* Determine the base vfs inode and mark it dirty, too. */ - mutex_lock(&ni->extent_lock); - if (likely(ni->nr_extents >= 0)) - base_ni = ni; - else - base_ni = ni->ext.base_ntfs_ino; - mutex_unlock(&ni->extent_lock); - __mark_inode_dirty(VFS_I(base_ni), I_DIRTY_DATASYNC); -} - -static const char *ntfs_please_email = "Please email " - "linux-ntfs-dev@lists.sourceforge.net and say that you saw " - "this message. 
Thank you."; - -/** - * ntfs_sync_mft_mirror_umount - synchronise an mft record to the mft mirror - * @vol: ntfs volume on which the mft record to synchronize resides - * @mft_no: mft record number of mft record to synchronize - * @m: mapped, mst protected (extent) mft record to synchronize - * - * Write the mapped, mst protected (extent) mft record @m with mft record - * number @mft_no to the mft mirror ($MFTMirr) of the ntfs volume @vol, - * bypassing the page cache and the $MFTMirr inode itself. - * - * This function is only for use at umount time when the mft mirror inode has - * already been disposed off. We BUG() if we are called while the mft mirror - * inode is still attached to the volume. - * - * On success return 0. On error return -errno. - * - * NOTE: This function is not implemented yet as I am not convinced it can - * actually be triggered considering the sequence of commits we do in super.c:: - * ntfs_put_super(). But just in case we provide this place holder as the - * alternative would be either to BUG() or to get a NULL pointer dereference - * and Oops. - */ -static int ntfs_sync_mft_mirror_umount(ntfs_volume *vol, - const unsigned long mft_no, MFT_RECORD *m) -{ - BUG_ON(vol->mftmirr_ino); - ntfs_error(vol->sb, "Umount time mft mirror syncing is not " - "implemented yet. %s", ntfs_please_email); - return -EOPNOTSUPP; -} - -/** - * ntfs_sync_mft_mirror - synchronize an mft record to the mft mirror - * @vol: ntfs volume on which the mft record to synchronize resides - * @mft_no: mft record number of mft record to synchronize - * @m: mapped, mst protected (extent) mft record to synchronize - * @sync: if true, wait for i/o completion - * - * Write the mapped, mst protected (extent) mft record @m with mft record - * number @mft_no to the mft mirror ($MFTMirr) of the ntfs volume @vol. - * - * On success return 0. On error return -errno and set the volume errors flag - * in the ntfs volume @vol. - * - * NOTE: We always perform synchronous i/o and ignore the @sync parameter. - * - * TODO: If @sync is false, want to do truly asynchronous i/o, i.e. just - * schedule i/o via ->writepage or do it via kntfsd or whatever. - */ -int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no, - MFT_RECORD *m, int sync) -{ - struct page *page; - unsigned int blocksize = vol->sb->s_blocksize; - int max_bhs = vol->mft_record_size / blocksize; - struct buffer_head *bhs[MAX_BHS]; - struct buffer_head *bh, *head; - u8 *kmirr; - runlist_element *rl; - unsigned int block_start, block_end, m_start, m_end, page_ofs; - int i_bhs, nr_bhs, err = 0; - unsigned char blocksize_bits = vol->sb->s_blocksize_bits; - - ntfs_debug("Entering for inode 0x%lx.", mft_no); - BUG_ON(!max_bhs); - if (WARN_ON(max_bhs > MAX_BHS)) - return -EINVAL; - if (unlikely(!vol->mftmirr_ino)) { - /* This could happen during umount... */ - err = ntfs_sync_mft_mirror_umount(vol, mft_no, m); - if (likely(!err)) - return err; - goto err_out; - } - /* Get the page containing the mirror copy of the mft record @m. */ - page = ntfs_map_page(vol->mftmirr_ino->i_mapping, mft_no >> - (PAGE_SHIFT - vol->mft_record_size_bits)); - if (IS_ERR(page)) { - ntfs_error(vol->sb, "Failed to map mft mirror page."); - err = PTR_ERR(page); - goto err_out; - } - lock_page(page); - BUG_ON(!PageUptodate(page)); - ClearPageUptodate(page); - /* Offset of the mft mirror record inside the page. */ - page_ofs = (mft_no << vol->mft_record_size_bits) & ~PAGE_MASK; - /* The address in the page of the mirror copy of the mft record @m. 
*/ - kmirr = page_address(page) + page_ofs; - /* Copy the mst protected mft record to the mirror. */ - memcpy(kmirr, m, vol->mft_record_size); - /* Create uptodate buffers if not present. */ - if (unlikely(!page_has_buffers(page))) { - struct buffer_head *tail; - - bh = head = alloc_page_buffers(page, blocksize, true); - do { - set_buffer_uptodate(bh); - tail = bh; - bh = bh->b_this_page; - } while (bh); - tail->b_this_page = head; - attach_page_private(page, head); - } - bh = head = page_buffers(page); - BUG_ON(!bh); - rl = NULL; - nr_bhs = 0; - block_start = 0; - m_start = kmirr - (u8*)page_address(page); - m_end = m_start + vol->mft_record_size; - do { - block_end = block_start + blocksize; - /* If the buffer is outside the mft record, skip it. */ - if (block_end <= m_start) - continue; - if (unlikely(block_start >= m_end)) - break; - /* Need to map the buffer if it is not mapped already. */ - if (unlikely(!buffer_mapped(bh))) { - VCN vcn; - LCN lcn; - unsigned int vcn_ofs; - - bh->b_bdev = vol->sb->s_bdev; - /* Obtain the vcn and offset of the current block. */ - vcn = ((VCN)mft_no << vol->mft_record_size_bits) + - (block_start - m_start); - vcn_ofs = vcn & vol->cluster_size_mask; - vcn >>= vol->cluster_size_bits; - if (!rl) { - down_read(&NTFS_I(vol->mftmirr_ino)-> - runlist.lock); - rl = NTFS_I(vol->mftmirr_ino)->runlist.rl; - /* - * $MFTMirr always has the whole of its runlist - * in memory. - */ - BUG_ON(!rl); - } - /* Seek to element containing target vcn. */ - while (rl->length && rl[1].vcn <= vcn) - rl++; - lcn = ntfs_rl_vcn_to_lcn(rl, vcn); - /* For $MFTMirr, only lcn >= 0 is a successful remap. */ - if (likely(lcn >= 0)) { - /* Setup buffer head to correct block. */ - bh->b_blocknr = ((lcn << - vol->cluster_size_bits) + - vcn_ofs) >> blocksize_bits; - set_buffer_mapped(bh); - } else { - bh->b_blocknr = -1; - ntfs_error(vol->sb, "Cannot write mft mirror " - "record 0x%lx because its " - "location on disk could not " - "be determined (error code " - "%lli).", mft_no, - (long long)lcn); - err = -EIO; - } - } - BUG_ON(!buffer_uptodate(bh)); - BUG_ON(!nr_bhs && (m_start != block_start)); - BUG_ON(nr_bhs >= max_bhs); - bhs[nr_bhs++] = bh; - BUG_ON((nr_bhs >= max_bhs) && (m_end != block_end)); - } while (block_start = block_end, (bh = bh->b_this_page) != head); - if (unlikely(rl)) - up_read(&NTFS_I(vol->mftmirr_ino)->runlist.lock); - if (likely(!err)) { - /* Lock buffers and start synchronous write i/o on them. */ - for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) { - struct buffer_head *tbh = bhs[i_bhs]; - - if (!trylock_buffer(tbh)) - BUG(); - BUG_ON(!buffer_uptodate(tbh)); - clear_buffer_dirty(tbh); - get_bh(tbh); - tbh->b_end_io = end_buffer_write_sync; - submit_bh(REQ_OP_WRITE, tbh); - } - /* Wait on i/o completion of buffers. */ - for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) { - struct buffer_head *tbh = bhs[i_bhs]; - - wait_on_buffer(tbh); - if (unlikely(!buffer_uptodate(tbh))) { - err = -EIO; - /* - * Set the buffer uptodate so the page and - * buffer states do not become out of sync. - */ - set_buffer_uptodate(tbh); - } - } - } else /* if (unlikely(err)) */ { - /* Clean the buffers. */ - for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) - clear_buffer_dirty(bhs[i_bhs]); - } - /* Current state: all buffers are clean, unlocked, and uptodate. */ - /* Remove the mst protection fixups again. 
*/ - post_write_mst_fixup((NTFS_RECORD*)kmirr); - flush_dcache_page(page); - SetPageUptodate(page); - unlock_page(page); - ntfs_unmap_page(page); - if (likely(!err)) { - ntfs_debug("Done."); - } else { - ntfs_error(vol->sb, "I/O error while writing mft mirror " - "record 0x%lx!", mft_no); -err_out: - ntfs_error(vol->sb, "Failed to synchronize $MFTMirr (error " - "code %i). Volume will be left marked dirty " - "on umount. Run ntfsfix on the partition " - "after umounting to correct this.", -err); - NVolSetErrors(vol); - } - return err; -} - -/** - * write_mft_record_nolock - write out a mapped (extent) mft record - * @ni: ntfs inode describing the mapped (extent) mft record - * @m: mapped (extent) mft record to write - * @sync: if true, wait for i/o completion - * - * Write the mapped (extent) mft record @m described by the (regular or extent) - * ntfs inode @ni to backing store. If the mft record @m has a counterpart in - * the mft mirror, that is also updated. - * - * We only write the mft record if the ntfs inode @ni is dirty and the first - * buffer belonging to its mft record is dirty, too. We ignore the dirty state - * of subsequent buffers because we could have raced with - * fs/ntfs/aops.c::mark_ntfs_record_dirty(). - * - * On success, clean the mft record and return 0. On error, leave the mft - * record dirty and return -errno. - * - * NOTE: We always perform synchronous i/o and ignore the @sync parameter. - * However, if the mft record has a counterpart in the mft mirror and @sync is - * true, we write the mft record, wait for i/o completion, and only then write - * the mft mirror copy. This ensures that if the system crashes either the mft - * or the mft mirror will contain a self-consistent mft record @m. If @sync is - * false on the other hand, we start i/o on both and then wait for completion - * on them. This provides a speedup but no longer guarantees that you will end - * up with a self-consistent mft record in the case of a crash but if you asked - * for asynchronous writing you probably do not care about that anyway. - * - * TODO: If @sync is false, want to do truly asynchronous i/o, i.e. just - * schedule i/o via ->writepage or do it via kntfsd or whatever. - */ -int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync) -{ - ntfs_volume *vol = ni->vol; - struct page *page = ni->page; - unsigned int blocksize = vol->sb->s_blocksize; - unsigned char blocksize_bits = vol->sb->s_blocksize_bits; - int max_bhs = vol->mft_record_size / blocksize; - struct buffer_head *bhs[MAX_BHS]; - struct buffer_head *bh, *head; - runlist_element *rl; - unsigned int block_start, block_end, m_start, m_end; - int i_bhs, nr_bhs, err = 0; - - ntfs_debug("Entering for inode 0x%lx.", ni->mft_no); - BUG_ON(NInoAttr(ni)); - BUG_ON(!max_bhs); - BUG_ON(!PageLocked(page)); - if (WARN_ON(max_bhs > MAX_BHS)) { - err = -EINVAL; - goto err_out; - } - /* - * If the ntfs_inode is clean no need to do anything. If it is dirty, - * mark it as clean now so that it can be redirtied later on if needed. - * There is no danger of races since the caller is holding the locks - * for the mft record @m and the page it is in. - */ - if (!NInoTestClearDirty(ni)) - goto done; - bh = head = page_buffers(page); - BUG_ON(!bh); - rl = NULL; - nr_bhs = 0; - block_start = 0; - m_start = ni->page_ofs; - m_end = m_start + vol->mft_record_size; - do { - block_end = block_start + blocksize; - /* If the buffer is outside the mft record, skip it. 
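
Both write paths map an unmapped buffer head to disk with the same arithmetic. Pulled out as a hypothetical helper for clarity:

/*
 * Hypothetical helper making the remap arithmetic used in the write
 * paths explicit: a logical cluster number plus the byte offset inside
 * that cluster yields the device block number stored in bh->b_blocknr.
 */
static inline sector_t example_lcn_to_blocknr(LCN lcn, unsigned int vcn_ofs,
		unsigned char cluster_size_bits,
		unsigned char blocksize_bits)
{
	return ((lcn << cluster_size_bits) + vcn_ofs) >> blocksize_bits;
}
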
*/ - if (block_end <= m_start) - continue; - if (unlikely(block_start >= m_end)) - break; - /* - * If this block is not the first one in the record, we ignore - * the buffer's dirty state because we could have raced with a - * parallel mark_ntfs_record_dirty(). - */ - if (block_start == m_start) { - /* This block is the first one in the record. */ - if (!buffer_dirty(bh)) { - BUG_ON(nr_bhs); - /* Clean records are not written out. */ - break; - } - } - /* Need to map the buffer if it is not mapped already. */ - if (unlikely(!buffer_mapped(bh))) { - VCN vcn; - LCN lcn; - unsigned int vcn_ofs; - - bh->b_bdev = vol->sb->s_bdev; - /* Obtain the vcn and offset of the current block. */ - vcn = ((VCN)ni->mft_no << vol->mft_record_size_bits) + - (block_start - m_start); - vcn_ofs = vcn & vol->cluster_size_mask; - vcn >>= vol->cluster_size_bits; - if (!rl) { - down_read(&NTFS_I(vol->mft_ino)->runlist.lock); - rl = NTFS_I(vol->mft_ino)->runlist.rl; - BUG_ON(!rl); - } - /* Seek to element containing target vcn. */ - while (rl->length && rl[1].vcn <= vcn) - rl++; - lcn = ntfs_rl_vcn_to_lcn(rl, vcn); - /* For $MFT, only lcn >= 0 is a successful remap. */ - if (likely(lcn >= 0)) { - /* Setup buffer head to correct block. */ - bh->b_blocknr = ((lcn << - vol->cluster_size_bits) + - vcn_ofs) >> blocksize_bits; - set_buffer_mapped(bh); - } else { - bh->b_blocknr = -1; - ntfs_error(vol->sb, "Cannot write mft record " - "0x%lx because its location " - "on disk could not be " - "determined (error code %lli).", - ni->mft_no, (long long)lcn); - err = -EIO; - } - } - BUG_ON(!buffer_uptodate(bh)); - BUG_ON(!nr_bhs && (m_start != block_start)); - BUG_ON(nr_bhs >= max_bhs); - bhs[nr_bhs++] = bh; - BUG_ON((nr_bhs >= max_bhs) && (m_end != block_end)); - } while (block_start = block_end, (bh = bh->b_this_page) != head); - if (unlikely(rl)) - up_read(&NTFS_I(vol->mft_ino)->runlist.lock); - if (!nr_bhs) - goto done; - if (unlikely(err)) - goto cleanup_out; - /* Apply the mst protection fixups. */ - err = pre_write_mst_fixup((NTFS_RECORD*)m, vol->mft_record_size); - if (err) { - ntfs_error(vol->sb, "Failed to apply mst fixups!"); - goto cleanup_out; - } - flush_dcache_mft_record_page(ni); - /* Lock buffers and start synchronous write i/o on them. */ - for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) { - struct buffer_head *tbh = bhs[i_bhs]; - - if (!trylock_buffer(tbh)) - BUG(); - BUG_ON(!buffer_uptodate(tbh)); - clear_buffer_dirty(tbh); - get_bh(tbh); - tbh->b_end_io = end_buffer_write_sync; - submit_bh(REQ_OP_WRITE, tbh); - } - /* Synchronize the mft mirror now if not @sync. */ - if (!sync && ni->mft_no < vol->mftmirr_size) - ntfs_sync_mft_mirror(vol, ni->mft_no, m, sync); - /* Wait on i/o completion of buffers. */ - for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) { - struct buffer_head *tbh = bhs[i_bhs]; - - wait_on_buffer(tbh); - if (unlikely(!buffer_uptodate(tbh))) { - err = -EIO; - /* - * Set the buffer uptodate so the page and buffer - * states do not become out of sync. - */ - if (PageUptodate(page)) - set_buffer_uptodate(tbh); - } - } - /* If @sync, now synchronize the mft mirror. */ - if (sync && ni->mft_no < vol->mftmirr_size) - ntfs_sync_mft_mirror(vol, ni->mft_no, m, sync); - /* Remove the mst protection fixups again. */ - post_write_mst_fixup((NTFS_RECORD*)m); - flush_dcache_mft_record_page(ni); - if (unlikely(err)) { - /* I/O error during writing. This is really bad! */ - ntfs_error(vol->sb, "I/O error while writing mft record " - "0x%lx! Marking base inode as bad. 
You " - "should unmount the volume and run chkdsk.", - ni->mft_no); - goto err_out; - } -done: - ntfs_debug("Done."); - return 0; -cleanup_out: - /* Clean the buffers. */ - for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) - clear_buffer_dirty(bhs[i_bhs]); -err_out: - /* - * Current state: all buffers are clean, unlocked, and uptodate. - * The caller should mark the base inode as bad so that no more i/o - * happens. ->clear_inode() will still be invoked so all extent inodes - * and other allocated memory will be freed. - */ - if (err == -ENOMEM) { - ntfs_error(vol->sb, "Not enough memory to write mft record. " - "Redirtying so the write is retried later."); - mark_mft_record_dirty(ni); - err = 0; - } else - NVolSetErrors(vol); - return err; -} - -/** - * ntfs_may_write_mft_record - check if an mft record may be written out - * @vol: [IN] ntfs volume on which the mft record to check resides - * @mft_no: [IN] mft record number of the mft record to check - * @m: [IN] mapped mft record to check - * @locked_ni: [OUT] caller has to unlock this ntfs inode if one is returned - * - * Check if the mapped (base or extent) mft record @m with mft record number - * @mft_no belonging to the ntfs volume @vol may be written out. If necessary - * and possible the ntfs inode of the mft record is locked and the base vfs - * inode is pinned. The locked ntfs inode is then returned in @locked_ni. The - * caller is responsible for unlocking the ntfs inode and unpinning the base - * vfs inode. - * - * Return 'true' if the mft record may be written out and 'false' if not. - * - * The caller has locked the page and cleared the uptodate flag on it which - * means that we can safely write out any dirty mft records that do not have - * their inodes in icache as determined by ilookup5() as anyone - * opening/creating such an inode would block when attempting to map the mft - * record in read_cache_page() until we are finished with the write out. - * - * Here is a description of the tests we perform: - * - * If the inode is found in icache we know the mft record must be a base mft - * record. If it is dirty, we do not write it and return 'false' as the vfs - * inode write paths will result in the access times being updated which would - * cause the base mft record to be redirtied and written out again. (We know - * the access time update will modify the base mft record because Windows - * chkdsk complains if the standard information attribute is not in the base - * mft record.) - * - * If the inode is in icache and not dirty, we attempt to lock the mft record - * and if we find the lock was already taken, it is not safe to write the mft - * record and we return 'false'. - * - * If we manage to obtain the lock we have exclusive access to the mft record, - * which also allows us safe writeout of the mft record. We then set - * @locked_ni to the locked ntfs inode and return 'true'. - * - * Note we cannot just lock the mft record and sleep while waiting for the lock - * because this would deadlock due to lock reversal (normally the mft record is - * locked before the page is locked but we already have the page locked here - * when we try to lock the mft record). - * - * If the inode is not in icache we need to perform further checks. - * - * If the mft record is not a FILE record or it is a base mft record, we can - * safely write it and return 'true'. - * - * We now know the mft record is an extent mft record. We check if the inode - * corresponding to its base mft record is in icache and obtain a reference to - * it if it is. 
If it is not, we can safely write it and return 'true'. - * - * We now have the base inode for the extent mft record. We check if it has an - * ntfs inode for the extent mft record attached and if not it is safe to write - * the extent mft record and we return 'true'. - * - * The ntfs inode for the extent mft record is attached to the base inode so we - * attempt to lock the extent mft record and if we find the lock was already - * taken, it is not safe to write the extent mft record and we return 'false'. - * - * If we manage to obtain the lock we have exclusive access to the extent mft - * record, which also allows us safe writeout of the extent mft record. We - * set the ntfs inode of the extent mft record clean and then set @locked_ni to - * the now locked ntfs inode and return 'true'. - * - * Note, the reason for actually writing dirty mft records here and not just - * relying on the vfs inode dirty code paths is that we can have mft records - * modified without them ever having actual inodes in memory. Also we can have - * dirty mft records with clean ntfs inodes in memory. None of the described - * cases would result in the dirty mft records being written out if we only - * relied on the vfs inode dirty code paths. And these cases can really occur - * during allocation of new mft records and in particular when the - * initialized_size of the $MFT/$DATA attribute is extended and the new space - * is initialized using ntfs_mft_record_format(). The clean inode can then - * appear if the mft record is reused for a new inode before it got written - * out. - */ -bool ntfs_may_write_mft_record(ntfs_volume *vol, const unsigned long mft_no, - const MFT_RECORD *m, ntfs_inode **locked_ni) -{ - struct super_block *sb = vol->sb; - struct inode *mft_vi = vol->mft_ino; - struct inode *vi; - ntfs_inode *ni, *eni, **extent_nis; - int i; - ntfs_attr na; - - ntfs_debug("Entering for inode 0x%lx.", mft_no); - /* - * Normally we do not return a locked inode so set @locked_ni to NULL. - */ - BUG_ON(!locked_ni); - *locked_ni = NULL; - /* - * Check if the inode corresponding to this mft record is in the VFS - * inode cache and obtain a reference to it if it is. - */ - ntfs_debug("Looking for inode 0x%lx in icache.", mft_no); - na.mft_no = mft_no; - na.name = NULL; - na.name_len = 0; - na.type = AT_UNUSED; - /* - * Optimize inode 0, i.e. $MFT itself, since we have it in memory and - * we get here for it rather often. - */ - if (!mft_no) { - /* Balance the below iput(). */ - vi = igrab(mft_vi); - BUG_ON(vi != mft_vi); - } else { - /* - * Have to use ilookup5_nowait() since ilookup5() waits for the - * inode lock which causes ntfs to deadlock when a concurrent - * inode write via the inode dirty code paths and the page - * dirty code path of the inode dirty code path when writing - * $MFT occurs. - */ - vi = ilookup5_nowait(sb, mft_no, ntfs_test_inode, &na); - } - if (vi) { - ntfs_debug("Base inode 0x%lx is in icache.", mft_no); - /* The inode is in icache. */ - ni = NTFS_I(vi); - /* Take a reference to the ntfs inode. */ - atomic_inc(&ni->count); - /* If the inode is dirty, do not write this record. */ - if (NInoDirty(ni)) { - ntfs_debug("Inode 0x%lx is dirty, do not write it.", - mft_no); - atomic_dec(&ni->count); - iput(vi); - return false; - } - ntfs_debug("Inode 0x%lx is not dirty.", mft_no); - /* The inode is not dirty, try to take the mft record lock. 
*/ - if (unlikely(!mutex_trylock(&ni->mrec_lock))) { - ntfs_debug("Mft record 0x%lx is already locked, do " - "not write it.", mft_no); - atomic_dec(&ni->count); - iput(vi); - return false; - } - ntfs_debug("Managed to lock mft record 0x%lx, write it.", - mft_no); - /* - * The write has to occur while we hold the mft record lock so - * return the locked ntfs inode. - */ - *locked_ni = ni; - return true; - } - ntfs_debug("Inode 0x%lx is not in icache.", mft_no); - /* The inode is not in icache. */ - /* Write the record if it is not a mft record (type "FILE"). */ - if (!ntfs_is_mft_record(m->magic)) { - ntfs_debug("Mft record 0x%lx is not a FILE record, write it.", - mft_no); - return true; - } - /* Write the mft record if it is a base inode. */ - if (!m->base_mft_record) { - ntfs_debug("Mft record 0x%lx is a base record, write it.", - mft_no); - return true; - } - /* - * This is an extent mft record. Check if the inode corresponding to - * its base mft record is in icache and obtain a reference to it if it - * is. - */ - na.mft_no = MREF_LE(m->base_mft_record); - ntfs_debug("Mft record 0x%lx is an extent record. Looking for base " - "inode 0x%lx in icache.", mft_no, na.mft_no); - if (!na.mft_no) { - /* Balance the below iput(). */ - vi = igrab(mft_vi); - BUG_ON(vi != mft_vi); - } else - vi = ilookup5_nowait(sb, na.mft_no, ntfs_test_inode, - &na); - if (!vi) { - /* - * The base inode is not in icache, write this extent mft - * record. - */ - ntfs_debug("Base inode 0x%lx is not in icache, write the " - "extent record.", na.mft_no); - return true; - } - ntfs_debug("Base inode 0x%lx is in icache.", na.mft_no); - /* - * The base inode is in icache. Check if it has the extent inode - * corresponding to this extent mft record attached. - */ - ni = NTFS_I(vi); - mutex_lock(&ni->extent_lock); - if (ni->nr_extents <= 0) { - /* - * The base inode has no attached extent inodes, write this - * extent mft record. - */ - mutex_unlock(&ni->extent_lock); - iput(vi); - ntfs_debug("Base inode 0x%lx has no attached extent inodes, " - "write the extent record.", na.mft_no); - return true; - } - /* Iterate over the attached extent inodes. */ - extent_nis = ni->ext.extent_ntfs_inos; - for (eni = NULL, i = 0; i < ni->nr_extents; ++i) { - if (mft_no == extent_nis[i]->mft_no) { - /* - * Found the extent inode corresponding to this extent - * mft record. - */ - eni = extent_nis[i]; - break; - } - } - /* - * If the extent inode was not attached to the base inode, write this - * extent mft record. - */ - if (!eni) { - mutex_unlock(&ni->extent_lock); - iput(vi); - ntfs_debug("Extent inode 0x%lx is not attached to its base " - "inode 0x%lx, write the extent record.", - mft_no, na.mft_no); - return true; - } - ntfs_debug("Extent inode 0x%lx is attached to its base inode 0x%lx.", - mft_no, na.mft_no); - /* Take a reference to the extent ntfs inode. */ - atomic_inc(&eni->count); - mutex_unlock(&ni->extent_lock); - /* - * Found the extent inode coresponding to this extent mft record. - * Try to take the mft record lock. - */ - if (unlikely(!mutex_trylock(&eni->mrec_lock))) { - atomic_dec(&eni->count); - iput(vi); - ntfs_debug("Extent mft record 0x%lx is already locked, do " - "not write it.", mft_no); - return false; - } - ntfs_debug("Managed to lock extent mft record 0x%lx, write it.", - mft_no); - if (NInoTestClearDirty(eni)) - ntfs_debug("Extent inode 0x%lx is dirty, marking it clean.", - mft_no); - /* - * The write has to occur while we hold the mft record lock so return - * the locked extent ntfs inode. 
- */ - *locked_ni = eni; - return true; -} - -static const char *es = " Leaving inconsistent metadata. Unmount and run " - "chkdsk."; - -/** - * ntfs_mft_bitmap_find_and_alloc_free_rec_nolock - see name - * @vol: volume on which to search for a free mft record - * @base_ni: open base inode if allocating an extent mft record or NULL - * - * Search for a free mft record in the mft bitmap attribute on the ntfs volume - * @vol. - * - * If @base_ni is NULL start the search at the default allocator position. - * - * If @base_ni is not NULL start the search at the mft record after the base - * mft record @base_ni. - * - * Return the free mft record on success and -errno on error. An error code of - * -ENOSPC means that there are no free mft records in the currently - * initialized mft bitmap. - * - * Locking: Caller must hold vol->mftbmp_lock for writing. - */ -static int ntfs_mft_bitmap_find_and_alloc_free_rec_nolock(ntfs_volume *vol, - ntfs_inode *base_ni) -{ - s64 pass_end, ll, data_pos, pass_start, ofs, bit; - unsigned long flags; - struct address_space *mftbmp_mapping; - u8 *buf, *byte; - struct page *page; - unsigned int page_ofs, size; - u8 pass, b; - - ntfs_debug("Searching for free mft record in the currently " - "initialized mft bitmap."); - mftbmp_mapping = vol->mftbmp_ino->i_mapping; - /* - * Set the end of the pass making sure we do not overflow the mft - * bitmap. - */ - read_lock_irqsave(&NTFS_I(vol->mft_ino)->size_lock, flags); - pass_end = NTFS_I(vol->mft_ino)->allocated_size >> - vol->mft_record_size_bits; - read_unlock_irqrestore(&NTFS_I(vol->mft_ino)->size_lock, flags); - read_lock_irqsave(&NTFS_I(vol->mftbmp_ino)->size_lock, flags); - ll = NTFS_I(vol->mftbmp_ino)->initialized_size << 3; - read_unlock_irqrestore(&NTFS_I(vol->mftbmp_ino)->size_lock, flags); - if (pass_end > ll) - pass_end = ll; - pass = 1; - if (!base_ni) - data_pos = vol->mft_data_pos; - else - data_pos = base_ni->mft_no + 1; - if (data_pos < 24) - data_pos = 24; - if (data_pos >= pass_end) { - data_pos = 24; - pass = 2; - /* This happens on a freshly formatted volume. */ - if (data_pos >= pass_end) - return -ENOSPC; - } - pass_start = data_pos; - ntfs_debug("Starting bitmap search: pass %u, pass_start 0x%llx, " - "pass_end 0x%llx, data_pos 0x%llx.", pass, - (long long)pass_start, (long long)pass_end, - (long long)data_pos); - /* Loop until a free mft record is found. */ - for (; pass <= 2;) { - /* Cap size to pass_end. */ - ofs = data_pos >> 3; - page_ofs = ofs & ~PAGE_MASK; - size = PAGE_SIZE - page_ofs; - ll = ((pass_end + 7) >> 3) - ofs; - if (size > ll) - size = ll; - size <<= 3; - /* - * If we are still within the active pass, search the next page - * for a zero bit. - */ - if (size) { - page = ntfs_map_page(mftbmp_mapping, - ofs >> PAGE_SHIFT); - if (IS_ERR(page)) { - ntfs_error(vol->sb, "Failed to read mft " - "bitmap, aborting."); - return PTR_ERR(page); - } - buf = (u8*)page_address(page) + page_ofs; - bit = data_pos & 7; - data_pos &= ~7ull; - ntfs_debug("Before inner for loop: size 0x%x, " - "data_pos 0x%llx, bit 0x%llx", size, - (long long)data_pos, (long long)bit); - for (; bit < size && data_pos + bit < pass_end; - bit &= ~7ull, bit += 8) { - byte = buf + (bit >> 3); - if (*byte == 0xff) - continue; - b = ffz((unsigned long)*byte); - if (b < 8 && b >= (bit & 7)) { - ll = data_pos + (bit & ~7ull) + b; - if (unlikely(ll > (1ll << 32))) { - ntfs_unmap_page(page); - return -ENOSPC; - } - *byte |= 1 << b; - flush_dcache_page(page); - set_page_dirty(page); - ntfs_unmap_page(page); - ntfs_debug("Done. 
(Found and " - "allocated mft record " - "0x%llx.)", - (long long)ll); - return ll; - } - } - ntfs_debug("After inner for loop: size 0x%x, " - "data_pos 0x%llx, bit 0x%llx", size, - (long long)data_pos, (long long)bit); - data_pos += size; - ntfs_unmap_page(page); - /* - * If the end of the pass has not been reached yet, - * continue searching the mft bitmap for a zero bit. - */ - if (data_pos < pass_end) - continue; - } - /* Do the next pass. */ - if (++pass == 2) { - /* - * Starting the second pass, in which we scan the first - * part of the zone which we omitted earlier. - */ - pass_end = pass_start; - data_pos = pass_start = 24; - ntfs_debug("pass %i, pass_start 0x%llx, pass_end " - "0x%llx.", pass, (long long)pass_start, - (long long)pass_end); - if (data_pos >= pass_end) - break; - } - } - /* No free mft records in currently initialized mft bitmap. */ - ntfs_debug("Done. (No free mft records left in currently initialized " - "mft bitmap.)"); - return -ENOSPC; -} - -/** - * ntfs_mft_bitmap_extend_allocation_nolock - extend mft bitmap by a cluster - * @vol: volume on which to extend the mft bitmap attribute - * - * Extend the mft bitmap attribute on the ntfs volume @vol by one cluster. - * - * Note: Only changes allocated_size, i.e. does not touch initialized_size or - * data_size. - * - * Return 0 on success and -errno on error. - * - * Locking: - Caller must hold vol->mftbmp_lock for writing. - * - This function takes NTFS_I(vol->mftbmp_ino)->runlist.lock for - * writing and releases it before returning. - * - This function takes vol->lcnbmp_lock for writing and releases it - * before returning. - */ -static int ntfs_mft_bitmap_extend_allocation_nolock(ntfs_volume *vol) -{ - LCN lcn; - s64 ll; - unsigned long flags; - struct page *page; - ntfs_inode *mft_ni, *mftbmp_ni; - runlist_element *rl, *rl2 = NULL; - ntfs_attr_search_ctx *ctx = NULL; - MFT_RECORD *mrec; - ATTR_RECORD *a = NULL; - int ret, mp_size; - u32 old_alen = 0; - u8 *b, tb; - struct { - u8 added_cluster:1; - u8 added_run:1; - u8 mp_rebuilt:1; - } status = { 0, 0, 0 }; - - ntfs_debug("Extending mft bitmap allocation."); - mft_ni = NTFS_I(vol->mft_ino); - mftbmp_ni = NTFS_I(vol->mftbmp_ino); - /* - * Determine the last lcn of the mft bitmap. The allocated size of the - * mft bitmap cannot be zero so we are ok to do this. - */ - down_write(&mftbmp_ni->runlist.lock); - read_lock_irqsave(&mftbmp_ni->size_lock, flags); - ll = mftbmp_ni->allocated_size; - read_unlock_irqrestore(&mftbmp_ni->size_lock, flags); - rl = ntfs_attr_find_vcn_nolock(mftbmp_ni, - (ll - 1) >> vol->cluster_size_bits, NULL); - if (IS_ERR(rl) || unlikely(!rl->length || rl->lcn < 0)) { - up_write(&mftbmp_ni->runlist.lock); - ntfs_error(vol->sb, "Failed to determine last allocated " - "cluster of mft bitmap attribute."); - if (!IS_ERR(rl)) - ret = -EIO; - else - ret = PTR_ERR(rl); - return ret; - } - lcn = rl->lcn + rl->length; - ntfs_debug("Last lcn of mft bitmap attribute is 0x%llx.", - (long long)lcn); - /* - * Attempt to get the cluster following the last allocated cluster by - * hand as it may be in the MFT zone so the allocator would not give it - * to us. 
- */ - ll = lcn >> 3; - page = ntfs_map_page(vol->lcnbmp_ino->i_mapping, - ll >> PAGE_SHIFT); - if (IS_ERR(page)) { - up_write(&mftbmp_ni->runlist.lock); - ntfs_error(vol->sb, "Failed to read from lcn bitmap."); - return PTR_ERR(page); - } - b = (u8*)page_address(page) + (ll & ~PAGE_MASK); - tb = 1 << (lcn & 7ull); - down_write(&vol->lcnbmp_lock); - if (*b != 0xff && !(*b & tb)) { - /* Next cluster is free, allocate it. */ - *b |= tb; - flush_dcache_page(page); - set_page_dirty(page); - up_write(&vol->lcnbmp_lock); - ntfs_unmap_page(page); - /* Update the mft bitmap runlist. */ - rl->length++; - rl[1].vcn++; - status.added_cluster = 1; - ntfs_debug("Appending one cluster to mft bitmap."); - } else { - up_write(&vol->lcnbmp_lock); - ntfs_unmap_page(page); - /* Allocate a cluster from the DATA_ZONE. */ - rl2 = ntfs_cluster_alloc(vol, rl[1].vcn, 1, lcn, DATA_ZONE, - true); - if (IS_ERR(rl2)) { - up_write(&mftbmp_ni->runlist.lock); - ntfs_error(vol->sb, "Failed to allocate a cluster for " - "the mft bitmap."); - return PTR_ERR(rl2); - } - rl = ntfs_runlists_merge(mftbmp_ni->runlist.rl, rl2); - if (IS_ERR(rl)) { - up_write(&mftbmp_ni->runlist.lock); - ntfs_error(vol->sb, "Failed to merge runlists for mft " - "bitmap."); - if (ntfs_cluster_free_from_rl(vol, rl2)) { - ntfs_error(vol->sb, "Failed to deallocate " - "allocated cluster.%s", es); - NVolSetErrors(vol); - } - ntfs_free(rl2); - return PTR_ERR(rl); - } - mftbmp_ni->runlist.rl = rl; - status.added_run = 1; - ntfs_debug("Adding one run to mft bitmap."); - /* Find the last run in the new runlist. */ - for (; rl[1].length; rl++) - ; - } - /* - * Update the attribute record as well. Note: @rl is the last - * (non-terminator) runlist element of mft bitmap. - */ - mrec = map_mft_record(mft_ni); - if (IS_ERR(mrec)) { - ntfs_error(vol->sb, "Failed to map mft record."); - ret = PTR_ERR(mrec); - goto undo_alloc; - } - ctx = ntfs_attr_get_search_ctx(mft_ni, mrec); - if (unlikely(!ctx)) { - ntfs_error(vol->sb, "Failed to get search context."); - ret = -ENOMEM; - goto undo_alloc; - } - ret = ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name, - mftbmp_ni->name_len, CASE_SENSITIVE, rl[1].vcn, NULL, - 0, ctx); - if (unlikely(ret)) { - ntfs_error(vol->sb, "Failed to find last attribute extent of " - "mft bitmap attribute."); - if (ret == -ENOENT) - ret = -EIO; - goto undo_alloc; - } - a = ctx->attr; - ll = sle64_to_cpu(a->data.non_resident.lowest_vcn); - /* Search back for the previous last allocated cluster of mft bitmap. */ - for (rl2 = rl; rl2 > mftbmp_ni->runlist.rl; rl2--) { - if (ll >= rl2->vcn) - break; - } - BUG_ON(ll < rl2->vcn); - BUG_ON(ll >= rl2->vcn + rl2->length); - /* Get the size for the new mapping pairs array for this extent. */ - mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll, -1); - if (unlikely(mp_size <= 0)) { - ntfs_error(vol->sb, "Get size for mapping pairs failed for " - "mft bitmap attribute extent."); - ret = mp_size; - if (!ret) - ret = -EIO; - goto undo_alloc; - } - /* Expand the attribute record if necessary. */ - old_alen = le32_to_cpu(a->length); - ret = ntfs_attr_record_resize(ctx->mrec, a, mp_size + - le16_to_cpu(a->data.non_resident.mapping_pairs_offset)); - if (unlikely(ret)) { - if (ret != -ENOSPC) { - ntfs_error(vol->sb, "Failed to resize attribute " - "record for mft bitmap attribute."); - goto undo_alloc; - } - // TODO: Deal with this by moving this extent to a new mft - // record or by starting a new extent in a new mft record or by - // moving other attributes out of this mft record. 
- // Note: It will need to be a special mft record and if none of - // those are available it gets rather complicated... - ntfs_error(vol->sb, "Not enough space in this mft record to " - "accommodate extended mft bitmap attribute " - "extent. Cannot handle this yet."); - ret = -EOPNOTSUPP; - goto undo_alloc; - } - status.mp_rebuilt = 1; - /* Generate the mapping pairs array directly into the attr record. */ - ret = ntfs_mapping_pairs_build(vol, (u8*)a + - le16_to_cpu(a->data.non_resident.mapping_pairs_offset), - mp_size, rl2, ll, -1, NULL); - if (unlikely(ret)) { - ntfs_error(vol->sb, "Failed to build mapping pairs array for " - "mft bitmap attribute."); - goto undo_alloc; - } - /* Update the highest_vcn. */ - a->data.non_resident.highest_vcn = cpu_to_sle64(rl[1].vcn - 1); - /* - * We now have extended the mft bitmap allocated_size by one cluster. - * Reflect this in the ntfs_inode structure and the attribute record. - */ - if (a->data.non_resident.lowest_vcn) { - /* - * We are not in the first attribute extent, switch to it, but - * first ensure the changes will make it to disk later. - */ - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - ntfs_attr_reinit_search_ctx(ctx); - ret = ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name, - mftbmp_ni->name_len, CASE_SENSITIVE, 0, NULL, - 0, ctx); - if (unlikely(ret)) { - ntfs_error(vol->sb, "Failed to find first attribute " - "extent of mft bitmap attribute."); - goto restore_undo_alloc; - } - a = ctx->attr; - } - write_lock_irqsave(&mftbmp_ni->size_lock, flags); - mftbmp_ni->allocated_size += vol->cluster_size; - a->data.non_resident.allocated_size = - cpu_to_sle64(mftbmp_ni->allocated_size); - write_unlock_irqrestore(&mftbmp_ni->size_lock, flags); - /* Ensure the changes make it to disk. */ - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(mft_ni); - up_write(&mftbmp_ni->runlist.lock); - ntfs_debug("Done."); - return 0; -restore_undo_alloc: - ntfs_attr_reinit_search_ctx(ctx); - if (ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name, - mftbmp_ni->name_len, CASE_SENSITIVE, rl[1].vcn, NULL, - 0, ctx)) { - ntfs_error(vol->sb, "Failed to find last attribute extent of " - "mft bitmap attribute.%s", es); - write_lock_irqsave(&mftbmp_ni->size_lock, flags); - mftbmp_ni->allocated_size += vol->cluster_size; - write_unlock_irqrestore(&mftbmp_ni->size_lock, flags); - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(mft_ni); - up_write(&mftbmp_ni->runlist.lock); - /* - * The only thing that is now wrong is ->allocated_size of the - * base attribute extent which chkdsk should be able to fix. - */ - NVolSetErrors(vol); - return ret; - } - a = ctx->attr; - a->data.non_resident.highest_vcn = cpu_to_sle64(rl[1].vcn - 2); -undo_alloc: - if (status.added_cluster) { - /* Truncate the last run in the runlist by one cluster. */ - rl->length--; - rl[1].vcn--; - } else if (status.added_run) { - lcn = rl->lcn; - /* Remove the last run from the runlist. */ - rl->lcn = rl[1].lcn; - rl->length = 0; - } - /* Deallocate the cluster. 
*/
-	down_write(&vol->lcnbmp_lock);
-	if (ntfs_bitmap_clear_bit(vol->lcnbmp_ino, lcn)) {
-		ntfs_error(vol->sb, "Failed to free allocated cluster.%s", es);
-		NVolSetErrors(vol);
-	}
-	up_write(&vol->lcnbmp_lock);
-	if (status.mp_rebuilt) {
-		if (ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu(
-				a->data.non_resident.mapping_pairs_offset),
-				old_alen - le16_to_cpu(
-				a->data.non_resident.mapping_pairs_offset),
-				rl2, ll, -1, NULL)) {
-			ntfs_error(vol->sb, "Failed to restore mapping pairs "
-					"array.%s", es);
-			NVolSetErrors(vol);
-		}
-		if (ntfs_attr_record_resize(ctx->mrec, a, old_alen)) {
-			ntfs_error(vol->sb, "Failed to restore attribute "
-					"record.%s", es);
-			NVolSetErrors(vol);
-		}
-		flush_dcache_mft_record_page(ctx->ntfs_ino);
-		mark_mft_record_dirty(ctx->ntfs_ino);
-	}
-	if (ctx)
-		ntfs_attr_put_search_ctx(ctx);
-	if (!IS_ERR(mrec))
-		unmap_mft_record(mft_ni);
-	up_write(&mftbmp_ni->runlist.lock);
-	return ret;
-}
-
-/**
- * ntfs_mft_bitmap_extend_initialized_nolock - extend mftbmp initialized data
- * @vol:	volume on which to extend the mft bitmap attribute
- *
- * Extend the initialized portion of the mft bitmap attribute on the ntfs
- * volume @vol by 8 bytes.
- *
- * Note: Only changes initialized_size and data_size, i.e. requires that
- * allocated_size is big enough to fit the new initialized_size.
- *
- * Return 0 on success and -errno on error.
- *
- * Locking: Caller must hold vol->mftbmp_lock for writing.
- */
-static int ntfs_mft_bitmap_extend_initialized_nolock(ntfs_volume *vol)
-{
-	s64 old_data_size, old_initialized_size;
-	unsigned long flags;
-	struct inode *mftbmp_vi;
-	ntfs_inode *mft_ni, *mftbmp_ni;
-	ntfs_attr_search_ctx *ctx;
-	MFT_RECORD *mrec;
-	ATTR_RECORD *a;
-	int ret;
-
-	ntfs_debug("Extending mft bitmap initialized (and data) size.");
-	mft_ni = NTFS_I(vol->mft_ino);
-	mftbmp_vi = vol->mftbmp_ino;
-	mftbmp_ni = NTFS_I(mftbmp_vi);
-	/* Get the attribute record. */
-	mrec = map_mft_record(mft_ni);
-	if (IS_ERR(mrec)) {
-		ntfs_error(vol->sb, "Failed to map mft record.");
-		return PTR_ERR(mrec);
-	}
-	ctx = ntfs_attr_get_search_ctx(mft_ni, mrec);
-	if (unlikely(!ctx)) {
-		ntfs_error(vol->sb, "Failed to get search context.");
-		ret = -ENOMEM;
-		goto unm_err_out;
-	}
-	ret = ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
-			mftbmp_ni->name_len, CASE_SENSITIVE, 0, NULL, 0, ctx);
-	if (unlikely(ret)) {
-		ntfs_error(vol->sb, "Failed to find first attribute extent of "
-				"mft bitmap attribute.");
-		if (ret == -ENOENT)
-			ret = -EIO;
-		goto put_err_out;
-	}
-	a = ctx->attr;
-	write_lock_irqsave(&mftbmp_ni->size_lock, flags);
-	old_data_size = i_size_read(mftbmp_vi);
-	old_initialized_size = mftbmp_ni->initialized_size;
-	/*
-	 * We can simply update the initialized_size before filling the space
-	 * with zeroes because the caller is holding the mft bitmap lock for
-	 * writing which ensures that no one else is trying to access the data.
-	 */
-	mftbmp_ni->initialized_size += 8;
-	a->data.non_resident.initialized_size =
-			cpu_to_sle64(mftbmp_ni->initialized_size);
-	if (mftbmp_ni->initialized_size > old_data_size) {
-		i_size_write(mftbmp_vi, mftbmp_ni->initialized_size);
-		a->data.non_resident.data_size =
-				cpu_to_sle64(mftbmp_ni->initialized_size);
-	}
-	write_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
-	/* Ensure the changes make it to disk. */
-	flush_dcache_mft_record_page(ctx->ntfs_ino);
-	mark_mft_record_dirty(ctx->ntfs_ino);
-	ntfs_attr_put_search_ctx(ctx);
-	unmap_mft_record(mft_ni);
-	/* Initialize the mft bitmap attribute value with zeroes.
-	ret = ntfs_attr_set(mftbmp_ni, old_initialized_size, 8, 0);
-	if (likely(!ret)) {
-		ntfs_debug("Done.  (Wrote eight initialized bytes to mft "
-				"bitmap.)");
-		return 0;
-	}
-	ntfs_error(vol->sb, "Failed to write to mft bitmap.");
-	/* Try to recover from the error. */
-	mrec = map_mft_record(mft_ni);
-	if (IS_ERR(mrec)) {
-		ntfs_error(vol->sb, "Failed to map mft record.%s", es);
-		NVolSetErrors(vol);
-		return ret;
-	}
-	ctx = ntfs_attr_get_search_ctx(mft_ni, mrec);
-	if (unlikely(!ctx)) {
-		ntfs_error(vol->sb, "Failed to get search context.%s", es);
-		NVolSetErrors(vol);
-		goto unm_err_out;
-	}
-	if (ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
-			mftbmp_ni->name_len, CASE_SENSITIVE, 0, NULL, 0, ctx)) {
-		ntfs_error(vol->sb, "Failed to find first attribute extent of "
-				"mft bitmap attribute.%s", es);
-		NVolSetErrors(vol);
-put_err_out:
-		ntfs_attr_put_search_ctx(ctx);
-unm_err_out:
-		unmap_mft_record(mft_ni);
-		goto err_out;
-	}
-	a = ctx->attr;
-	write_lock_irqsave(&mftbmp_ni->size_lock, flags);
-	mftbmp_ni->initialized_size = old_initialized_size;
-	a->data.non_resident.initialized_size =
-			cpu_to_sle64(old_initialized_size);
-	if (i_size_read(mftbmp_vi) != old_data_size) {
-		i_size_write(mftbmp_vi, old_data_size);
-		a->data.non_resident.data_size = cpu_to_sle64(old_data_size);
-	}
-	write_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
-	flush_dcache_mft_record_page(ctx->ntfs_ino);
-	mark_mft_record_dirty(ctx->ntfs_ino);
-	ntfs_attr_put_search_ctx(ctx);
-	unmap_mft_record(mft_ni);
-#ifdef DEBUG
-	read_lock_irqsave(&mftbmp_ni->size_lock, flags);
-	ntfs_debug("Restored status of mftbmp: allocated_size 0x%llx, "
-			"data_size 0x%llx, initialized_size 0x%llx.",
-			(long long)mftbmp_ni->allocated_size,
-			(long long)i_size_read(mftbmp_vi),
-			(long long)mftbmp_ni->initialized_size);
-	read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
-#endif /* DEBUG */
-err_out:
-	return ret;
-}
-
-/**
- * ntfs_mft_data_extend_allocation_nolock - extend mft data attribute
- * @vol: volume on which to extend the mft data attribute
- *
- * Extend the mft data attribute on the ntfs volume @vol by 16 mft records
- * worth of clusters or, if there is not enough space for this, by one mft
- * record worth of clusters.
- *
- * Note: Only changes allocated_size, i.e. does not touch initialized_size or
- * data_size.
- *
- * Return 0 on success and -errno on error.
- *
- * Locking: - Caller must hold vol->mftbmp_lock for writing.
- *	    - This function takes NTFS_I(vol->mft_ino)->runlist.lock for
- *	      writing and releases it before returning.
- *	    - This function calls functions which take vol->lcnbmp_lock for
- *	      writing and release it before returning.
- */
-static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol)
-{
-	LCN lcn;
-	VCN old_last_vcn;
-	s64 min_nr, nr, ll;
-	unsigned long flags;
-	ntfs_inode *mft_ni;
-	runlist_element *rl, *rl2;
-	ntfs_attr_search_ctx *ctx = NULL;
-	MFT_RECORD *mrec;
-	ATTR_RECORD *a = NULL;
-	int ret, mp_size;
-	u32 old_alen = 0;
-	bool mp_rebuilt = false;
-
-	ntfs_debug("Extending mft data allocation.");
-	mft_ni = NTFS_I(vol->mft_ino);
-	/*
-	 * Determine the preferred allocation location, i.e. the last lcn of
-	 * the mft data attribute.  The allocated size of the mft data
-	 * attribute cannot be zero so we are ok to do this.
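The "16 records or, failing that, one record" sizing described above reduces
to two shifts on the volume geometry. A minimal standalone sketch, not part
of the removed driver; the 1 KiB record and 4 KiB cluster values are
assumptions chosen for illustration:

	#include <stdio.h>

	int main(void)
	{
		long long mft_record_size = 1024;	/* assumed: 1 KiB mft records */
		int cluster_size_bits = 12;		/* assumed: 4 KiB clusters */

		/* Minimum allocation: one mft record worth of clusters. */
		long long min_nr = mft_record_size >> cluster_size_bits;
		if (!min_nr)
			min_nr = 1;
		/* Preferred allocation: 16 mft records worth of clusters. */
		long long nr = (mft_record_size << 4) >> cluster_size_bits;
		if (!nr)
			nr = min_nr;
		printf("min_nr=%lld nr=%lld\n", min_nr, nr);	/* min_nr=1 nr=4 */
		return 0;
	}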
- */ - down_write(&mft_ni->runlist.lock); - read_lock_irqsave(&mft_ni->size_lock, flags); - ll = mft_ni->allocated_size; - read_unlock_irqrestore(&mft_ni->size_lock, flags); - rl = ntfs_attr_find_vcn_nolock(mft_ni, - (ll - 1) >> vol->cluster_size_bits, NULL); - if (IS_ERR(rl) || unlikely(!rl->length || rl->lcn < 0)) { - up_write(&mft_ni->runlist.lock); - ntfs_error(vol->sb, "Failed to determine last allocated " - "cluster of mft data attribute."); - if (!IS_ERR(rl)) - ret = -EIO; - else - ret = PTR_ERR(rl); - return ret; - } - lcn = rl->lcn + rl->length; - ntfs_debug("Last lcn of mft data attribute is 0x%llx.", (long long)lcn); - /* Minimum allocation is one mft record worth of clusters. */ - min_nr = vol->mft_record_size >> vol->cluster_size_bits; - if (!min_nr) - min_nr = 1; - /* Want to allocate 16 mft records worth of clusters. */ - nr = vol->mft_record_size << 4 >> vol->cluster_size_bits; - if (!nr) - nr = min_nr; - /* Ensure we do not go above 2^32-1 mft records. */ - read_lock_irqsave(&mft_ni->size_lock, flags); - ll = mft_ni->allocated_size; - read_unlock_irqrestore(&mft_ni->size_lock, flags); - if (unlikely((ll + (nr << vol->cluster_size_bits)) >> - vol->mft_record_size_bits >= (1ll << 32))) { - nr = min_nr; - if (unlikely((ll + (nr << vol->cluster_size_bits)) >> - vol->mft_record_size_bits >= (1ll << 32))) { - ntfs_warning(vol->sb, "Cannot allocate mft record " - "because the maximum number of inodes " - "(2^32) has already been reached."); - up_write(&mft_ni->runlist.lock); - return -ENOSPC; - } - } - ntfs_debug("Trying mft data allocation with %s cluster count %lli.", - nr > min_nr ? "default" : "minimal", (long long)nr); - old_last_vcn = rl[1].vcn; - do { - rl2 = ntfs_cluster_alloc(vol, old_last_vcn, nr, lcn, MFT_ZONE, - true); - if (!IS_ERR(rl2)) - break; - if (PTR_ERR(rl2) != -ENOSPC || nr == min_nr) { - ntfs_error(vol->sb, "Failed to allocate the minimal " - "number of clusters (%lli) for the " - "mft data attribute.", (long long)nr); - up_write(&mft_ni->runlist.lock); - return PTR_ERR(rl2); - } - /* - * There is not enough space to do the allocation, but there - * might be enough space to do a minimal allocation so try that - * before failing. - */ - nr = min_nr; - ntfs_debug("Retrying mft data allocation with minimal cluster " - "count %lli.", (long long)nr); - } while (1); - rl = ntfs_runlists_merge(mft_ni->runlist.rl, rl2); - if (IS_ERR(rl)) { - up_write(&mft_ni->runlist.lock); - ntfs_error(vol->sb, "Failed to merge runlists for mft data " - "attribute."); - if (ntfs_cluster_free_from_rl(vol, rl2)) { - ntfs_error(vol->sb, "Failed to deallocate clusters " - "from the mft data attribute.%s", es); - NVolSetErrors(vol); - } - ntfs_free(rl2); - return PTR_ERR(rl); - } - mft_ni->runlist.rl = rl; - ntfs_debug("Allocated %lli clusters.", (long long)nr); - /* Find the last run in the new runlist. */ - for (; rl[1].length; rl++) - ; - /* Update the attribute record as well. 
*/
-	mrec = map_mft_record(mft_ni);
-	if (IS_ERR(mrec)) {
-		ntfs_error(vol->sb, "Failed to map mft record.");
-		ret = PTR_ERR(mrec);
-		goto undo_alloc;
-	}
-	ctx = ntfs_attr_get_search_ctx(mft_ni, mrec);
-	if (unlikely(!ctx)) {
-		ntfs_error(vol->sb, "Failed to get search context.");
-		ret = -ENOMEM;
-		goto undo_alloc;
-	}
-	ret = ntfs_attr_lookup(mft_ni->type, mft_ni->name, mft_ni->name_len,
-			CASE_SENSITIVE, rl[1].vcn, NULL, 0, ctx);
-	if (unlikely(ret)) {
-		ntfs_error(vol->sb, "Failed to find last attribute extent of "
-				"mft data attribute.");
-		if (ret == -ENOENT)
-			ret = -EIO;
-		goto undo_alloc;
-	}
-	a = ctx->attr;
-	ll = sle64_to_cpu(a->data.non_resident.lowest_vcn);
-	/* Search back for the previous last allocated cluster of mft data. */
-	for (rl2 = rl; rl2 > mft_ni->runlist.rl; rl2--) {
-		if (ll >= rl2->vcn)
-			break;
-	}
-	BUG_ON(ll < rl2->vcn);
-	BUG_ON(ll >= rl2->vcn + rl2->length);
-	/* Get the size for the new mapping pairs array for this extent. */
-	mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll, -1);
-	if (unlikely(mp_size <= 0)) {
-		ntfs_error(vol->sb, "Get size for mapping pairs failed for "
-				"mft data attribute extent.");
-		ret = mp_size;
-		if (!ret)
-			ret = -EIO;
-		goto undo_alloc;
-	}
-	/* Expand the attribute record if necessary. */
-	old_alen = le32_to_cpu(a->length);
-	ret = ntfs_attr_record_resize(ctx->mrec, a, mp_size +
-			le16_to_cpu(a->data.non_resident.mapping_pairs_offset));
-	if (unlikely(ret)) {
-		if (ret != -ENOSPC) {
-			ntfs_error(vol->sb, "Failed to resize attribute "
-					"record for mft data attribute.");
-			goto undo_alloc;
-		}
-		// TODO: Deal with this by moving this extent to a new mft
-		// record or by starting a new extent in a new mft record or by
-		// moving other attributes out of this mft record.
-		// Note: Use the special reserved mft records and ensure that
-		// this extent is not required to find the mft record in
-		// question.  If no free special records are left we would need
-		// to move an existing record away, insert ours in its place,
-		// and then place the moved record into the newly allocated
-		// space and we would then need to update all references to
-		// this mft record appropriately.  This is rather complicated...
-		ntfs_error(vol->sb, "Not enough space in this mft record to "
-				"accommodate extended mft data attribute "
-				"extent.  Cannot handle this yet.");
-		ret = -EOPNOTSUPP;
-		goto undo_alloc;
-	}
-	mp_rebuilt = true;
-	/* Generate the mapping pairs array directly into the attr record. */
-	ret = ntfs_mapping_pairs_build(vol, (u8*)a +
-			le16_to_cpu(a->data.non_resident.mapping_pairs_offset),
-			mp_size, rl2, ll, -1, NULL);
-	if (unlikely(ret)) {
-		ntfs_error(vol->sb, "Failed to build mapping pairs array of "
-				"mft data attribute.");
-		goto undo_alloc;
-	}
-	/* Update the highest_vcn. */
-	a->data.non_resident.highest_vcn = cpu_to_sle64(rl[1].vcn - 1);
-	/*
-	 * We now have extended the mft data allocated_size by nr clusters.
-	 * Reflect this in the ntfs_inode structure and the attribute record.
-	 * @rl is the last (non-terminator) runlist element of mft data
-	 * attribute.
-	 */
-	if (a->data.non_resident.lowest_vcn) {
-		/*
-		 * We are not in the first attribute extent, switch to it, but
-		 * first ensure the changes will make it to disk later.
- */ - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - ntfs_attr_reinit_search_ctx(ctx); - ret = ntfs_attr_lookup(mft_ni->type, mft_ni->name, - mft_ni->name_len, CASE_SENSITIVE, 0, NULL, 0, - ctx); - if (unlikely(ret)) { - ntfs_error(vol->sb, "Failed to find first attribute " - "extent of mft data attribute."); - goto restore_undo_alloc; - } - a = ctx->attr; - } - write_lock_irqsave(&mft_ni->size_lock, flags); - mft_ni->allocated_size += nr << vol->cluster_size_bits; - a->data.non_resident.allocated_size = - cpu_to_sle64(mft_ni->allocated_size); - write_unlock_irqrestore(&mft_ni->size_lock, flags); - /* Ensure the changes make it to disk. */ - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(mft_ni); - up_write(&mft_ni->runlist.lock); - ntfs_debug("Done."); - return 0; -restore_undo_alloc: - ntfs_attr_reinit_search_ctx(ctx); - if (ntfs_attr_lookup(mft_ni->type, mft_ni->name, mft_ni->name_len, - CASE_SENSITIVE, rl[1].vcn, NULL, 0, ctx)) { - ntfs_error(vol->sb, "Failed to find last attribute extent of " - "mft data attribute.%s", es); - write_lock_irqsave(&mft_ni->size_lock, flags); - mft_ni->allocated_size += nr << vol->cluster_size_bits; - write_unlock_irqrestore(&mft_ni->size_lock, flags); - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(mft_ni); - up_write(&mft_ni->runlist.lock); - /* - * The only thing that is now wrong is ->allocated_size of the - * base attribute extent which chkdsk should be able to fix. - */ - NVolSetErrors(vol); - return ret; - } - ctx->attr->data.non_resident.highest_vcn = - cpu_to_sle64(old_last_vcn - 1); -undo_alloc: - if (ntfs_cluster_free(mft_ni, old_last_vcn, -1, ctx) < 0) { - ntfs_error(vol->sb, "Failed to free clusters from mft data " - "attribute.%s", es); - NVolSetErrors(vol); - } - - if (ntfs_rl_truncate_nolock(vol, &mft_ni->runlist, old_last_vcn)) { - ntfs_error(vol->sb, "Failed to truncate mft data attribute " - "runlist.%s", es); - NVolSetErrors(vol); - } - if (ctx) { - a = ctx->attr; - if (mp_rebuilt && !IS_ERR(ctx->mrec)) { - if (ntfs_mapping_pairs_build(vol, (u8 *)a + le16_to_cpu( - a->data.non_resident.mapping_pairs_offset), - old_alen - le16_to_cpu( - a->data.non_resident.mapping_pairs_offset), - rl2, ll, -1, NULL)) { - ntfs_error(vol->sb, "Failed to restore mapping pairs " - "array.%s", es); - NVolSetErrors(vol); - } - if (ntfs_attr_record_resize(ctx->mrec, a, old_alen)) { - ntfs_error(vol->sb, "Failed to restore attribute " - "record.%s", es); - NVolSetErrors(vol); - } - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - } else if (IS_ERR(ctx->mrec)) { - ntfs_error(vol->sb, "Failed to restore attribute search " - "context.%s", es); - NVolSetErrors(vol); - } - ntfs_attr_put_search_ctx(ctx); - } - if (!IS_ERR(mrec)) - unmap_mft_record(mft_ni); - up_write(&mft_ni->runlist.lock); - return ret; -} - -/** - * ntfs_mft_record_layout - layout an mft record into a memory buffer - * @vol: volume to which the mft record will belong - * @mft_no: mft reference specifying the mft record number - * @m: destination buffer of size >= @vol->mft_record_size bytes - * - * Layout an empty, unused mft record with the mft record number @mft_no into - * the buffer @m. The volume @vol is needed because the mft record structure - * was modified in NTFS 3.1 so we need to know which volume version this mft - * record will be used on. - * - * Return 0 on success and -errno on error. 
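Before the function itself, the header arithmetic it performs can be seen in
isolation. A sketch under assumed sizes (a 48-byte MFT_RECORD header, 1 KiB
records, 512-byte NTFS blocks; all three values are assumptions for the
example, not taken from the patch):

	#include <stdio.h>

	int main(void)
	{
		unsigned mft_record_size = 1024;	/* assumed record size */
		unsigned ntfs_block_size = 512;		/* assumed NTFS block size */
		unsigned hdr = 48;			/* assumed sizeof(MFT_RECORD) */

		unsigned usa_ofs = (hdr + 1) & ~1u;		/* 2-byte aligned */
		unsigned usa_count = mft_record_size / ntfs_block_size + 1;
		unsigned attrs_offset = (usa_ofs + (usa_count << 1) + 7) & ~7u;
		unsigned bytes_in_use = attrs_offset + 8;	/* + AT_END marker */

		/* Prints: usa_ofs=48 usa_count=3 attrs_offset=56 bytes_in_use=64 */
		printf("usa_ofs=%u usa_count=%u attrs_offset=%u bytes_in_use=%u\n",
		       usa_ofs, usa_count, attrs_offset, bytes_in_use);
		return 0;
	}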
- */
-static int ntfs_mft_record_layout(const ntfs_volume *vol, const s64 mft_no,
-		MFT_RECORD *m)
-{
-	ATTR_RECORD *a;
-
-	ntfs_debug("Entering for mft record 0x%llx.", (long long)mft_no);
-	if (mft_no >= (1ll << 32)) {
-		ntfs_error(vol->sb, "Mft record number 0x%llx exceeds "
-				"maximum of 2^32.", (long long)mft_no);
-		return -ERANGE;
-	}
-	/* Start by clearing the whole mft record to give us a clean slate. */
-	memset(m, 0, vol->mft_record_size);
-	/* Aligned to 2-byte boundary. */
-	if (vol->major_ver < 3 || (vol->major_ver == 3 && !vol->minor_ver))
-		m->usa_ofs = cpu_to_le16((sizeof(MFT_RECORD_OLD) + 1) & ~1);
-	else {
-		m->usa_ofs = cpu_to_le16((sizeof(MFT_RECORD) + 1) & ~1);
-		/*
-		 * Set the NTFS 3.1+ specific fields while we know that the
-		 * volume version is 3.1+.
-		 */
-		m->reserved = 0;
-		m->mft_record_number = cpu_to_le32((u32)mft_no);
-	}
-	m->magic = magic_FILE;
-	if (vol->mft_record_size >= NTFS_BLOCK_SIZE)
-		m->usa_count = cpu_to_le16(vol->mft_record_size /
-				NTFS_BLOCK_SIZE + 1);
-	else {
-		m->usa_count = cpu_to_le16(1);
-		ntfs_warning(vol->sb, "Sector size is bigger than mft record "
-				"size.  Setting usa_count to 1.  If chkdsk "
-				"reports this as corruption, please email "
-				"linux-ntfs-dev@lists.sourceforge.net stating "
-				"that you saw this message and that the "
-				"modified filesystem created was corrupt.  "
-				"Thank you.");
-	}
-	/* Set the update sequence number to 1. */
-	*(le16*)((u8*)m + le16_to_cpu(m->usa_ofs)) = cpu_to_le16(1);
-	m->lsn = 0;
-	m->sequence_number = cpu_to_le16(1);
-	m->link_count = 0;
-	/*
-	 * Place the attributes straight after the update sequence array,
-	 * aligned to 8-byte boundary.
-	 */
-	m->attrs_offset = cpu_to_le16((le16_to_cpu(m->usa_ofs) +
-			(le16_to_cpu(m->usa_count) << 1) + 7) & ~7);
-	m->flags = 0;
-	/*
-	 * Using attrs_offset plus eight bytes (for the termination attribute).
-	 * attrs_offset is already aligned to 8-byte boundary, so no need to
-	 * align again.
-	 */
-	m->bytes_in_use = cpu_to_le32(le16_to_cpu(m->attrs_offset) + 8);
-	m->bytes_allocated = cpu_to_le32(vol->mft_record_size);
-	m->base_mft_record = 0;
-	m->next_attr_instance = 0;
-	/* Add the termination attribute. */
-	a = (ATTR_RECORD*)((u8*)m + le16_to_cpu(m->attrs_offset));
-	a->type = AT_END;
-	a->length = 0;
-	ntfs_debug("Done.");
-	return 0;
-}
-
-/**
- * ntfs_mft_record_format - format an mft record on an ntfs volume
- * @vol: volume on which to format the mft record
- * @mft_no: mft record number to format
- *
- * Format the mft record @mft_no in $MFT/$DATA, i.e. lay out an empty, unused
- * mft record into the appropriate place of the mft data attribute.  This is
- * used when extending the mft data attribute.
- *
- * Return 0 on success and -errno on error.
- */
-static int ntfs_mft_record_format(const ntfs_volume *vol, const s64 mft_no)
-{
-	loff_t i_size;
-	struct inode *mft_vi = vol->mft_ino;
-	struct page *page;
-	MFT_RECORD *m;
-	pgoff_t index, end_index;
-	unsigned int ofs;
-	int err;
-
-	ntfs_debug("Entering for mft record 0x%llx.", (long long)mft_no);
-	/*
-	 * The index into the page cache and the offset within the page cache
-	 * page of the wanted mft record.
-	 */
-	index = mft_no << vol->mft_record_size_bits >> PAGE_SHIFT;
-	ofs = (mft_no << vol->mft_record_size_bits) & ~PAGE_MASK;
-	/* The maximum valid index into the page cache for $MFT's data. */
-	i_size = i_size_read(mft_vi);
-	end_index = i_size >> PAGE_SHIFT;
-	if (unlikely(index >= end_index)) {
-		if (unlikely(index > end_index || ofs + vol->mft_record_size >=
-				(i_size & ~PAGE_MASK))) {
-			ntfs_error(vol->sb, "Tried to format non-existing mft "
-					"record 0x%llx.", (long long)mft_no);
-			return -ENOENT;
-		}
-	}
-	/* Read, map, and pin the page containing the mft record. */
-	page = ntfs_map_page(mft_vi->i_mapping, index);
-	if (IS_ERR(page)) {
-		ntfs_error(vol->sb, "Failed to map page containing mft record "
-				"to format 0x%llx.", (long long)mft_no);
-		return PTR_ERR(page);
-	}
-	lock_page(page);
-	BUG_ON(!PageUptodate(page));
-	ClearPageUptodate(page);
-	m = (MFT_RECORD*)((u8*)page_address(page) + ofs);
-	err = ntfs_mft_record_layout(vol, mft_no, m);
-	if (unlikely(err)) {
-		ntfs_error(vol->sb, "Failed to layout mft record 0x%llx.",
-				(long long)mft_no);
-		SetPageUptodate(page);
-		unlock_page(page);
-		ntfs_unmap_page(page);
-		return err;
-	}
-	flush_dcache_page(page);
-	SetPageUptodate(page);
-	unlock_page(page);
-	/*
-	 * Make sure the mft record is written out to disk.  We could use
-	 * ilookup5() to check if an inode is in icache and so on but this is
-	 * unnecessary as ntfs_writepage() will write the dirty record anyway.
-	 */
-	mark_ntfs_record_dirty(page, ofs);
-	ntfs_unmap_page(page);
-	ntfs_debug("Done.");
-	return 0;
-}
-
-/**
- * ntfs_mft_record_alloc - allocate an mft record on an ntfs volume
- * @vol: [IN] volume on which to allocate the mft record
- * @mode: [IN] mode if we want a file or directory, i.e. base inode or 0
- * @base_ni: [IN] open base inode if allocating an extent mft record or NULL
- * @mrec: [OUT] on successful return this is the mapped mft record
- *
- * Allocate an mft record in $MFT/$DATA of an open ntfs volume @vol.
- *
- * If @base_ni is NULL make the mft record a base mft record, i.e. a file or
- * directory inode, and allocate it at the default allocator position.  In
- * this case @mode is the file mode as given to us by the caller.  We in
- * particular use @mode to distinguish whether a file or a directory is being
- * created (S_IFDIR(mode) and S_IFREG(mode), respectively).
- *
- * If @base_ni is not NULL make the allocated mft record an extent record,
- * allocate it starting at the mft record after the base mft record and attach
- * the allocated and opened ntfs inode to the base inode @base_ni.  In this
- * case @mode must be 0 as it is meaningless for extent inodes.
- *
- * You need to check the return value with IS_ERR().  If false, the function
- * was successful and the return value is the now opened ntfs inode of the
- * allocated mft record.  *@mrec is then set to the allocated, mapped, pinned,
- * and locked mft record.  If IS_ERR() is true, the function failed and the
- * error code is obtained from PTR_ERR(return value).  *@mrec is undefined in
- * this case.
- *
- * Allocation strategy:
- *
- * To find a free mft record, we scan the mft bitmap for a zero bit.  To
- * optimize this we start scanning at the place specified by @base_ni or if
- * @base_ni is NULL we start where we last stopped and we perform wrap around
- * when we reach the end.  Note, we do not try to allocate mft records below
- * number 24 because numbers 0 to 15 are the defined system files anyway and 16
- * to 24 are special in that they are used for storing extension mft records
- * for the $DATA attribute of $MFT.  This is required to avoid the possibility
- * of creating a runlist with a circular dependency which once written to disk
- * can never be read in again.
- * Windows will only use records 16 to 24 for normal files if the volume is
- * completely out of space.  We never use them which means that when the
- * volume is really out of space we cannot create any more files while
- * Windows can still create up to 8 small files.  We can start doing this at
- * some later time, it does not matter much for now.
- *
- * When scanning the mft bitmap, we only search up to the last allocated mft
- * record.  If there are no free records left in the range 24 to number of
- * allocated mft records, then we extend the $MFT/$DATA attribute in order to
- * create free mft records.  We extend the allocated size of $MFT/$DATA by 16
- * records at a time or one cluster, if cluster size is above 16kiB.  If there
- * is not sufficient space to do this, we try to extend by a single mft record
- * or one cluster, if cluster size is above the mft record size.
- *
- * No matter how many mft records we allocate, we initialize only the first
- * allocated mft record, incrementing mft data size and initialized size
- * accordingly, open an ntfs_inode for it and return it to the caller, unless
- * there are fewer than 24 mft records, in which case we allocate and
- * initialize mft records until we reach record 24 which we consider as the
- * first free mft record for use by normal files.
- *
- * If during any stage we overflow the initialized data in the mft bitmap, we
- * extend the initialized size (and data size) by 8 bytes, allocating another
- * cluster if required.  The bitmap data size has to be at least equal to the
- * number of mft records in the mft, but it can be bigger, in which case the
- * superfluous bits are padded with zeroes.
- *
- * Thus, when we return successfully (IS_ERR() is false), we will have:
- *	- initialized / extended the mft bitmap if necessary,
- *	- initialized / extended the mft data if necessary,
- *	- set the bit corresponding to the mft record being allocated in the
- *	  mft bitmap,
- *	- opened an ntfs_inode for the allocated mft record, and we will have
- *	- returned the ntfs_inode as well as the allocated mapped, pinned, and
- *	  locked mft record.
- *
- * On error, the volume will be left in a consistent state and no record will
- * be allocated.  If rolling back a partial operation fails, we may leave some
- * inconsistent metadata in which case we call NVolSetErrors() so the volume
- * is left dirty when unmounted.
- *
- * Note, this function cannot make use of most of the normal functions, like
- * for example for attribute resizing, etc, because when the run list overflows
- * the base mft record and an attribute list is used, it is very important that
- * the extension mft records used to store the $DATA attribute of $MFT can be
- * reached without having to read the information contained inside them, as
- * this would make it impossible to find them in the first place after the
- * volume is unmounted.  $MFT/$BITMAP probably does not need to follow this
- * rule because the bitmap is not essential for finding the mft records, but on
- * the other hand, handling the bitmap in this special way would make life
- * easier because otherwise there might be circular invocations of functions
- * when reading the bitmap.
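The fast path of that strategy (take the first bit past the in-use records,
never below 24, provided the bitmap's initialized bytes already cover it) can
be condensed to a few lines. An illustrative sketch only; the parameter names
are hypothetical, not the driver's:

	/*
	 * Sketch: pick the first candidate mft record past the ones in use.
	 * ll is the number of initialized mft records, bmp_init_bytes the
	 * initialized byte count of the mft bitmap (hypothetical names).
	 * Returns -1 when the bitmap must be extended first.
	 */
	static long long first_candidate_bit(long long ll, long long bmp_init_bytes)
	{
		if (bmp_init_bytes << 3 > ll && bmp_init_bytes > 3) {
			long long bit = ll;

			if (bit < 24)
				bit = 24;	/* records 0-23 are never handed out */
			return bit;
		}
		return -1;
	}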
- */ -ntfs_inode *ntfs_mft_record_alloc(ntfs_volume *vol, const int mode, - ntfs_inode *base_ni, MFT_RECORD **mrec) -{ - s64 ll, bit, old_data_initialized, old_data_size; - unsigned long flags; - struct inode *vi; - struct page *page; - ntfs_inode *mft_ni, *mftbmp_ni, *ni; - ntfs_attr_search_ctx *ctx; - MFT_RECORD *m; - ATTR_RECORD *a; - pgoff_t index; - unsigned int ofs; - int err; - le16 seq_no, usn; - bool record_formatted = false; - - if (base_ni) { - ntfs_debug("Entering (allocating an extent mft record for " - "base mft record 0x%llx).", - (long long)base_ni->mft_no); - /* @mode and @base_ni are mutually exclusive. */ - BUG_ON(mode); - } else - ntfs_debug("Entering (allocating a base mft record)."); - if (mode) { - /* @mode and @base_ni are mutually exclusive. */ - BUG_ON(base_ni); - /* We only support creation of normal files and directories. */ - if (!S_ISREG(mode) && !S_ISDIR(mode)) - return ERR_PTR(-EOPNOTSUPP); - } - BUG_ON(!mrec); - mft_ni = NTFS_I(vol->mft_ino); - mftbmp_ni = NTFS_I(vol->mftbmp_ino); - down_write(&vol->mftbmp_lock); - bit = ntfs_mft_bitmap_find_and_alloc_free_rec_nolock(vol, base_ni); - if (bit >= 0) { - ntfs_debug("Found and allocated free record (#1), bit 0x%llx.", - (long long)bit); - goto have_alloc_rec; - } - if (bit != -ENOSPC) { - up_write(&vol->mftbmp_lock); - return ERR_PTR(bit); - } - /* - * No free mft records left. If the mft bitmap already covers more - * than the currently used mft records, the next records are all free, - * so we can simply allocate the first unused mft record. - * Note: We also have to make sure that the mft bitmap at least covers - * the first 24 mft records as they are special and whilst they may not - * be in use, we do not allocate from them. - */ - read_lock_irqsave(&mft_ni->size_lock, flags); - ll = mft_ni->initialized_size >> vol->mft_record_size_bits; - read_unlock_irqrestore(&mft_ni->size_lock, flags); - read_lock_irqsave(&mftbmp_ni->size_lock, flags); - old_data_initialized = mftbmp_ni->initialized_size; - read_unlock_irqrestore(&mftbmp_ni->size_lock, flags); - if (old_data_initialized << 3 > ll && old_data_initialized > 3) { - bit = ll; - if (bit < 24) - bit = 24; - if (unlikely(bit >= (1ll << 32))) - goto max_err_out; - ntfs_debug("Found free record (#2), bit 0x%llx.", - (long long)bit); - goto found_free_rec; - } - /* - * The mft bitmap needs to be expanded until it covers the first unused - * mft record that we can allocate. - * Note: The smallest mft record we allocate is mft record 24. - */ - bit = old_data_initialized << 3; - if (unlikely(bit >= (1ll << 32))) - goto max_err_out; - read_lock_irqsave(&mftbmp_ni->size_lock, flags); - old_data_size = mftbmp_ni->allocated_size; - ntfs_debug("Status of mftbmp before extension: allocated_size 0x%llx, " - "data_size 0x%llx, initialized_size 0x%llx.", - (long long)old_data_size, - (long long)i_size_read(vol->mftbmp_ino), - (long long)old_data_initialized); - read_unlock_irqrestore(&mftbmp_ni->size_lock, flags); - if (old_data_initialized + 8 > old_data_size) { - /* Need to extend bitmap by one more cluster. 
*/ - ntfs_debug("mftbmp: initialized_size + 8 > allocated_size."); - err = ntfs_mft_bitmap_extend_allocation_nolock(vol); - if (unlikely(err)) { - up_write(&vol->mftbmp_lock); - goto err_out; - } -#ifdef DEBUG - read_lock_irqsave(&mftbmp_ni->size_lock, flags); - ntfs_debug("Status of mftbmp after allocation extension: " - "allocated_size 0x%llx, data_size 0x%llx, " - "initialized_size 0x%llx.", - (long long)mftbmp_ni->allocated_size, - (long long)i_size_read(vol->mftbmp_ino), - (long long)mftbmp_ni->initialized_size); - read_unlock_irqrestore(&mftbmp_ni->size_lock, flags); -#endif /* DEBUG */ - } - /* - * We now have sufficient allocated space, extend the initialized_size - * as well as the data_size if necessary and fill the new space with - * zeroes. - */ - err = ntfs_mft_bitmap_extend_initialized_nolock(vol); - if (unlikely(err)) { - up_write(&vol->mftbmp_lock); - goto err_out; - } -#ifdef DEBUG - read_lock_irqsave(&mftbmp_ni->size_lock, flags); - ntfs_debug("Status of mftbmp after initialized extension: " - "allocated_size 0x%llx, data_size 0x%llx, " - "initialized_size 0x%llx.", - (long long)mftbmp_ni->allocated_size, - (long long)i_size_read(vol->mftbmp_ino), - (long long)mftbmp_ni->initialized_size); - read_unlock_irqrestore(&mftbmp_ni->size_lock, flags); -#endif /* DEBUG */ - ntfs_debug("Found free record (#3), bit 0x%llx.", (long long)bit); -found_free_rec: - /* @bit is the found free mft record, allocate it in the mft bitmap. */ - ntfs_debug("At found_free_rec."); - err = ntfs_bitmap_set_bit(vol->mftbmp_ino, bit); - if (unlikely(err)) { - ntfs_error(vol->sb, "Failed to allocate bit in mft bitmap."); - up_write(&vol->mftbmp_lock); - goto err_out; - } - ntfs_debug("Set bit 0x%llx in mft bitmap.", (long long)bit); -have_alloc_rec: - /* - * The mft bitmap is now uptodate. Deal with mft data attribute now. - * Note, we keep hold of the mft bitmap lock for writing until all - * modifications to the mft data attribute are complete, too, as they - * will impact decisions for mft bitmap and mft record allocation done - * by a parallel allocation and if the lock is not maintained a - * parallel allocation could allocate the same mft record as this one. - */ - ll = (bit + 1) << vol->mft_record_size_bits; - read_lock_irqsave(&mft_ni->size_lock, flags); - old_data_initialized = mft_ni->initialized_size; - read_unlock_irqrestore(&mft_ni->size_lock, flags); - if (ll <= old_data_initialized) { - ntfs_debug("Allocated mft record already initialized."); - goto mft_rec_already_initialized; - } - ntfs_debug("Initializing allocated mft record."); - /* - * The mft record is outside the initialized data. Extend the mft data - * attribute until it covers the allocated record. The loop is only - * actually traversed more than once when a freshly formatted volume is - * first written to so it optimizes away nicely in the common case. 
- */
-	read_lock_irqsave(&mft_ni->size_lock, flags);
-	ntfs_debug("Status of mft data before extension: "
-			"allocated_size 0x%llx, data_size 0x%llx, "
-			"initialized_size 0x%llx.",
-			(long long)mft_ni->allocated_size,
-			(long long)i_size_read(vol->mft_ino),
-			(long long)mft_ni->initialized_size);
-	while (ll > mft_ni->allocated_size) {
-		read_unlock_irqrestore(&mft_ni->size_lock, flags);
-		err = ntfs_mft_data_extend_allocation_nolock(vol);
-		if (unlikely(err)) {
-			ntfs_error(vol->sb, "Failed to extend mft data "
-					"allocation.");
-			goto undo_mftbmp_alloc_nolock;
-		}
-		read_lock_irqsave(&mft_ni->size_lock, flags);
-		ntfs_debug("Status of mft data after allocation extension: "
-				"allocated_size 0x%llx, data_size 0x%llx, "
-				"initialized_size 0x%llx.",
-				(long long)mft_ni->allocated_size,
-				(long long)i_size_read(vol->mft_ino),
-				(long long)mft_ni->initialized_size);
-	}
-	read_unlock_irqrestore(&mft_ni->size_lock, flags);
-	/*
-	 * Extend mft data initialized size (and data size of course) to reach
-	 * the allocated mft record, formatting the mft records along the way.
-	 * Note: We only modify the ntfs_inode structure as that is all that is
-	 * needed by ntfs_mft_record_format().  We will update the attribute
-	 * record itself in one fell swoop later on.
-	 */
-	write_lock_irqsave(&mft_ni->size_lock, flags);
-	old_data_initialized = mft_ni->initialized_size;
-	old_data_size = vol->mft_ino->i_size;
-	while (ll > mft_ni->initialized_size) {
-		s64 new_initialized_size, mft_no;
-
-		new_initialized_size = mft_ni->initialized_size +
-				vol->mft_record_size;
-		mft_no = mft_ni->initialized_size >> vol->mft_record_size_bits;
-		if (new_initialized_size > i_size_read(vol->mft_ino))
-			i_size_write(vol->mft_ino, new_initialized_size);
-		write_unlock_irqrestore(&mft_ni->size_lock, flags);
-		ntfs_debug("Initializing mft record 0x%llx.",
-				(long long)mft_no);
-		err = ntfs_mft_record_format(vol, mft_no);
-		if (unlikely(err)) {
-			ntfs_error(vol->sb, "Failed to format mft record.");
-			goto undo_data_init;
-		}
-		write_lock_irqsave(&mft_ni->size_lock, flags);
-		mft_ni->initialized_size = new_initialized_size;
-	}
-	write_unlock_irqrestore(&mft_ni->size_lock, flags);
-	record_formatted = true;
-	/* Update the mft data attribute record to reflect the new sizes. */
-	m = map_mft_record(mft_ni);
-	if (IS_ERR(m)) {
-		ntfs_error(vol->sb, "Failed to map mft record.");
-		err = PTR_ERR(m);
-		goto undo_data_init;
-	}
-	ctx = ntfs_attr_get_search_ctx(mft_ni, m);
-	if (unlikely(!ctx)) {
-		ntfs_error(vol->sb, "Failed to get search context.");
-		err = -ENOMEM;
-		unmap_mft_record(mft_ni);
-		goto undo_data_init;
-	}
-	err = ntfs_attr_lookup(mft_ni->type, mft_ni->name, mft_ni->name_len,
-			CASE_SENSITIVE, 0, NULL, 0, ctx);
-	if (unlikely(err)) {
-		ntfs_error(vol->sb, "Failed to find first attribute extent of "
-				"mft data attribute.");
-		ntfs_attr_put_search_ctx(ctx);
-		unmap_mft_record(mft_ni);
-		goto undo_data_init;
-	}
-	a = ctx->attr;
-	read_lock_irqsave(&mft_ni->size_lock, flags);
-	a->data.non_resident.initialized_size =
-			cpu_to_sle64(mft_ni->initialized_size);
-	a->data.non_resident.data_size =
-			cpu_to_sle64(i_size_read(vol->mft_ino));
-	read_unlock_irqrestore(&mft_ni->size_lock, flags);
-	/* Ensure the changes make it to disk.
*/ - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(mft_ni); - read_lock_irqsave(&mft_ni->size_lock, flags); - ntfs_debug("Status of mft data after mft record initialization: " - "allocated_size 0x%llx, data_size 0x%llx, " - "initialized_size 0x%llx.", - (long long)mft_ni->allocated_size, - (long long)i_size_read(vol->mft_ino), - (long long)mft_ni->initialized_size); - BUG_ON(i_size_read(vol->mft_ino) > mft_ni->allocated_size); - BUG_ON(mft_ni->initialized_size > i_size_read(vol->mft_ino)); - read_unlock_irqrestore(&mft_ni->size_lock, flags); -mft_rec_already_initialized: - /* - * We can finally drop the mft bitmap lock as the mft data attribute - * has been fully updated. The only disparity left is that the - * allocated mft record still needs to be marked as in use to match the - * set bit in the mft bitmap but this is actually not a problem since - * this mft record is not referenced from anywhere yet and the fact - * that it is allocated in the mft bitmap means that no-one will try to - * allocate it either. - */ - up_write(&vol->mftbmp_lock); - /* - * We now have allocated and initialized the mft record. Calculate the - * index of and the offset within the page cache page the record is in. - */ - index = bit << vol->mft_record_size_bits >> PAGE_SHIFT; - ofs = (bit << vol->mft_record_size_bits) & ~PAGE_MASK; - /* Read, map, and pin the page containing the mft record. */ - page = ntfs_map_page(vol->mft_ino->i_mapping, index); - if (IS_ERR(page)) { - ntfs_error(vol->sb, "Failed to map page containing allocated " - "mft record 0x%llx.", (long long)bit); - err = PTR_ERR(page); - goto undo_mftbmp_alloc; - } - lock_page(page); - BUG_ON(!PageUptodate(page)); - ClearPageUptodate(page); - m = (MFT_RECORD*)((u8*)page_address(page) + ofs); - /* If we just formatted the mft record no need to do it again. */ - if (!record_formatted) { - /* Sanity check that the mft record is really not in use. */ - if (ntfs_is_file_record(m->magic) && - (m->flags & MFT_RECORD_IN_USE)) { - ntfs_error(vol->sb, "Mft record 0x%llx was marked " - "free in mft bitmap but is marked " - "used itself. Corrupt filesystem. " - "Unmount and run chkdsk.", - (long long)bit); - err = -EIO; - SetPageUptodate(page); - unlock_page(page); - ntfs_unmap_page(page); - NVolSetErrors(vol); - goto undo_mftbmp_alloc; - } - /* - * We need to (re-)format the mft record, preserving the - * sequence number if it is not zero as well as the update - * sequence number if it is not zero or -1 (0xffff). This - * means we do not need to care whether or not something went - * wrong with the previous mft record. - */ - seq_no = m->sequence_number; - usn = *(le16*)((u8*)m + le16_to_cpu(m->usa_ofs)); - err = ntfs_mft_record_layout(vol, bit, m); - if (unlikely(err)) { - ntfs_error(vol->sb, "Failed to layout allocated mft " - "record 0x%llx.", (long long)bit); - SetPageUptodate(page); - unlock_page(page); - ntfs_unmap_page(page); - goto undo_mftbmp_alloc; - } - if (seq_no) - m->sequence_number = seq_no; - if (usn && le16_to_cpu(usn) != 0xffff) - *(le16*)((u8*)m + le16_to_cpu(m->usa_ofs)) = usn; - } - /* Set the mft record itself in use. */ - m->flags |= MFT_RECORD_IN_USE; - if (S_ISDIR(mode)) - m->flags |= MFT_RECORD_IS_DIRECTORY; - flush_dcache_page(page); - SetPageUptodate(page); - if (base_ni) { - MFT_RECORD *m_tmp; - - /* - * Setup the base mft record in the extent mft record. 
This
-		 * completes initialization of the allocated extent mft record
-		 * and we can simply use it with map_extent_mft_record().
-		 */
-		m->base_mft_record = MK_LE_MREF(base_ni->mft_no,
-				base_ni->seq_no);
-		/*
-		 * Allocate an extent inode structure for the new mft record,
-		 * attach it to the base inode @base_ni and map, pin, and lock
-		 * it (i.e. the allocated mft record).
-		 */
-		m_tmp = map_extent_mft_record(base_ni, bit, &ni);
-		if (IS_ERR(m_tmp)) {
-			ntfs_error(vol->sb, "Failed to map allocated extent "
-					"mft record 0x%llx.", (long long)bit);
-			err = PTR_ERR(m_tmp);
-			/* Set the mft record itself not in use. */
-			m->flags &= cpu_to_le16(
-					~le16_to_cpu(MFT_RECORD_IN_USE));
-			flush_dcache_page(page);
-			/* Make sure the mft record is written out to disk. */
-			mark_ntfs_record_dirty(page, ofs);
-			unlock_page(page);
-			ntfs_unmap_page(page);
-			goto undo_mftbmp_alloc;
-		}
-		BUG_ON(m != m_tmp);
-		/*
-		 * Make sure the allocated mft record is written out to disk.
-		 * No need to set the inode dirty because the caller is going
-		 * to do that anyway after finishing with the new extent mft
-		 * record (e.g. at a minimum a new attribute will be added to
-		 * the mft record).
-		 */
-		mark_ntfs_record_dirty(page, ofs);
-		unlock_page(page);
-		/*
-		 * Need to unmap the page since map_extent_mft_record() mapped
-		 * it as well so we have it mapped twice at the moment.
-		 */
-		ntfs_unmap_page(page);
-	} else {
-		/*
-		 * Allocate a new VFS inode and set it up.  NOTE: @vi->i_nlink
-		 * is set to 1 but the mft record->link_count is 0.  The caller
-		 * needs to bear this in mind.
-		 */
-		vi = new_inode(vol->sb);
-		if (unlikely(!vi)) {
-			err = -ENOMEM;
-			/* Set the mft record itself not in use. */
-			m->flags &= cpu_to_le16(
-					~le16_to_cpu(MFT_RECORD_IN_USE));
-			flush_dcache_page(page);
-			/* Make sure the mft record is written out to disk. */
-			mark_ntfs_record_dirty(page, ofs);
-			unlock_page(page);
-			ntfs_unmap_page(page);
-			goto undo_mftbmp_alloc;
-		}
-		vi->i_ino = bit;
-
-		/* The owner and group come from the ntfs volume. */
-		vi->i_uid = vol->uid;
-		vi->i_gid = vol->gid;
-
-		/* Initialize the ntfs specific part of @vi. */
-		ntfs_init_big_inode(vi);
-		ni = NTFS_I(vi);
-		/*
-		 * Set the appropriate mode, attribute type, and name.  For
-		 * directories, also setup the index values to the defaults.
-		 */
-		if (S_ISDIR(mode)) {
-			vi->i_mode = S_IFDIR | S_IRWXUGO;
-			vi->i_mode &= ~vol->dmask;
-
-			NInoSetMstProtected(ni);
-			ni->type = AT_INDEX_ALLOCATION;
-			ni->name = I30;
-			ni->name_len = 4;
-
-			ni->itype.index.block_size = 4096;
-			ni->itype.index.block_size_bits = ntfs_ffs(4096) - 1;
-			ni->itype.index.collation_rule = COLLATION_FILE_NAME;
-			if (vol->cluster_size <= ni->itype.index.block_size) {
-				ni->itype.index.vcn_size = vol->cluster_size;
-				ni->itype.index.vcn_size_bits =
-						vol->cluster_size_bits;
-			} else {
-				ni->itype.index.vcn_size = vol->sector_size;
-				ni->itype.index.vcn_size_bits =
-						vol->sector_size_bits;
-			}
-		} else {
-			vi->i_mode = S_IFREG | S_IRWXUGO;
-			vi->i_mode &= ~vol->fmask;
-
-			ni->type = AT_DATA;
-			ni->name = NULL;
-			ni->name_len = 0;
-		}
-		if (IS_RDONLY(vi))
-			vi->i_mode &= ~S_IWUGO;
-
-		/* Set the inode times to the current time. */
-		simple_inode_init_ts(vi);
-		/*
-		 * Set the file size to 0, the ntfs inode sizes are set to 0 by
-		 * the call to ntfs_init_big_inode() above.
-		 */
-		vi->i_size = 0;
-		vi->i_blocks = 0;
-
-		/* Set the sequence number. */
-		vi->i_generation = ni->seq_no = le16_to_cpu(m->sequence_number);
-		/*
-		 * Manually map, pin, and lock the mft record as we already
-		 * have its page mapped and it is very easy to do.
-		 */
-		atomic_inc(&ni->count);
-		mutex_lock(&ni->mrec_lock);
-		ni->page = page;
-		ni->page_ofs = ofs;
-		/*
-		 * Make sure the allocated mft record is written out to disk.
-		 * NOTE: We do not set the ntfs inode dirty because this would
-		 * fail in ntfs_write_inode() because the inode does not have a
-		 * standard information attribute yet.  Also, there is no need
-		 * to set the inode dirty because the caller is going to do
-		 * that anyway after finishing with the new mft record (e.g. at
-		 * a minimum some new attributes will be added to the mft
-		 * record).
-		 */
-		mark_ntfs_record_dirty(page, ofs);
-		unlock_page(page);
-
-		/* Add the inode to the inode hash for the superblock. */
-		insert_inode_hash(vi);
-
-		/* Update the default mft allocation position. */
-		vol->mft_data_pos = bit + 1;
-	}
-	/*
-	 * Return the opened, allocated inode of the allocated mft record as
-	 * well as the mapped, pinned, and locked mft record.
-	 */
-	ntfs_debug("Returning opened, allocated %sinode 0x%llx.",
-			base_ni ? "extent " : "", (long long)bit);
-	*mrec = m;
-	return ni;
-undo_data_init:
-	write_lock_irqsave(&mft_ni->size_lock, flags);
-	mft_ni->initialized_size = old_data_initialized;
-	i_size_write(vol->mft_ino, old_data_size);
-	write_unlock_irqrestore(&mft_ni->size_lock, flags);
-	goto undo_mftbmp_alloc_nolock;
-undo_mftbmp_alloc:
-	down_write(&vol->mftbmp_lock);
-undo_mftbmp_alloc_nolock:
-	if (ntfs_bitmap_clear_bit(vol->mftbmp_ino, bit)) {
-		ntfs_error(vol->sb, "Failed to clear bit in mft bitmap.%s", es);
-		NVolSetErrors(vol);
-	}
-	up_write(&vol->mftbmp_lock);
-err_out:
-	return ERR_PTR(err);
-max_err_out:
-	ntfs_warning(vol->sb, "Cannot allocate mft record because the maximum "
-			"number of inodes (2^32) has already been reached.");
-	up_write(&vol->mftbmp_lock);
-	return ERR_PTR(-ENOSPC);
-}
-
-/**
- * ntfs_extent_mft_record_free - free an extent mft record on an ntfs volume
- * @ni: ntfs inode of the mapped extent mft record to free
- * @m: mapped extent mft record of the ntfs inode @ni
- *
- * Free the mapped extent mft record @m of the extent ntfs inode @ni.
- *
- * Note that this function unmaps the mft record and closes and destroys @ni
- * internally and hence you cannot use either @ni or @m any more after this
- * function returns success.
- *
- * On success return 0 and on error return -errno.  @ni and @m are still valid
- * in this case and have not been freed.
- *
- * For some errors an error message is displayed and the success code 0 is
- * returned and the volume is then left dirty on umount.  This makes sense in
- * case we could not rollback the changes that were already done since the
- * caller no longer wants to reference this mft record so it does not matter to
- * the caller if something is wrong with it as long as it is properly detached
- * from the base inode.
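One detail of the freeing path worth isolating is how the sequence number is
cycled: zero (never tracked) stays zero, and the increment skips the reserved
0xffff value. A standalone sketch of that rule, for illustration only:

	#include <stdint.h>

	/* Sketch: bump an mft record sequence number, skipping 0 and 0xffff. */
	static uint16_t bump_seq_no(uint16_t seq_no)
	{
		if (seq_no == 0xffff)	/* wrap past the reserved -1 value */
			return 1;
		if (seq_no)		/* non-zero: plain increment */
			return seq_no + 1;
		return 0;		/* zero means "not tracked"; leave it */
	}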
- */
-int ntfs_extent_mft_record_free(ntfs_inode *ni, MFT_RECORD *m)
-{
-	unsigned long mft_no = ni->mft_no;
-	ntfs_volume *vol = ni->vol;
-	ntfs_inode *base_ni;
-	ntfs_inode **extent_nis;
-	int i, err;
-	le16 old_seq_no;
-	u16 seq_no;
-
-	BUG_ON(NInoAttr(ni));
-	BUG_ON(ni->nr_extents != -1);
-
-	mutex_lock(&ni->extent_lock);
-	base_ni = ni->ext.base_ntfs_ino;
-	mutex_unlock(&ni->extent_lock);
-
-	BUG_ON(base_ni->nr_extents <= 0);
-
-	ntfs_debug("Entering for extent inode 0x%lx, base inode 0x%lx.\n",
-			mft_no, base_ni->mft_no);
-
-	mutex_lock(&base_ni->extent_lock);
-
-	/* Make sure we are holding the only reference to the extent inode. */
-	if (atomic_read(&ni->count) > 2) {
-		ntfs_error(vol->sb, "Tried to free busy extent inode 0x%lx, "
-				"not freeing.", mft_no);
-		mutex_unlock(&base_ni->extent_lock);
-		return -EBUSY;
-	}
-
-	/* Dissociate the ntfs inode from the base inode. */
-	extent_nis = base_ni->ext.extent_ntfs_inos;
-	err = -ENOENT;
-	for (i = 0; i < base_ni->nr_extents; i++) {
-		if (ni != extent_nis[i])
-			continue;
-		extent_nis += i;
-		base_ni->nr_extents--;
-		memmove(extent_nis, extent_nis + 1, (base_ni->nr_extents - i) *
-				sizeof(ntfs_inode*));
-		err = 0;
-		break;
-	}
-
-	mutex_unlock(&base_ni->extent_lock);
-
-	if (unlikely(err)) {
-		ntfs_error(vol->sb, "Extent inode 0x%lx is not attached to "
-				"its base inode 0x%lx.", mft_no,
-				base_ni->mft_no);
-		BUG();
-	}
-
-	/*
-	 * The extent inode is no longer attached to the base inode so no one
-	 * can get a reference to it any more.
-	 */
-
-	/* Mark the mft record as not in use. */
-	m->flags &= ~MFT_RECORD_IN_USE;
-
-	/* Increment the sequence number, skipping zero, if it is not zero. */
-	old_seq_no = m->sequence_number;
-	seq_no = le16_to_cpu(old_seq_no);
-	if (seq_no == 0xffff)
-		seq_no = 1;
-	else if (seq_no)
-		seq_no++;
-	m->sequence_number = cpu_to_le16(seq_no);
-
-	/*
-	 * Set the ntfs inode dirty and write it out.  We do not need to worry
-	 * about the base inode here since whatever caused the extent mft
-	 * record to be freed is guaranteed to do it already.
-	 */
-	NInoSetDirty(ni);
-	err = write_mft_record(ni, m, 0);
-	if (unlikely(err)) {
-		ntfs_error(vol->sb, "Failed to write mft record 0x%lx, not "
-				"freeing.", mft_no);
-		goto rollback;
-	}
-rollback_error:
-	/* Unmap and throw away the now freed extent inode. */
-	unmap_extent_mft_record(ni);
-	ntfs_clear_extent_inode(ni);
-
-	/* Clear the bit in the $MFT/$BITMAP corresponding to this record. */
-	down_write(&vol->mftbmp_lock);
-	err = ntfs_bitmap_clear_bit(vol->mftbmp_ino, mft_no);
-	up_write(&vol->mftbmp_lock);
-	if (unlikely(err)) {
-		/*
-		 * The extent inode is gone but we failed to deallocate it in
-		 * the mft bitmap.  Just emit a warning and leave the volume
-		 * dirty on umount.
-		 */
-		ntfs_error(vol->sb, "Failed to clear bit in mft bitmap.%s", es);
-		NVolSetErrors(vol);
-	}
-	return 0;
-rollback:
-	/* Rollback what we did...
*/ - mutex_lock(&base_ni->extent_lock); - extent_nis = base_ni->ext.extent_ntfs_inos; - if (!(base_ni->nr_extents & 3)) { - int new_size = (base_ni->nr_extents + 4) * sizeof(ntfs_inode*); - - extent_nis = kmalloc(new_size, GFP_NOFS); - if (unlikely(!extent_nis)) { - ntfs_error(vol->sb, "Failed to allocate internal " - "buffer during rollback.%s", es); - mutex_unlock(&base_ni->extent_lock); - NVolSetErrors(vol); - goto rollback_error; - } - if (base_ni->nr_extents) { - BUG_ON(!base_ni->ext.extent_ntfs_inos); - memcpy(extent_nis, base_ni->ext.extent_ntfs_inos, - new_size - 4 * sizeof(ntfs_inode*)); - kfree(base_ni->ext.extent_ntfs_inos); - } - base_ni->ext.extent_ntfs_inos = extent_nis; - } - m->flags |= MFT_RECORD_IN_USE; - m->sequence_number = old_seq_no; - extent_nis[base_ni->nr_extents++] = ni; - mutex_unlock(&base_ni->extent_lock); - mark_mft_record_dirty(ni); - return err; -} -#endif /* NTFS_RW */ diff --git a/fs/ntfs/mft.h b/fs/ntfs/mft.h deleted file mode 100644 index 49c001af16edc..0000000000000 --- a/fs/ntfs/mft.h +++ /dev/null @@ -1,110 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * mft.h - Defines for mft record handling in NTFS Linux kernel driver. - * Part of the Linux-NTFS project. - * - * Copyright (c) 2001-2004 Anton Altaparmakov - */ - -#ifndef _LINUX_NTFS_MFT_H -#define _LINUX_NTFS_MFT_H - -#include -#include -#include - -#include "inode.h" - -extern MFT_RECORD *map_mft_record(ntfs_inode *ni); -extern void unmap_mft_record(ntfs_inode *ni); - -extern MFT_RECORD *map_extent_mft_record(ntfs_inode *base_ni, MFT_REF mref, - ntfs_inode **ntfs_ino); - -static inline void unmap_extent_mft_record(ntfs_inode *ni) -{ - unmap_mft_record(ni); - return; -} - -#ifdef NTFS_RW - -/** - * flush_dcache_mft_record_page - flush_dcache_page() for mft records - * @ni: ntfs inode structure of mft record - * - * Call flush_dcache_page() for the page in which an mft record resides. - * - * This must be called every time an mft record is modified, just after the - * modification. - */ -static inline void flush_dcache_mft_record_page(ntfs_inode *ni) -{ - flush_dcache_page(ni->page); -} - -extern void __mark_mft_record_dirty(ntfs_inode *ni); - -/** - * mark_mft_record_dirty - set the mft record and the page containing it dirty - * @ni: ntfs inode describing the mapped mft record - * - * Set the mapped (extent) mft record of the (base or extent) ntfs inode @ni, - * as well as the page containing the mft record, dirty. Also, mark the base - * vfs inode dirty. This ensures that any changes to the mft record are - * written out to disk. - * - * NOTE: Do not do anything if the mft record is already marked dirty. - */ -static inline void mark_mft_record_dirty(ntfs_inode *ni) -{ - if (!NInoTestSetDirty(ni)) - __mark_mft_record_dirty(ni); -} - -extern int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no, - MFT_RECORD *m, int sync); - -extern int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync); - -/** - * write_mft_record - write out a mapped (extent) mft record - * @ni: ntfs inode describing the mapped (extent) mft record - * @m: mapped (extent) mft record to write - * @sync: if true, wait for i/o completion - * - * This is just a wrapper for write_mft_record_nolock() (see mft.c), which - * locks the page for the duration of the write. 
This ensures that there are
- * no race conditions between writing the mft record via the dirty inode code
- * paths and via the page cache write back code paths or between writing
- * neighbouring mft records residing in the same page.
- *
- * Locking the page also serializes us against ->read_folio() if the page is
- * not uptodate.
- *
- * On success, clean the mft record and return 0.  On error, leave the mft
- * record dirty and return -errno.
- */
-static inline int write_mft_record(ntfs_inode *ni, MFT_RECORD *m, int sync)
-{
-	struct page *page = ni->page;
-	int err;
-
-	BUG_ON(!page);
-	lock_page(page);
-	err = write_mft_record_nolock(ni, m, sync);
-	unlock_page(page);
-	return err;
-}
-
-extern bool ntfs_may_write_mft_record(ntfs_volume *vol,
-		const unsigned long mft_no, const MFT_RECORD *m,
-		ntfs_inode **locked_ni);
-
-extern ntfs_inode *ntfs_mft_record_alloc(ntfs_volume *vol, const int mode,
-		ntfs_inode *base_ni, MFT_RECORD **mrec);
-extern int ntfs_extent_mft_record_free(ntfs_inode *ni, MFT_RECORD *m);
-
-#endif /* NTFS_RW */
-
-#endif /* _LINUX_NTFS_MFT_H */
diff --git a/fs/ntfs/mst.c b/fs/ntfs/mst.c
deleted file mode 100644
index 16b3c884abfc4..0000000000000
--- a/fs/ntfs/mst.c
+++ /dev/null
@@ -1,189 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * mst.c - NTFS multi sector transfer protection handling code. Part of the
- *	   Linux-NTFS project.
- *
- * Copyright (c) 2001-2004 Anton Altaparmakov
- */
-
-#include "ntfs.h"
-
-/**
- * post_read_mst_fixup - deprotect multi sector transfer protected data
- * @b: pointer to the data to deprotect
- * @size: size in bytes of @b
- *
- * Perform the necessary post read multi sector transfer fixup and detect the
- * presence of incomplete multi sector transfers.  In that case, overwrite the
- * magic of the ntfs record header being processed with "BAAD" (in memory
- * only!) and abort processing.
- *
- * Return 0 on success and -EINVAL on error ("BAAD" magic will be present).
- *
- * NOTE: We consider the absence / invalidity of an update sequence array to
- * mean that the structure is not protected at all and hence doesn't need to
- * be fixed up.  Thus, we return success and not failure in this case.  This
- * is in contrast to pre_write_mst_fixup(), see below.
- */
-int post_read_mst_fixup(NTFS_RECORD *b, const u32 size)
-{
-	u16 usa_ofs, usa_count, usn;
-	u16 *usa_pos, *data_pos;
-
-	/* Setup the variables. */
-	usa_ofs = le16_to_cpu(b->usa_ofs);
-	/* Decrement usa_count to get number of fixups. */
-	usa_count = le16_to_cpu(b->usa_count) - 1;
-	/* Size and alignment checks. */
-	if ( size & (NTFS_BLOCK_SIZE - 1)	||
-	     usa_ofs & 1			||
-	     usa_ofs + (usa_count * 2) > size	||
-	     (size >> NTFS_BLOCK_SIZE_BITS) != usa_count)
-		return 0;
-	/* Position of usn in update sequence array. */
-	usa_pos = (u16*)b + usa_ofs/sizeof(u16);
-	/*
-	 * The update sequence number which has to be equal to each of the
-	 * u16 values before they are fixed up.  Note no need to care for
-	 * endianness since we are comparing and moving data for on disk
-	 * structures which means the data is consistent.  If it is
-	 * consistently the wrong endianness it doesn't make any difference.
-	 */
-	usn = *usa_pos;
-	/*
-	 * Position in protected data of first u16 that needs fixing up.
-	 */
-	data_pos = (u16*)b + NTFS_BLOCK_SIZE/sizeof(u16) - 1;
-	/*
-	 * Check for incomplete multi sector transfer(s).
-	 */
-	while (usa_count--) {
-		if (*data_pos != usn) {
-			/*
-			 * Incomplete multi sector transfer detected!  )-:
-			 * Set the magic to "BAAD" and return failure.
-			 * Note that magic_BAAD is already converted to le32.
-			 */
-			b->magic = magic_BAAD;
-			return -EINVAL;
-		}
-		data_pos += NTFS_BLOCK_SIZE/sizeof(u16);
-	}
-	/* Re-setup the variables. */
-	usa_count = le16_to_cpu(b->usa_count) - 1;
-	data_pos = (u16*)b + NTFS_BLOCK_SIZE/sizeof(u16) - 1;
-	/* Fixup all sectors. */
-	while (usa_count--) {
-		/*
-		 * Increment position in usa and restore original data from
-		 * the usa into the data buffer.
-		 */
-		*data_pos = *(++usa_pos);
-		/* Increment position in data as well. */
-		data_pos += NTFS_BLOCK_SIZE/sizeof(u16);
-	}
-	return 0;
-}
-
-/**
- * pre_write_mst_fixup - apply multi sector transfer protection
- * @b: pointer to the data to protect
- * @size: size in bytes of @b
- *
- * Perform the necessary pre write multi sector transfer fixup on the data
- * pointed to by @b of @size.
- *
- * Return 0 if fixup applied (success) or -EINVAL if no fixup was performed
- * (assumed not needed).  This is in contrast to post_read_mst_fixup() above.
- *
- * NOTE: We consider the absence / invalidity of an update sequence array to
- * mean that the structure is not subject to protection and hence doesn't need
- * to be fixed up.  This means that you have to create a valid update sequence
- * array header in the ntfs record before calling this function, otherwise it
- * will fail (the header needs to contain the position of the update sequence
- * array together with the number of elements in the array).  You also need to
- * initialise the update sequence number before calling this function
- * otherwise a random word will be used (whatever was in the record at that
- * position at that time).
- */
-int pre_write_mst_fixup(NTFS_RECORD *b, const u32 size)
-{
-	le16 *usa_pos, *data_pos;
-	u16 usa_ofs, usa_count, usn;
-	le16 le_usn;
-
-	/* Sanity check + only fixup if it makes sense. */
-	if (!b || ntfs_is_baad_record(b->magic) ||
-			ntfs_is_hole_record(b->magic))
-		return -EINVAL;
-	/* Setup the variables. */
-	usa_ofs = le16_to_cpu(b->usa_ofs);
-	/* Decrement usa_count to get number of fixups. */
-	usa_count = le16_to_cpu(b->usa_count) - 1;
-	/* Size and alignment checks. */
-	if ( size & (NTFS_BLOCK_SIZE - 1)	||
-	     usa_ofs & 1			||
-	     usa_ofs + (usa_count * 2) > size	||
-	     (size >> NTFS_BLOCK_SIZE_BITS) != usa_count)
-		return -EINVAL;
-	/* Position of usn in update sequence array. */
-	usa_pos = (le16*)((u8*)b + usa_ofs);
-	/*
-	 * Cyclically increment the update sequence number
-	 * (skipping 0 and -1, i.e. 0xffff).
-	 */
-	usn = le16_to_cpup(usa_pos) + 1;
-	if (usn == 0xffff || !usn)
-		usn = 1;
-	le_usn = cpu_to_le16(usn);
-	*usa_pos = le_usn;
-	/* Position in data of first u16 that needs fixing up. */
-	data_pos = (le16*)b + NTFS_BLOCK_SIZE/sizeof(le16) - 1;
-	/* Fixup all sectors. */
-	while (usa_count--) {
-		/*
-		 * Increment the position in the usa and save the
-		 * original data from the data buffer into the usa.
-		 */
-		*(++usa_pos) = *data_pos;
-		/* Apply fixup to data. */
-		*data_pos = le_usn;
-		/* Increment position in data as well. */
-		data_pos += NTFS_BLOCK_SIZE/sizeof(le16);
-	}
-	return 0;
-}
-
-/**
- * post_write_mst_fixup - fast deprotect multi sector transfer protected data
- * @b: pointer to the data to deprotect
- *
- * Perform the necessary post write multi sector transfer fixup, not checking
- * for any errors, because we assume we have just used pre_write_mst_fixup(),
- * thus the data will be fine or we would never have gotten here.
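To make the protection scheme concrete: the last u16 of each 512-byte block
is stashed in the update sequence array and replaced with the usn, so a torn
write leaves a mismatching value that the read-side fixup detects. A toy
userspace sketch (the function name, flat buffer layout, and the omission of
endianness handling are all assumptions for illustration, not the driver's
API):

	#include <stdint.h>
	#include <stddef.h>

	/* Toy sketch of MST protection over 512-byte blocks (256 u16s each).
	 * usa must have room for nblocks + 1 entries; slot 0 holds the usn. */
	static void mst_protect(uint16_t *buf, size_t nblocks, uint16_t *usa,
				uint16_t usn)
	{
		size_t i;

		usa[0] = usn;
		for (i = 0; i < nblocks; i++) {
			size_t last = (i + 1) * 256 - 1; /* last u16 of block i */

			usa[i + 1] = buf[last];		/* stash original value */
			buf[last] = usn;		/* stamp the usn */
		}
	}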
- */ -void post_write_mst_fixup(NTFS_RECORD *b) -{ - le16 *usa_pos, *data_pos; - - u16 usa_ofs = le16_to_cpu(b->usa_ofs); - u16 usa_count = le16_to_cpu(b->usa_count) - 1; - - /* Position of usn in update sequence array. */ - usa_pos = (le16*)b + usa_ofs/sizeof(le16); - - /* Position in protected data of first u16 that needs fixing up. */ - data_pos = (le16*)b + NTFS_BLOCK_SIZE/sizeof(le16) - 1; - - /* Fixup all sectors. */ - while (usa_count--) { - /* - * Increment position in usa and restore original data from - * the usa into the data buffer. - */ - *data_pos = *(++usa_pos); - - /* Increment position in data as well. */ - data_pos += NTFS_BLOCK_SIZE/sizeof(le16); - } -} diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c deleted file mode 100644 index d7498ddc4a721..0000000000000 --- a/fs/ntfs/namei.c +++ /dev/null @@ -1,392 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * namei.c - NTFS kernel directory inode operations. Part of the Linux-NTFS - * project. - * - * Copyright (c) 2001-2006 Anton Altaparmakov - */ - -#include -#include -#include -#include - -#include "attrib.h" -#include "debug.h" -#include "dir.h" -#include "mft.h" -#include "ntfs.h" - -/** - * ntfs_lookup - find the inode represented by a dentry in a directory inode - * @dir_ino: directory inode in which to look for the inode - * @dent: dentry representing the inode to look for - * @flags: lookup flags - * - * In short, ntfs_lookup() looks for the inode represented by the dentry @dent - * in the directory inode @dir_ino and if found attaches the inode to the - * dentry @dent. - * - * In more detail, the dentry @dent specifies which inode to look for by - * supplying the name of the inode in @dent->d_name.name. ntfs_lookup() - * converts the name to Unicode and walks the contents of the directory inode - * @dir_ino looking for the converted Unicode name. If the name is found in the - * directory, the corresponding inode is loaded by calling ntfs_iget() on its - * inode number and the inode is associated with the dentry @dent via a call to - * d_splice_alias(). - * - * If the name is not found in the directory, a NULL inode is inserted into the - * dentry @dent via a call to d_add(). The dentry is then termed a negative - * dentry. - * - * Only if an actual error occurs, do we return an error via ERR_PTR(). - * - * In order to handle the case insensitivity issues of NTFS with regards to the - * dcache and the dcache requiring only one dentry per directory, we deal with - * dentry aliases that only differ in case in ->ntfs_lookup() while maintaining - * a case sensitive dcache. This means that we get the full benefit of dcache - * speed when the file/directory is looked up with the same case as returned by - * ->ntfs_readdir() but that a lookup for any other case (or for the short file - * name) will not find anything in dcache and will enter ->ntfs_lookup() - * instead, where we search the directory for a fully matching file name - * (including case) and if that is not found, we search for a file name that - * matches with different case and if that has non-POSIX semantics we return - * that. We actually do only one search (case sensitive) and keep tabs on - * whether we have found a case insensitive match in the process. - * - * To simplify matters for us, we do not treat the short vs long filenames as - * two hard links but instead if the lookup matches a short filename, we - * return the dentry for the corresponding long filename instead. 
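The "one search, keep tabs on a case-insensitive match" idea generalizes
beyond NTFS; a userspace analogy using plain C strings instead of Unicode
directory entries (purely illustrative, with hypothetical names):

	#include <string.h>
	#include <strings.h>
	#include <stddef.h>

	/* Sketch: single pass that prefers an exact match but remembers the
	 * first case-insensitive one in case no exact match exists. */
	static long lookup_once(const char *const *names, size_t n,
				const char *want)
	{
		long ci_match = -1;
		size_t i;

		for (i = 0; i < n; i++) {
			if (!strcmp(names[i], want))
				return (long)i;		/* perfect match wins */
			if (ci_match < 0 && !strcasecmp(names[i], want))
				ci_match = (long)i;	/* keep tabs, carry on */
		}
		return ci_match;			/* -1 if nothing matched */
	}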
- * - * There are three cases we need to distinguish here: - * - * 1) @dent perfectly matches (i.e. including case) a directory entry with a - * file name in the WIN32 or POSIX namespaces. In this case - * ntfs_lookup_inode_by_name() will return with name set to NULL and we - * just d_splice_alias() @dent. - * 2) @dent matches (not including case) a directory entry with a file name in - * the WIN32 namespace. In this case ntfs_lookup_inode_by_name() will return - * with name set to point to a kmalloc()ed ntfs_name structure containing - * the properly cased little endian Unicode name. We convert the name to the - * current NLS code page, search if a dentry with this name already exists - * and if so return that instead of @dent. At this point things are - * complicated by the possibility of 'disconnected' dentries due to NFS - * which we deal with appropriately (see the code comments). The VFS will - * then destroy the old @dent and use the one we returned. If a dentry is - * not found, we allocate a new one, d_splice_alias() it, and return it as - * above. - * 3) @dent matches either perfectly or not (i.e. we don't care about case) a - * directory entry with a file name in the DOS namespace. In this case - * ntfs_lookup_inode_by_name() will return with name set to point to a - * kmalloc()ed ntfs_name structure containing the mft reference (cpu endian) - * of the inode. We use the mft reference to read the inode and to find the - * file name in the WIN32 namespace corresponding to the matched short file - * name. We then convert the name to the current NLS code page, and proceed - * searching for a dentry with this name, etc, as in case 2), above. - * - * Locking: Caller must hold i_mutex on the directory. - */ -static struct dentry *ntfs_lookup(struct inode *dir_ino, struct dentry *dent, - unsigned int flags) -{ - ntfs_volume *vol = NTFS_SB(dir_ino->i_sb); - struct inode *dent_inode; - ntfschar *uname; - ntfs_name *name = NULL; - MFT_REF mref; - unsigned long dent_ino; - int uname_len; - - ntfs_debug("Looking up %pd in directory inode 0x%lx.", - dent, dir_ino->i_ino); - /* Convert the name of the dentry to Unicode. */ - uname_len = ntfs_nlstoucs(vol, dent->d_name.name, dent->d_name.len, - &uname); - if (uname_len < 0) { - if (uname_len != -ENAMETOOLONG) - ntfs_error(vol->sb, "Failed to convert name to " - "Unicode."); - return ERR_PTR(uname_len); - } - mref = ntfs_lookup_inode_by_name(NTFS_I(dir_ino), uname, uname_len, - &name); - kmem_cache_free(ntfs_name_cache, uname); - if (!IS_ERR_MREF(mref)) { - dent_ino = MREF(mref); - ntfs_debug("Found inode 0x%lx. Calling ntfs_iget.", dent_ino); - dent_inode = ntfs_iget(vol->sb, dent_ino); - if (!IS_ERR(dent_inode)) { - /* Consistency check. */ - if (is_bad_inode(dent_inode) || MSEQNO(mref) == - NTFS_I(dent_inode)->seq_no || - dent_ino == FILE_MFT) { - /* Perfect WIN32/POSIX match. -- Case 1. */ - if (!name) { - ntfs_debug("Done. (Case 1.)"); - return d_splice_alias(dent_inode, dent); - } - /* - * We are too indented. Handle imperfect - * matches and short file names further below. - */ - goto handle_name; - } - ntfs_error(vol->sb, "Found stale reference to inode " - "0x%lx (reference sequence number = " - "0x%x, inode sequence number = 0x%x), " - "returning -EIO. Run chkdsk.", - dent_ino, MSEQNO(mref), - NTFS_I(dent_inode)->seq_no); - iput(dent_inode); - dent_inode = ERR_PTR(-EIO); - } else - ntfs_error(vol->sb, "ntfs_iget(0x%lx) failed with " - "error code %li.", dent_ino, - PTR_ERR(dent_inode)); - kfree(name); - /* Return the error code. 
*/ - return ERR_CAST(dent_inode); - } - /* It is guaranteed that @name is no longer allocated at this point. */ - if (MREF_ERR(mref) == -ENOENT) { - ntfs_debug("Entry was not found, adding negative dentry."); - /* The dcache will handle negative entries. */ - d_add(dent, NULL); - ntfs_debug("Done."); - return NULL; - } - ntfs_error(vol->sb, "ntfs_lookup_ino_by_name() failed with error " - "code %i.", -MREF_ERR(mref)); - return ERR_PTR(MREF_ERR(mref)); - // TODO: Consider moving this lot to a separate function! (AIA) -handle_name: - { - MFT_RECORD *m; - ntfs_attr_search_ctx *ctx; - ntfs_inode *ni = NTFS_I(dent_inode); - int err; - struct qstr nls_name; - - nls_name.name = NULL; - if (name->type != FILE_NAME_DOS) { /* Case 2. */ - ntfs_debug("Case 2."); - nls_name.len = (unsigned)ntfs_ucstonls(vol, - (ntfschar*)&name->name, name->len, - (unsigned char**)&nls_name.name, 0); - kfree(name); - } else /* if (name->type == FILE_NAME_DOS) */ { /* Case 3. */ - FILE_NAME_ATTR *fn; - - ntfs_debug("Case 3."); - kfree(name); - - /* Find the WIN32 name corresponding to the matched DOS name. */ - ni = NTFS_I(dent_inode); - m = map_mft_record(ni); - if (IS_ERR(m)) { - err = PTR_ERR(m); - m = NULL; - ctx = NULL; - goto err_out; - } - ctx = ntfs_attr_get_search_ctx(ni, m); - if (unlikely(!ctx)) { - err = -ENOMEM; - goto err_out; - } - do { - ATTR_RECORD *a; - u32 val_len; - - err = ntfs_attr_lookup(AT_FILE_NAME, NULL, 0, 0, 0, - NULL, 0, ctx); - if (unlikely(err)) { - ntfs_error(vol->sb, "Inode corrupt: No WIN32 " - "namespace counterpart to DOS " - "file name. Run chkdsk."); - if (err == -ENOENT) - err = -EIO; - goto err_out; - } - /* Consistency checks. */ - a = ctx->attr; - if (a->non_resident || a->flags) - goto eio_err_out; - val_len = le32_to_cpu(a->data.resident.value_length); - if (le16_to_cpu(a->data.resident.value_offset) + - val_len > le32_to_cpu(a->length)) - goto eio_err_out; - fn = (FILE_NAME_ATTR*)((u8*)ctx->attr + le16_to_cpu( - ctx->attr->data.resident.value_offset)); - if ((u32)(fn->file_name_length * sizeof(ntfschar) + - sizeof(FILE_NAME_ATTR)) > val_len) - goto eio_err_out; - } while (fn->file_name_type != FILE_NAME_WIN32); - - /* Convert the found WIN32 name to current NLS code page. */ - nls_name.len = (unsigned)ntfs_ucstonls(vol, - (ntfschar*)&fn->file_name, fn->file_name_length, - (unsigned char**)&nls_name.name, 0); - - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(ni); - } - m = NULL; - ctx = NULL; - - /* Check if a conversion error occurred. */ - if ((signed)nls_name.len < 0) { - err = (signed)nls_name.len; - goto err_out; - } - nls_name.hash = full_name_hash(dent, nls_name.name, nls_name.len); - - dent = d_add_ci(dent, dent_inode, &nls_name); - kfree(nls_name.name); - return dent; - -eio_err_out: - ntfs_error(vol->sb, "Illegal file name attribute. Run chkdsk."); - err = -EIO; -err_out: - if (ctx) - ntfs_attr_put_search_ctx(ctx); - if (m) - unmap_mft_record(ni); - iput(dent_inode); - ntfs_error(vol->sb, "Failed, returning error code %i.", err); - return ERR_PTR(err); - } -} - -/* - * Inode operations for directories. - */ -const struct inode_operations ntfs_dir_inode_ops = { - .lookup = ntfs_lookup, /* VFS: Lookup directory. */ -}; - -/** - * ntfs_get_parent - find the dentry of the parent of a given directory dentry - * @child_dent: dentry of the directory whose parent directory to find - * - * Find the dentry for the parent directory of the directory specified by the - * dentry @child_dent. 
This function is called from - * fs/exportfs/expfs.c::find_exported_dentry() which in turn is called from the - * default ->decode_fh() which is export_decode_fh() in the same file. - * - * The code is based on the ext3 ->get_parent() implementation found in - * fs/ext3/namei.c::ext3_get_parent(). - * - * Note: ntfs_get_parent() is called with @d_inode(child_dent)->i_mutex down. - * - * Return the dentry of the parent directory on success or the error code on - * error (IS_ERR() is true). - */ -static struct dentry *ntfs_get_parent(struct dentry *child_dent) -{ - struct inode *vi = d_inode(child_dent); - ntfs_inode *ni = NTFS_I(vi); - MFT_RECORD *mrec; - ntfs_attr_search_ctx *ctx; - ATTR_RECORD *attr; - FILE_NAME_ATTR *fn; - unsigned long parent_ino; - int err; - - ntfs_debug("Entering for inode 0x%lx.", vi->i_ino); - /* Get the mft record of the inode belonging to the child dentry. */ - mrec = map_mft_record(ni); - if (IS_ERR(mrec)) - return ERR_CAST(mrec); - /* Find the first file name attribute in the mft record. */ - ctx = ntfs_attr_get_search_ctx(ni, mrec); - if (unlikely(!ctx)) { - unmap_mft_record(ni); - return ERR_PTR(-ENOMEM); - } -try_next: - err = ntfs_attr_lookup(AT_FILE_NAME, NULL, 0, CASE_SENSITIVE, 0, NULL, - 0, ctx); - if (unlikely(err)) { - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(ni); - if (err == -ENOENT) - ntfs_error(vi->i_sb, "Inode 0x%lx does not have a " - "file name attribute. Run chkdsk.", - vi->i_ino); - return ERR_PTR(err); - } - attr = ctx->attr; - if (unlikely(attr->non_resident)) - goto try_next; - fn = (FILE_NAME_ATTR *)((u8 *)attr + - le16_to_cpu(attr->data.resident.value_offset)); - if (unlikely((u8 *)fn + le32_to_cpu(attr->data.resident.value_length) > - (u8*)attr + le32_to_cpu(attr->length))) - goto try_next; - /* Get the inode number of the parent directory. */ - parent_ino = MREF_LE(fn->parent_directory); - /* Release the search context and the mft record of the child. */ - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(ni); - - return d_obtain_alias(ntfs_iget(vi->i_sb, parent_ino)); -} - -static struct inode *ntfs_nfs_get_inode(struct super_block *sb, - u64 ino, u32 generation) -{ - struct inode *inode; - - inode = ntfs_iget(sb, ino); - if (!IS_ERR(inode)) { - if (is_bad_inode(inode) || inode->i_generation != generation) { - iput(inode); - inode = ERR_PTR(-ESTALE); - } - } - - return inode; -} - -static struct dentry *ntfs_fh_to_dentry(struct super_block *sb, struct fid *fid, - int fh_len, int fh_type) -{ - return generic_fh_to_dentry(sb, fid, fh_len, fh_type, - ntfs_nfs_get_inode); -} - -static struct dentry *ntfs_fh_to_parent(struct super_block *sb, struct fid *fid, - int fh_len, int fh_type) -{ - return generic_fh_to_parent(sb, fid, fh_len, fh_type, - ntfs_nfs_get_inode); -} - -/* - * Export operations allowing NFS exporting of mounted NTFS partitions. - * - * We use the default ->encode_fh() for now. Note that they - * use 32 bits to store the inode number which is an unsigned long so on 64-bit - * architectures is usually 64 bits so it would all fail horribly on huge - * volumes. I guess we need to define our own encode and decode fh functions - * that store 64-bit inode numbers at some point but for now we will ignore the - * problem... - * - * We also use the default ->get_name() helper (used by ->decode_fh() via - * fs/exportfs/expfs.c::find_exported_dentry()) as that is completely fs - * independent. 
- * - * The default ->get_parent() just returns -EACCES so we have to provide our - * own and the default ->get_dentry() is incompatible with NTFS due to not - * allowing the inode number 0 which is used in NTFS for the system file $MFT - * and due to using iget() whereas NTFS needs ntfs_iget(). - */ -const struct export_operations ntfs_export_ops = { - .encode_fh = generic_encode_ino32_fh, - .get_parent = ntfs_get_parent, /* Find the parent of a given - directory. */ - .fh_to_dentry = ntfs_fh_to_dentry, - .fh_to_parent = ntfs_fh_to_parent, -}; diff --git a/fs/ntfs/ntfs.h b/fs/ntfs/ntfs.h deleted file mode 100644 index e81376ea91524..0000000000000 --- a/fs/ntfs/ntfs.h +++ /dev/null @@ -1,150 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * ntfs.h - Defines for NTFS Linux kernel driver. - * - * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc. - * Copyright (C) 2002 Richard Russon - */ - -#ifndef _LINUX_NTFS_H -#define _LINUX_NTFS_H - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "types.h" -#include "volume.h" -#include "layout.h" - -typedef enum { - NTFS_BLOCK_SIZE = 512, - NTFS_BLOCK_SIZE_BITS = 9, - NTFS_SB_MAGIC = 0x5346544e, /* 'NTFS' */ - NTFS_MAX_NAME_LEN = 255, - NTFS_MAX_ATTR_NAME_LEN = 255, - NTFS_MAX_CLUSTER_SIZE = 64 * 1024, /* 64kiB */ - NTFS_MAX_PAGES_PER_CLUSTER = NTFS_MAX_CLUSTER_SIZE / PAGE_SIZE, -} NTFS_CONSTANTS; - -/* Global variables. */ - -/* Slab caches (from super.c). */ -extern struct kmem_cache *ntfs_name_cache; -extern struct kmem_cache *ntfs_inode_cache; -extern struct kmem_cache *ntfs_big_inode_cache; -extern struct kmem_cache *ntfs_attr_ctx_cache; -extern struct kmem_cache *ntfs_index_ctx_cache; - -/* The various operations structs defined throughout the driver files. */ -extern const struct address_space_operations ntfs_normal_aops; -extern const struct address_space_operations ntfs_compressed_aops; -extern const struct address_space_operations ntfs_mst_aops; - -extern const struct file_operations ntfs_file_ops; -extern const struct inode_operations ntfs_file_inode_ops; - -extern const struct file_operations ntfs_dir_ops; -extern const struct inode_operations ntfs_dir_inode_ops; - -extern const struct file_operations ntfs_empty_file_ops; -extern const struct inode_operations ntfs_empty_inode_ops; - -extern const struct export_operations ntfs_export_ops; - -/** - * NTFS_SB - return the ntfs volume given a vfs super block - * @sb: VFS super block - * - * NTFS_SB() returns the ntfs volume associated with the VFS super block @sb. - */ -static inline ntfs_volume *NTFS_SB(struct super_block *sb) -{ - return sb->s_fs_info; -} - -/* Declarations of functions and global variables. 
*/ - -/* From fs/ntfs/compress.c */ -extern int ntfs_read_compressed_block(struct page *page); -extern int allocate_compression_buffers(void); -extern void free_compression_buffers(void); - -/* From fs/ntfs/super.c */ -#define default_upcase_len 0x10000 -extern struct mutex ntfs_lock; - -typedef struct { - int val; - char *str; -} option_t; -extern const option_t on_errors_arr[]; - -/* From fs/ntfs/mst.c */ -extern int post_read_mst_fixup(NTFS_RECORD *b, const u32 size); -extern int pre_write_mst_fixup(NTFS_RECORD *b, const u32 size); -extern void post_write_mst_fixup(NTFS_RECORD *b); - -/* From fs/ntfs/unistr.c */ -extern bool ntfs_are_names_equal(const ntfschar *s1, size_t s1_len, - const ntfschar *s2, size_t s2_len, - const IGNORE_CASE_BOOL ic, - const ntfschar *upcase, const u32 upcase_size); -extern int ntfs_collate_names(const ntfschar *name1, const u32 name1_len, - const ntfschar *name2, const u32 name2_len, - const int err_val, const IGNORE_CASE_BOOL ic, - const ntfschar *upcase, const u32 upcase_len); -extern int ntfs_ucsncmp(const ntfschar *s1, const ntfschar *s2, size_t n); -extern int ntfs_ucsncasecmp(const ntfschar *s1, const ntfschar *s2, size_t n, - const ntfschar *upcase, const u32 upcase_size); -extern void ntfs_upcase_name(ntfschar *name, u32 name_len, - const ntfschar *upcase, const u32 upcase_len); -extern void ntfs_file_upcase_value(FILE_NAME_ATTR *file_name_attr, - const ntfschar *upcase, const u32 upcase_len); -extern int ntfs_file_compare_values(FILE_NAME_ATTR *file_name_attr1, - FILE_NAME_ATTR *file_name_attr2, - const int err_val, const IGNORE_CASE_BOOL ic, - const ntfschar *upcase, const u32 upcase_len); -extern int ntfs_nlstoucs(const ntfs_volume *vol, const char *ins, - const int ins_len, ntfschar **outs); -extern int ntfs_ucstonls(const ntfs_volume *vol, const ntfschar *ins, - const int ins_len, unsigned char **outs, int outs_len); - -/* From fs/ntfs/upcase.c */ -extern ntfschar *generate_default_upcase(void); - -static inline int ntfs_ffs(int x) -{ - int r = 1; - - if (!x) - return 0; - if (!(x & 0xffff)) { - x >>= 16; - r += 16; - } - if (!(x & 0xff)) { - x >>= 8; - r += 8; - } - if (!(x & 0xf)) { - x >>= 4; - r += 4; - } - if (!(x & 3)) { - x >>= 2; - r += 2; - } - if (!(x & 1)) { - x >>= 1; - r += 1; - } - return r; -} - -#endif /* _LINUX_NTFS_H */ diff --git a/fs/ntfs/quota.c b/fs/ntfs/quota.c deleted file mode 100644 index 9160480222fd7..0000000000000 --- a/fs/ntfs/quota.c +++ /dev/null @@ -1,103 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * quota.c - NTFS kernel quota ($Quota) handling. Part of the Linux-NTFS - * project. - * - * Copyright (c) 2004 Anton Altaparmakov - */ - -#ifdef NTFS_RW - -#include "index.h" -#include "quota.h" -#include "debug.h" -#include "ntfs.h" - -/** - * ntfs_mark_quotas_out_of_date - mark the quotas out of date on an ntfs volume - * @vol: ntfs volume on which to mark the quotas out of date - * - * Mark the quotas out of date on the ntfs volume @vol and return 'true' on - * success and 'false' on error. 
- */ -bool ntfs_mark_quotas_out_of_date(ntfs_volume *vol) -{ - ntfs_index_context *ictx; - QUOTA_CONTROL_ENTRY *qce; - const le32 qid = QUOTA_DEFAULTS_ID; - int err; - - ntfs_debug("Entering."); - if (NVolQuotaOutOfDate(vol)) - goto done; - if (!vol->quota_ino || !vol->quota_q_ino) { - ntfs_error(vol->sb, "Quota inodes are not open."); - return false; - } - inode_lock(vol->quota_q_ino); - ictx = ntfs_index_ctx_get(NTFS_I(vol->quota_q_ino)); - if (!ictx) { - ntfs_error(vol->sb, "Failed to get index context."); - goto err_out; - } - err = ntfs_index_lookup(&qid, sizeof(qid), ictx); - if (err) { - if (err == -ENOENT) - ntfs_error(vol->sb, "Quota defaults entry is not " - "present."); - else - ntfs_error(vol->sb, "Lookup of quota defaults entry " - "failed."); - goto err_out; - } - if (ictx->data_len < offsetof(QUOTA_CONTROL_ENTRY, sid)) { - ntfs_error(vol->sb, "Quota defaults entry size is invalid. " - "Run chkdsk."); - goto err_out; - } - qce = (QUOTA_CONTROL_ENTRY*)ictx->data; - if (le32_to_cpu(qce->version) != QUOTA_VERSION) { - ntfs_error(vol->sb, "Quota defaults entry version 0x%x is not " - "supported.", le32_to_cpu(qce->version)); - goto err_out; - } - ntfs_debug("Quota defaults flags = 0x%x.", le32_to_cpu(qce->flags)); - /* If quotas are already marked out of date, no need to do anything. */ - if (qce->flags & QUOTA_FLAG_OUT_OF_DATE) - goto set_done; - /* - * If quota tracking is neither requested, nor enabled and there are no - * pending deletes, no need to mark the quotas out of date. - */ - if (!(qce->flags & (QUOTA_FLAG_TRACKING_ENABLED | - QUOTA_FLAG_TRACKING_REQUESTED | - QUOTA_FLAG_PENDING_DELETES))) - goto set_done; - /* - * Set the QUOTA_FLAG_OUT_OF_DATE bit thus marking quotas out of date. - * This is verified on WinXP to be sufficient to cause windows to - * rescan the volume on boot and update all quota entries. - */ - qce->flags |= QUOTA_FLAG_OUT_OF_DATE; - /* Ensure the modified flags are written to disk. */ - ntfs_index_entry_flush_dcache_page(ictx); - ntfs_index_entry_mark_dirty(ictx); -set_done: - ntfs_index_ctx_put(ictx); - inode_unlock(vol->quota_q_ino); - /* - * We set the flag so we do not try to mark the quotas out of date - * again on remount. - */ - NVolSetQuotaOutOfDate(vol); -done: - ntfs_debug("Done."); - return true; -err_out: - if (ictx) - ntfs_index_ctx_put(ictx); - inode_unlock(vol->quota_q_ino); - return false; -} - -#endif /* NTFS_RW */ diff --git a/fs/ntfs/quota.h b/fs/ntfs/quota.h deleted file mode 100644 index fe3132a3d6d22..0000000000000 --- a/fs/ntfs/quota.h +++ /dev/null @@ -1,21 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * quota.h - Defines for NTFS kernel quota ($Quota) handling. Part of the - * Linux-NTFS project. - * - * Copyright (c) 2004 Anton Altaparmakov - */ - -#ifndef _LINUX_NTFS_QUOTA_H -#define _LINUX_NTFS_QUOTA_H - -#ifdef NTFS_RW - -#include "types.h" -#include "volume.h" - -extern bool ntfs_mark_quotas_out_of_date(ntfs_volume *vol); - -#endif /* NTFS_RW */ - -#endif /* _LINUX_NTFS_QUOTA_H */ diff --git a/fs/ntfs/runlist.c b/fs/ntfs/runlist.c deleted file mode 100644 index 0d448e9881f71..0000000000000 --- a/fs/ntfs/runlist.c +++ /dev/null @@ -1,1893 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * runlist.c - NTFS runlist handling code. Part of the Linux-NTFS project. 
- * - * Copyright (c) 2001-2007 Anton Altaparmakov - * Copyright (c) 2002-2005 Richard Russon - */ - -#include "debug.h" -#include "dir.h" -#include "endian.h" -#include "malloc.h" -#include "ntfs.h" - -/** - * ntfs_rl_mm - runlist memmove - * - * It is up to the caller to serialize access to the runlist @base. - */ -static inline void ntfs_rl_mm(runlist_element *base, int dst, int src, - int size) -{ - if (likely((dst != src) && (size > 0))) - memmove(base + dst, base + src, size * sizeof(*base)); -} - -/** - * ntfs_rl_mc - runlist memory copy - * - * It is up to the caller to serialize access to the runlists @dstbase and - * @srcbase. - */ -static inline void ntfs_rl_mc(runlist_element *dstbase, int dst, - runlist_element *srcbase, int src, int size) -{ - if (likely(size > 0)) - memcpy(dstbase + dst, srcbase + src, size * sizeof(*dstbase)); -} - -/** - * ntfs_rl_realloc - Reallocate memory for runlists - * @rl: original runlist - * @old_size: number of runlist elements in the original runlist @rl - * @new_size: number of runlist elements we need space for - * - * As the runlists grow, more memory will be required. To prevent the - * kernel having to allocate and reallocate large numbers of small bits of - * memory, this function returns an entire page of memory. - * - * It is up to the caller to serialize access to the runlist @rl. - * - * N.B. If the new allocation doesn't require a different number of pages in - * memory, the function will return the original pointer. - * - * On success, return a pointer to the newly allocated, or recycled, memory. - * On error, return -errno. The following error codes are defined: - * -ENOMEM - Not enough memory to allocate runlist array. - * -EINVAL - Invalid parameters were passed in. - */ -static inline runlist_element *ntfs_rl_realloc(runlist_element *rl, - int old_size, int new_size) -{ - runlist_element *new_rl; - - old_size = PAGE_ALIGN(old_size * sizeof(*rl)); - new_size = PAGE_ALIGN(new_size * sizeof(*rl)); - if (old_size == new_size) - return rl; - - new_rl = ntfs_malloc_nofs(new_size); - if (unlikely(!new_rl)) - return ERR_PTR(-ENOMEM); - - if (likely(rl != NULL)) { - if (unlikely(old_size > new_size)) - old_size = new_size; - memcpy(new_rl, rl, old_size); - ntfs_free(rl); - } - return new_rl; -} - -/** - * ntfs_rl_realloc_nofail - Reallocate memory for runlists - * @rl: original runlist - * @old_size: number of runlist elements in the original runlist @rl - * @new_size: number of runlist elements we need space for - * - * As the runlists grow, more memory will be required. To prevent the - * kernel having to allocate and reallocate large numbers of small bits of - * memory, this function returns an entire page of memory. - * - * This function guarantees that the allocation will succeed. It will sleep - * for as long as it takes to complete the allocation. - * - * It is up to the caller to serialize access to the runlist @rl. - * - * N.B. If the new allocation doesn't require a different number of pages in - * memory, the function will return the original pointer. - * - * On success, return a pointer to the newly allocated, or recycled, memory. - * On error, return -errno. The following error codes are defined: - * -ENOMEM - Not enough memory to allocate runlist array. - * -EINVAL - Invalid parameters were passed in. 
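A practical consequence of the PAGE_ALIGN() rounding described above is that most calls are no-ops: the array only moves when the rounded allocation size actually changes. A standalone sketch of just the rounding logic, assuming a 4096-byte page and plain malloc() in place of ntfs_malloc_nofs() (rl_realloc_sketch is an illustrative name):

#include <stdlib.h>
#include <string.h>

#define PAGE_SIZE 4096UL
#define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

struct run { long long vcn, lcn, length; };	/* stand-in for runlist_element */

/* Grow or shrink a run array in whole pages; returns the old pointer when
 * the rounded allocation size is unchanged, as ntfs_rl_realloc() does. */
static struct run *rl_realloc_sketch(struct run *rl, int old_n, int new_n)
{
	size_t old_size = PAGE_ALIGN(old_n * sizeof(*rl));
	size_t new_size = PAGE_ALIGN(new_n * sizeof(*rl));
	struct run *new_rl;

	if (old_size == new_size)
		return rl;		/* same number of pages: no-op */
	new_rl = malloc(new_size);
	if (!new_rl)
		return NULL;
	if (rl) {
		memcpy(new_rl, rl, old_size < new_size ? old_size : new_size);
		free(rl);
	}
	return new_rl;
}

int main(void)
{
	struct run *rl = rl_realloc_sketch(NULL, 0, 10);	/* one page */
	struct run *rl2 = rl_realloc_sketch(rl, 10, 40);	/* still one page */
	int same = (rl2 == rl);

	free(rl2);
	return same ? 0 : 1;	/* same pointer: no reallocation happened */
}

With 24-byte elements, growing from 10 to 40 runs stays inside one page and returns the original pointer; only crossing a page boundary costs an allocation and a copy.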
- */ -static inline runlist_element *ntfs_rl_realloc_nofail(runlist_element *rl, - int old_size, int new_size) -{ - runlist_element *new_rl; - - old_size = PAGE_ALIGN(old_size * sizeof(*rl)); - new_size = PAGE_ALIGN(new_size * sizeof(*rl)); - if (old_size == new_size) - return rl; - - new_rl = ntfs_malloc_nofs_nofail(new_size); - BUG_ON(!new_rl); - - if (likely(rl != NULL)) { - if (unlikely(old_size > new_size)) - old_size = new_size; - memcpy(new_rl, rl, old_size); - ntfs_free(rl); - } - return new_rl; -} - -/** - * ntfs_are_rl_mergeable - test if two runlists can be joined together - * @dst: original runlist - * @src: new runlist to test for mergeability with @dst - * - * Test if two runlists can be joined together. For this, their VCNs and LCNs - * must be adjacent. - * - * It is up to the caller to serialize access to the runlists @dst and @src. - * - * Return: true Success, the runlists can be merged. - * false Failure, the runlists cannot be merged. - */ -static inline bool ntfs_are_rl_mergeable(runlist_element *dst, - runlist_element *src) -{ - BUG_ON(!dst); - BUG_ON(!src); - - /* We can merge unmapped regions even if they are misaligned. */ - if ((dst->lcn == LCN_RL_NOT_MAPPED) && (src->lcn == LCN_RL_NOT_MAPPED)) - return true; - /* If the runs are misaligned, we cannot merge them. */ - if ((dst->vcn + dst->length) != src->vcn) - return false; - /* If both runs are non-sparse and contiguous, we can merge them. */ - if ((dst->lcn >= 0) && (src->lcn >= 0) && - ((dst->lcn + dst->length) == src->lcn)) - return true; - /* If we are merging two holes, we can merge them. */ - if ((dst->lcn == LCN_HOLE) && (src->lcn == LCN_HOLE)) - return true; - /* Cannot merge. */ - return false; -} - -/** - * __ntfs_rl_merge - merge two runlists without testing if they can be merged - * @dst: original, destination runlist - * @src: new runlist to merge with @dst - * - * Merge the two runlists, writing into the destination runlist @dst. The - * caller must make sure the runlists can be merged or this will corrupt the - * destination runlist. - * - * It is up to the caller to serialize access to the runlists @dst and @src. - */ -static inline void __ntfs_rl_merge(runlist_element *dst, runlist_element *src) -{ - dst->length += src->length; -} - -/** - * ntfs_rl_append - append a runlist after a given element - * @dst: original runlist to be worked on - * @dsize: number of elements in @dst (including end marker) - * @src: runlist to be inserted into @dst - * @ssize: number of elements in @src (excluding end marker) - * @loc: append the new runlist @src after this element in @dst - * - * Append the runlist @src after element @loc in @dst. Merge the right end of - * the new runlist, if necessary. Adjust the size of the hole before the - * appended runlist. - * - * It is up to the caller to serialize access to the runlists @dst and @src. - * - * On success, return a pointer to the new, combined, runlist. Note, both - * runlists @dst and @src are deallocated before returning so you cannot use - * the pointers for anything any more. (Strictly speaking the returned runlist - * may be the same as @dst but this is irrelevant.) - * - * On error, return -errno. Both runlists are left unmodified. The following - * error codes are defined: - * -ENOMEM - Not enough memory to allocate runlist array. - * -EINVAL - Invalid parameters were passed in. 
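In other words, two runs merge only when they are VCN-adjacent and either physically contiguous, both holes, or both unmapped (the one case where misalignment is tolerated). A self-contained restatement with concrete runs (runs_mergeable() is an illustrative name; the sentinel values mirror LCN_HOLE and LCN_RL_NOT_MAPPED):

#include <stdbool.h>

#define LCN_HOLE	(-1LL)		/* sparse run */
#define LCN_RL_NOT_MAPPED (-2LL)	/* not yet read from disk */

struct run { long long vcn, lcn, length; };

static bool runs_mergeable(const struct run *dst, const struct run *src)
{
	/* Unmapped regions merge even when misaligned. */
	if (dst->lcn == LCN_RL_NOT_MAPPED && src->lcn == LCN_RL_NOT_MAPPED)
		return true;
	if (dst->vcn + dst->length != src->vcn)
		return false;		/* not VCN-adjacent */
	if (dst->lcn >= 0 && src->lcn >= 0 &&
	    dst->lcn + dst->length == src->lcn)
		return true;		/* physically contiguous */
	return dst->lcn == LCN_HOLE && src->lcn == LCN_HOLE;
}

int main(void)
{
	struct run a = { .vcn = 0, .lcn = 100, .length = 8 };
	struct run b = { .vcn = 8, .lcn = 108, .length = 4 };
	struct run c = { .vcn = 8, .lcn = 200, .length = 4 };

	/* a+b merge (vcn 0..11 -> lcn 100..111); a+c do not (lcn jump). */
	return runs_mergeable(&a, &b) && !runs_mergeable(&a, &c) ? 0 : 1;
}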
- */ -static inline runlist_element *ntfs_rl_append(runlist_element *dst, - int dsize, runlist_element *src, int ssize, int loc) -{ - bool right = false; /* Right end of @src needs merging. */ - int marker; /* End of the inserted runs. */ - - BUG_ON(!dst); - BUG_ON(!src); - - /* First, check if the right hand end needs merging. */ - if ((loc + 1) < dsize) - right = ntfs_are_rl_mergeable(src + ssize - 1, dst + loc + 1); - - /* Space required: @dst size + @src size, less one if we merged. */ - dst = ntfs_rl_realloc(dst, dsize, dsize + ssize - right); - if (IS_ERR(dst)) - return dst; - /* - * We are guaranteed to succeed from here so can start modifying the - * original runlists. - */ - - /* First, merge the right hand end, if necessary. */ - if (right) - __ntfs_rl_merge(src + ssize - 1, dst + loc + 1); - - /* First run after the @src runs that have been inserted. */ - marker = loc + ssize + 1; - - /* Move the tail of @dst out of the way, then copy in @src. */ - ntfs_rl_mm(dst, marker, loc + 1 + right, dsize - (loc + 1 + right)); - ntfs_rl_mc(dst, loc + 1, src, 0, ssize); - - /* Adjust the size of the preceding hole. */ - dst[loc].length = dst[loc + 1].vcn - dst[loc].vcn; - - /* We may have changed the length of the file, so fix the end marker */ - if (dst[marker].lcn == LCN_ENOENT) - dst[marker].vcn = dst[marker - 1].vcn + dst[marker - 1].length; - - return dst; -} - -/** - * ntfs_rl_insert - insert a runlist into another - * @dst: original runlist to be worked on - * @dsize: number of elements in @dst (including end marker) - * @src: new runlist to be inserted - * @ssize: number of elements in @src (excluding end marker) - * @loc: insert the new runlist @src before this element in @dst - * - * Insert the runlist @src before element @loc in the runlist @dst. Merge the - * left end of the new runlist, if necessary. Adjust the size of the hole - * after the inserted runlist. - * - * It is up to the caller to serialize access to the runlists @dst and @src. - * - * On success, return a pointer to the new, combined, runlist. Note, both - * runlists @dst and @src are deallocated before returning so you cannot use - * the pointers for anything any more. (Strictly speaking the returned runlist - * may be the same as @dst but this is irrelevant.) - * - * On error, return -errno. Both runlists are left unmodified. The following - * error codes are defined: - * -ENOMEM - Not enough memory to allocate runlist array. - * -EINVAL - Invalid parameters were passed in. - */ -static inline runlist_element *ntfs_rl_insert(runlist_element *dst, - int dsize, runlist_element *src, int ssize, int loc) -{ - bool left = false; /* Left end of @src needs merging. */ - bool disc = false; /* Discontinuity between @dst and @src. */ - int marker; /* End of the inserted runs. */ - - BUG_ON(!dst); - BUG_ON(!src); - - /* - * disc => Discontinuity between the end of @dst and the start of @src. - * This means we might need to insert a "not mapped" run. - */ - if (loc == 0) - disc = (src[0].vcn > 0); - else { - s64 merged_length; - - left = ntfs_are_rl_mergeable(dst + loc - 1, src); - - merged_length = dst[loc - 1].length; - if (left) - merged_length += src->length; - - disc = (src[0].vcn > dst[loc - 1].vcn + merged_length); - } - /* - * Space required: @dst size + @src size, less one if we merged, plus - * one if there was a discontinuity. 
- */ - dst = ntfs_rl_realloc(dst, dsize, dsize + ssize - left + disc); - if (IS_ERR(dst)) - return dst; - /* - * We are guaranteed to succeed from here so can start modifying the - * original runlist. - */ - if (left) - __ntfs_rl_merge(dst + loc - 1, src); - /* - * First run after the @src runs that have been inserted. - * Nominally, @marker equals @loc + @ssize, i.e. location + number of - * runs in @src. However, if @left, then the first run in @src has - * been merged with one in @dst. And if @disc, then @dst and @src do - * not meet and we need an extra run to fill the gap. - */ - marker = loc + ssize - left + disc; - - /* Move the tail of @dst out of the way, then copy in @src. */ - ntfs_rl_mm(dst, marker, loc, dsize - loc); - ntfs_rl_mc(dst, loc + disc, src, left, ssize - left); - - /* Adjust the VCN of the first run after the insertion... */ - dst[marker].vcn = dst[marker - 1].vcn + dst[marker - 1].length; - /* ... and the length. */ - if (dst[marker].lcn == LCN_HOLE || dst[marker].lcn == LCN_RL_NOT_MAPPED) - dst[marker].length = dst[marker + 1].vcn - dst[marker].vcn; - - /* Writing beyond the end of the file and there is a discontinuity. */ - if (disc) { - if (loc > 0) { - dst[loc].vcn = dst[loc - 1].vcn + dst[loc - 1].length; - dst[loc].length = dst[loc + 1].vcn - dst[loc].vcn; - } else { - dst[loc].vcn = 0; - dst[loc].length = dst[loc + 1].vcn; - } - dst[loc].lcn = LCN_RL_NOT_MAPPED; - } - return dst; -} - -/** - * ntfs_rl_replace - overwrite a runlist element with another runlist - * @dst: original runlist to be worked on - * @dsize: number of elements in @dst (including end marker) - * @src: new runlist to be inserted - * @ssize: number of elements in @src (excluding end marker) - * @loc: index in runlist @dst to overwrite with @src - * - * Replace the runlist element @dst at @loc with @src. Merge the left and - * right ends of the inserted runlist, if necessary. - * - * It is up to the caller to serialize access to the runlists @dst and @src. - * - * On success, return a pointer to the new, combined, runlist. Note, both - * runlists @dst and @src are deallocated before returning so you cannot use - * the pointers for anything any more. (Strictly speaking the returned runlist - * may be the same as @dst but this is irrelevant.) - * - * On error, return -errno. Both runlists are left unmodified. The following - * error codes are defined: - * -ENOMEM - Not enough memory to allocate runlist array. - * -EINVAL - Invalid parameters were passed in. - */ -static inline runlist_element *ntfs_rl_replace(runlist_element *dst, - int dsize, runlist_element *src, int ssize, int loc) -{ - signed delta; - bool left = false; /* Left end of @src needs merging. */ - bool right = false; /* Right end of @src needs merging. */ - int tail; /* Start of tail of @dst. */ - int marker; /* End of the inserted runs. */ - - BUG_ON(!dst); - BUG_ON(!src); - - /* First, see if the left and right ends need merging. */ - if ((loc + 1) < dsize) - right = ntfs_are_rl_mergeable(src + ssize - 1, dst + loc + 1); - if (loc > 0) - left = ntfs_are_rl_mergeable(dst + loc - 1, src); - /* - * Allocate some space. We will need less if the left, right, or both - * ends get merged. The -1 accounts for the run being replaced. - */ - delta = ssize - 1 - left - right; - if (delta > 0) { - dst = ntfs_rl_realloc(dst, dsize, dsize + delta); - if (IS_ERR(dst)) - return dst; - } - /* - * We are guaranteed to succeed from here so can start modifying the - * original runlists. 
- */
-
-	/* First, merge the left and right ends, if necessary. */
-	if (right)
-		__ntfs_rl_merge(src + ssize - 1, dst + loc + 1);
-	if (left)
-		__ntfs_rl_merge(dst + loc - 1, src);
-	/*
-	 * Offset of the tail of @dst. This needs to be moved out of the way
-	 * to make space for the runs to be copied from @src, i.e. the first
-	 * run of the tail of @dst.
-	 * Nominally, @tail equals @loc + 1, i.e. location, skipping the
-	 * replaced run. However, if @right, then one of @dst's runs is
-	 * already merged into @src.
-	 */
-	tail = loc + right + 1;
-	/*
-	 * First run after the @src runs that have been inserted, i.e. where
-	 * the tail of @dst needs to be moved to.
-	 * Nominally, @marker equals @loc + @ssize, i.e. location + number of
-	 * runs in @src. However, if @left, then the first run in @src has
-	 * been merged with one in @dst.
-	 */
-	marker = loc + ssize - left;
-
-	/* Move the tail of @dst out of the way, then copy in @src. */
-	ntfs_rl_mm(dst, marker, tail, dsize - tail);
-	ntfs_rl_mc(dst, loc, src, left, ssize - left);
-
-	/* We may have changed the length of the file, so fix the end marker. */
-	if (dsize - tail > 0 && dst[marker].lcn == LCN_ENOENT)
-		dst[marker].vcn = dst[marker - 1].vcn + dst[marker - 1].length;
-	return dst;
-}
-
-/**
- * ntfs_rl_split - insert a runlist into the centre of a hole
- * @dst:	original runlist to be worked on
- * @dsize:	number of elements in @dst (including end marker)
- * @src:	new runlist to be inserted
- * @ssize:	number of elements in @src (excluding end marker)
- * @loc:	index in runlist @dst at which to split and insert @src
- *
- * Split the runlist @dst at @loc into two and insert @src in between the two
- * fragments. No merging of runlists is necessary. Adjust the size of the
- * holes either side.
- *
- * It is up to the caller to serialize access to the runlists @dst and @src.
- *
- * On success, return a pointer to the new, combined, runlist. Note, both
- * runlists @dst and @src are deallocated before returning so you cannot use
- * the pointers for anything any more. (Strictly speaking the returned runlist
- * may be the same as @dst but this is irrelevant.)
- *
- * On error, return -errno. Both runlists are left unmodified. The following
- * error codes are defined:
- *	-ENOMEM	- Not enough memory to allocate runlist array.
- *	-EINVAL	- Invalid parameters were passed in.
- */
-static inline runlist_element *ntfs_rl_split(runlist_element *dst, int dsize,
-		runlist_element *src, int ssize, int loc)
-{
-	BUG_ON(!dst);
-	BUG_ON(!src);
-
-	/* Space required: @dst size + @src size + one new hole. */
-	dst = ntfs_rl_realloc(dst, dsize, dsize + ssize + 1);
-	if (IS_ERR(dst))
-		return dst;
-	/*
-	 * We are guaranteed to succeed from here so can start modifying the
-	 * original runlists.
-	 */
-
-	/* Move the tail of @dst out of the way, then copy in @src. */
-	ntfs_rl_mm(dst, loc + 1 + ssize, loc, dsize - loc);
-	ntfs_rl_mc(dst, loc + 1, src, 0, ssize);
-
-	/* Adjust the size of the holes either side of @src. */
-	dst[loc].length = dst[loc+1].vcn - dst[loc].vcn;
-	dst[loc+ssize+1].vcn = dst[loc+ssize].vcn + dst[loc+ssize].length;
-	dst[loc+ssize+1].length = dst[loc+ssize+2].vcn - dst[loc+ssize+1].vcn;
-
-	return dst;
-}
-
-/**
- * ntfs_runlists_merge - merge two runlists into one
- * @drl:	original runlist to be worked on
- * @srl:	new runlist to be merged into @drl
- *
- * First we sanity check the two runlists @srl and @drl to make sure that they
- * are sensible and can be merged.
The runlist @srl must be either after the - * runlist @drl or completely within a hole (or unmapped region) in @drl. - * - * It is up to the caller to serialize access to the runlists @drl and @srl. - * - * Merging of runlists is necessary in two cases: - * 1. When attribute lists are used and a further extent is being mapped. - * 2. When new clusters are allocated to fill a hole or extend a file. - * - * There are four possible ways @srl can be merged. It can: - * - be inserted at the beginning of a hole, - * - split the hole in two and be inserted between the two fragments, - * - be appended at the end of a hole, or it can - * - replace the whole hole. - * It can also be appended to the end of the runlist, which is just a variant - * of the insert case. - * - * On success, return a pointer to the new, combined, runlist. Note, both - * runlists @drl and @srl are deallocated before returning so you cannot use - * the pointers for anything any more. (Strictly speaking the returned runlist - * may be the same as @dst but this is irrelevant.) - * - * On error, return -errno. Both runlists are left unmodified. The following - * error codes are defined: - * -ENOMEM - Not enough memory to allocate runlist array. - * -EINVAL - Invalid parameters were passed in. - * -ERANGE - The runlists overlap and cannot be merged. - */ -runlist_element *ntfs_runlists_merge(runlist_element *drl, - runlist_element *srl) -{ - int di, si; /* Current index into @[ds]rl. */ - int sstart; /* First index with lcn > LCN_RL_NOT_MAPPED. */ - int dins; /* Index into @drl at which to insert @srl. */ - int dend, send; /* Last index into @[ds]rl. */ - int dfinal, sfinal; /* The last index into @[ds]rl with - lcn >= LCN_HOLE. */ - int marker = 0; - VCN marker_vcn = 0; - -#ifdef DEBUG - ntfs_debug("dst:"); - ntfs_debug_dump_runlist(drl); - ntfs_debug("src:"); - ntfs_debug_dump_runlist(srl); -#endif - - /* Check for silly calling... */ - if (unlikely(!srl)) - return drl; - if (IS_ERR(srl) || IS_ERR(drl)) - return ERR_PTR(-EINVAL); - - /* Check for the case where the first mapping is being done now. */ - if (unlikely(!drl)) { - drl = srl; - /* Complete the source runlist if necessary. */ - if (unlikely(drl[0].vcn)) { - /* Scan to the end of the source runlist. */ - for (dend = 0; likely(drl[dend].length); dend++) - ; - dend++; - drl = ntfs_rl_realloc(drl, dend, dend + 1); - if (IS_ERR(drl)) - return drl; - /* Insert start element at the front of the runlist. */ - ntfs_rl_mm(drl, 1, 0, dend); - drl[0].vcn = 0; - drl[0].lcn = LCN_RL_NOT_MAPPED; - drl[0].length = drl[1].vcn; - } - goto finished; - } - - si = di = 0; - - /* Skip any unmapped start element(s) in the source runlist. */ - while (srl[si].length && srl[si].lcn < LCN_HOLE) - si++; - - /* Can't have an entirely unmapped source runlist. */ - BUG_ON(!srl[si].length); - - /* Record the starting points. */ - sstart = si; - - /* - * Skip forward in @drl until we reach the position where @srl needs to - * be inserted. If we reach the end of @drl, @srl just needs to be - * appended to @drl. - */ - for (; drl[di].length; di++) { - if (drl[di].vcn + drl[di].length > srl[sstart].vcn) - break; - } - dins = di; - - /* Sanity check for illegal overlaps. */ - if ((drl[di].vcn == srl[si].vcn) && (drl[di].lcn >= 0) && - (srl[si].lcn >= 0)) { - ntfs_error(NULL, "Run lists overlap. Cannot merge!"); - return ERR_PTR(-ERANGE); - } - - /* Scan to the end of both runlists in order to know their sizes. 
*/ - for (send = si; srl[send].length; send++) - ; - for (dend = di; drl[dend].length; dend++) - ; - - if (srl[send].lcn == LCN_ENOENT) - marker_vcn = srl[marker = send].vcn; - - /* Scan to the last element with lcn >= LCN_HOLE. */ - for (sfinal = send; sfinal >= 0 && srl[sfinal].lcn < LCN_HOLE; sfinal--) - ; - for (dfinal = dend; dfinal >= 0 && drl[dfinal].lcn < LCN_HOLE; dfinal--) - ; - - { - bool start; - bool finish; - int ds = dend + 1; /* Number of elements in drl & srl */ - int ss = sfinal - sstart + 1; - - start = ((drl[dins].lcn < LCN_RL_NOT_MAPPED) || /* End of file */ - (drl[dins].vcn == srl[sstart].vcn)); /* Start of hole */ - finish = ((drl[dins].lcn >= LCN_RL_NOT_MAPPED) && /* End of file */ - ((drl[dins].vcn + drl[dins].length) <= /* End of hole */ - (srl[send - 1].vcn + srl[send - 1].length))); - - /* Or we will lose an end marker. */ - if (finish && !drl[dins].length) - ss++; - if (marker && (drl[dins].vcn + drl[dins].length > srl[send - 1].vcn)) - finish = false; -#if 0 - ntfs_debug("dfinal = %i, dend = %i", dfinal, dend); - ntfs_debug("sstart = %i, sfinal = %i, send = %i", sstart, sfinal, send); - ntfs_debug("start = %i, finish = %i", start, finish); - ntfs_debug("ds = %i, ss = %i, dins = %i", ds, ss, dins); -#endif - if (start) { - if (finish) - drl = ntfs_rl_replace(drl, ds, srl + sstart, ss, dins); - else - drl = ntfs_rl_insert(drl, ds, srl + sstart, ss, dins); - } else { - if (finish) - drl = ntfs_rl_append(drl, ds, srl + sstart, ss, dins); - else - drl = ntfs_rl_split(drl, ds, srl + sstart, ss, dins); - } - if (IS_ERR(drl)) { - ntfs_error(NULL, "Merge failed."); - return drl; - } - ntfs_free(srl); - if (marker) { - ntfs_debug("Triggering marker code."); - for (ds = dend; drl[ds].length; ds++) - ; - /* We only need to care if @srl ended after @drl. */ - if (drl[ds].vcn <= marker_vcn) { - int slots = 0; - - if (drl[ds].vcn == marker_vcn) { - ntfs_debug("Old marker = 0x%llx, replacing " - "with LCN_ENOENT.", - (unsigned long long) - drl[ds].lcn); - drl[ds].lcn = LCN_ENOENT; - goto finished; - } - /* - * We need to create an unmapped runlist element in - * @drl or extend an existing one before adding the - * ENOENT terminator. - */ - if (drl[ds].lcn == LCN_ENOENT) { - ds--; - slots = 1; - } - if (drl[ds].lcn != LCN_RL_NOT_MAPPED) { - /* Add an unmapped runlist element. */ - if (!slots) { - drl = ntfs_rl_realloc_nofail(drl, ds, - ds + 2); - slots = 2; - } - ds++; - /* Need to set vcn if it isn't set already. */ - if (slots != 1) - drl[ds].vcn = drl[ds - 1].vcn + - drl[ds - 1].length; - drl[ds].lcn = LCN_RL_NOT_MAPPED; - /* We now used up a slot. */ - slots--; - } - drl[ds].length = marker_vcn - drl[ds].vcn; - /* Finally add the ENOENT terminator. */ - ds++; - if (!slots) - drl = ntfs_rl_realloc_nofail(drl, ds, ds + 1); - drl[ds].vcn = marker_vcn; - drl[ds].lcn = LCN_ENOENT; - drl[ds].length = (s64)0; - } - } - } - -finished: - /* The merge was completed successfully. */ - ntfs_debug("Merged runlist:"); - ntfs_debug_dump_runlist(drl); - return drl; -} - -/** - * ntfs_mapping_pairs_decompress - convert mapping pairs array to runlist - * @vol: ntfs volume on which the attribute resides - * @attr: attribute record whose mapping pairs array to decompress - * @old_rl: optional runlist in which to insert @attr's runlist - * - * It is up to the caller to serialize access to the runlist @old_rl. - * - * Decompress the attribute @attr's mapping pairs array into a runlist. On - * success, return the decompressed runlist. 
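The start/finish pair computed above is the whole story: it selects one of the four placement helpers. A compact standalone restatement of that dispatch, with one concrete case in main() (pick_merge_op and the enum are illustrative names):

#include <stdio.h>
#include <stdbool.h>

enum merge_op { MERGE_REPLACE, MERGE_INSERT, MERGE_APPEND, MERGE_SPLIT };

/*
 * start  == true: @srl begins exactly where the hole in @drl begins
 *                 (or we are at the end of the file).
 * finish == true: @srl also covers the hole right up to its end.
 */
static enum merge_op pick_merge_op(bool start, bool finish)
{
	if (start)
		return finish ? MERGE_REPLACE	/* srl covers the whole hole */
			      : MERGE_INSERT;	/* srl covers its front part */
	return finish ? MERGE_APPEND		/* srl covers its tail part */
		      : MERGE_SPLIT;		/* srl sits in the middle   */
}

int main(void)
{
	/* E.g. a hole spanning vcn 10..29 and new runs for vcn 15..24:
	 * neither boundary is covered, so the hole must be split in two. */
	printf("%d\n", pick_merge_op(false, false) == MERGE_SPLIT);
	return 0;
}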
- *
- * If @old_rl is not NULL, the decompressed runlist is inserted into the
- * appropriate place in @old_rl and the resultant, combined runlist is
- * returned. The original @old_rl is deallocated.
- *
- * On error, return -errno. @old_rl is left unmodified in that case.
- *
- * The following error codes are defined:
- *	-ENOMEM	- Not enough memory to allocate runlist array.
- *	-EIO	- Corrupt runlist.
- *	-EINVAL	- Invalid parameters were passed in.
- *	-ERANGE	- The two runlists overlap.
- *
- * FIXME: For now we take the conceptually simplest approach of creating the
- * new runlist disregarding the already existing one and then splicing the
- * two into one, if that is possible (we check for overlap and discard the new
- * runlist if overlap present before returning ERR_PTR(-ERANGE)).
- */
-runlist_element *ntfs_mapping_pairs_decompress(const ntfs_volume *vol,
-		const ATTR_RECORD *attr, runlist_element *old_rl)
-{
-	VCN vcn;		/* Current vcn. */
-	LCN lcn;		/* Current lcn. */
-	s64 deltaxcn;		/* Change in [vl]cn. */
-	runlist_element *rl;	/* The output runlist. */
-	u8 *buf;		/* Current position in mapping pairs array. */
-	u8 *attr_end;		/* End of attribute. */
-	int rlsize;		/* Size of runlist buffer. */
-	u16 rlpos;		/* Current runlist position in units of
-				   runlist_elements. */
-	u8 b;			/* Current byte offset in buf. */
-
-#ifdef DEBUG
-	/* Make sure attr exists and is non-resident. */
-	if (!attr || !attr->non_resident || sle64_to_cpu(
-			attr->data.non_resident.lowest_vcn) < (VCN)0) {
-		ntfs_error(vol->sb, "Invalid arguments.");
-		return ERR_PTR(-EINVAL);
-	}
-#endif
-	/* Start at vcn = lowest_vcn and lcn 0. */
-	vcn = sle64_to_cpu(attr->data.non_resident.lowest_vcn);
-	lcn = 0;
-	/* Get start of the mapping pairs array. */
-	buf = (u8*)attr + le16_to_cpu(
-			attr->data.non_resident.mapping_pairs_offset);
-	attr_end = (u8*)attr + le32_to_cpu(attr->length);
-	if (unlikely(buf < (u8*)attr || buf > attr_end)) {
-		ntfs_error(vol->sb, "Corrupt attribute.");
-		return ERR_PTR(-EIO);
-	}
-	/* If the mapping pairs array is valid but empty, nothing to do. */
-	if (!vcn && !*buf)
-		return old_rl;
-	/* Current position in runlist array. */
-	rlpos = 0;
-	/* Allocate first page and set current runlist size to one page. */
-	rl = ntfs_malloc_nofs(rlsize = PAGE_SIZE);
-	if (unlikely(!rl))
-		return ERR_PTR(-ENOMEM);
-	/* Insert unmapped starting element if necessary. */
-	if (vcn) {
-		rl->vcn = 0;
-		rl->lcn = LCN_RL_NOT_MAPPED;
-		rl->length = vcn;
-		rlpos++;
-	}
-	while (buf < attr_end && *buf) {
-		/*
-		 * Allocate more memory if needed, including space for the
-		 * not-mapped and terminator elements. ntfs_malloc_nofs()
-		 * operates on whole pages only.
-		 */
-		if (((rlpos + 3) * sizeof(*old_rl)) > rlsize) {
-			runlist_element *rl2;
-
-			rl2 = ntfs_malloc_nofs(rlsize + (int)PAGE_SIZE);
-			if (unlikely(!rl2)) {
-				ntfs_free(rl);
-				return ERR_PTR(-ENOMEM);
-			}
-			memcpy(rl2, rl, rlsize);
-			ntfs_free(rl);
-			rl = rl2;
-			rlsize += PAGE_SIZE;
-		}
-		/* Enter the current vcn into the current runlist element. */
-		rl[rlpos].vcn = vcn;
-		/*
-		 * Get the change in vcn, i.e. the run length in clusters.
-		 * Doing it this way ensures that we sign-extend negative
-		 * values. A negative run length doesn't make any sense, but
-		 * hey, I didn't make up the NTFS specs and Windows NT4 treats
-		 * the run length as a signed value so that's how it is...
- */ - b = *buf & 0xf; - if (b) { - if (unlikely(buf + b > attr_end)) - goto io_error; - for (deltaxcn = (s8)buf[b--]; b; b--) - deltaxcn = (deltaxcn << 8) + buf[b]; - } else { /* The length entry is compulsory. */ - ntfs_error(vol->sb, "Missing length entry in mapping " - "pairs array."); - deltaxcn = (s64)-1; - } - /* - * Assume a negative length to indicate data corruption and - * hence clean-up and return NULL. - */ - if (unlikely(deltaxcn < 0)) { - ntfs_error(vol->sb, "Invalid length in mapping pairs " - "array."); - goto err_out; - } - /* - * Enter the current run length into the current runlist - * element. - */ - rl[rlpos].length = deltaxcn; - /* Increment the current vcn by the current run length. */ - vcn += deltaxcn; - /* - * There might be no lcn change at all, as is the case for - * sparse clusters on NTFS 3.0+, in which case we set the lcn - * to LCN_HOLE. - */ - if (!(*buf & 0xf0)) - rl[rlpos].lcn = LCN_HOLE; - else { - /* Get the lcn change which really can be negative. */ - u8 b2 = *buf & 0xf; - b = b2 + ((*buf >> 4) & 0xf); - if (buf + b > attr_end) - goto io_error; - for (deltaxcn = (s8)buf[b--]; b > b2; b--) - deltaxcn = (deltaxcn << 8) + buf[b]; - /* Change the current lcn to its new value. */ - lcn += deltaxcn; -#ifdef DEBUG - /* - * On NTFS 1.2-, apparently can have lcn == -1 to - * indicate a hole. But we haven't verified ourselves - * whether it is really the lcn or the deltaxcn that is - * -1. So if either is found give us a message so we - * can investigate it further! - */ - if (vol->major_ver < 3) { - if (unlikely(deltaxcn == (LCN)-1)) - ntfs_error(vol->sb, "lcn delta == -1"); - if (unlikely(lcn == (LCN)-1)) - ntfs_error(vol->sb, "lcn == -1"); - } -#endif - /* Check lcn is not below -1. */ - if (unlikely(lcn < (LCN)-1)) { - ntfs_error(vol->sb, "Invalid LCN < -1 in " - "mapping pairs array."); - goto err_out; - } - /* Enter the current lcn into the runlist element. */ - rl[rlpos].lcn = lcn; - } - /* Get to the next runlist element. */ - rlpos++; - /* Increment the buffer position to the next mapping pair. */ - buf += (*buf & 0xf) + ((*buf >> 4) & 0xf) + 1; - } - if (unlikely(buf >= attr_end)) - goto io_error; - /* - * If there is a highest_vcn specified, it must be equal to the final - * vcn in the runlist - 1, or something has gone badly wrong. - */ - deltaxcn = sle64_to_cpu(attr->data.non_resident.highest_vcn); - if (unlikely(deltaxcn && vcn - 1 != deltaxcn)) { -mpa_err: - ntfs_error(vol->sb, "Corrupt mapping pairs array in " - "non-resident attribute."); - goto err_out; - } - /* Setup not mapped runlist element if this is the base extent. */ - if (!attr->data.non_resident.lowest_vcn) { - VCN max_cluster; - - max_cluster = ((sle64_to_cpu( - attr->data.non_resident.allocated_size) + - vol->cluster_size - 1) >> - vol->cluster_size_bits) - 1; - /* - * A highest_vcn of zero means this is a single extent - * attribute so simply terminate the runlist with LCN_ENOENT). - */ - if (deltaxcn) { - /* - * If there is a difference between the highest_vcn and - * the highest cluster, the runlist is either corrupt - * or, more likely, there are more extents following - * this one. 
- */ - if (deltaxcn < max_cluster) { - ntfs_debug("More extents to follow; deltaxcn " - "= 0x%llx, max_cluster = " - "0x%llx", - (unsigned long long)deltaxcn, - (unsigned long long) - max_cluster); - rl[rlpos].vcn = vcn; - vcn += rl[rlpos].length = max_cluster - - deltaxcn; - rl[rlpos].lcn = LCN_RL_NOT_MAPPED; - rlpos++; - } else if (unlikely(deltaxcn > max_cluster)) { - ntfs_error(vol->sb, "Corrupt attribute. " - "deltaxcn = 0x%llx, " - "max_cluster = 0x%llx", - (unsigned long long)deltaxcn, - (unsigned long long) - max_cluster); - goto mpa_err; - } - } - rl[rlpos].lcn = LCN_ENOENT; - } else /* Not the base extent. There may be more extents to follow. */ - rl[rlpos].lcn = LCN_RL_NOT_MAPPED; - - /* Setup terminating runlist element. */ - rl[rlpos].vcn = vcn; - rl[rlpos].length = (s64)0; - /* If no existing runlist was specified, we are done. */ - if (!old_rl) { - ntfs_debug("Mapping pairs array successfully decompressed:"); - ntfs_debug_dump_runlist(rl); - return rl; - } - /* Now combine the new and old runlists checking for overlaps. */ - old_rl = ntfs_runlists_merge(old_rl, rl); - if (!IS_ERR(old_rl)) - return old_rl; - ntfs_free(rl); - ntfs_error(vol->sb, "Failed to merge runlists."); - return old_rl; -io_error: - ntfs_error(vol->sb, "Corrupt attribute."); -err_out: - ntfs_free(rl); - return ERR_PTR(-EIO); -} - -/** - * ntfs_rl_vcn_to_lcn - convert a vcn into a lcn given a runlist - * @rl: runlist to use for conversion - * @vcn: vcn to convert - * - * Convert the virtual cluster number @vcn of an attribute into a logical - * cluster number (lcn) of a device using the runlist @rl to map vcns to their - * corresponding lcns. - * - * It is up to the caller to serialize access to the runlist @rl. - * - * Since lcns must be >= 0, we use negative return codes with special meaning: - * - * Return code Meaning / Description - * ================================================== - * LCN_HOLE Hole / not allocated on disk. - * LCN_RL_NOT_MAPPED This is part of the runlist which has not been - * inserted into the runlist yet. - * LCN_ENOENT There is no such vcn in the attribute. - * - * Locking: - The caller must have locked the runlist (for reading or writing). - * - This function does not touch the lock, nor does it modify the - * runlist. - */ -LCN ntfs_rl_vcn_to_lcn(const runlist_element *rl, const VCN vcn) -{ - int i; - - BUG_ON(vcn < 0); - /* - * If rl is NULL, assume that we have found an unmapped runlist. The - * caller can then attempt to map it and fail appropriately if - * necessary. - */ - if (unlikely(!rl)) - return LCN_RL_NOT_MAPPED; - - /* Catch out of lower bounds vcn. */ - if (unlikely(vcn < rl[0].vcn)) - return LCN_ENOENT; - - for (i = 0; likely(rl[i].length); i++) { - if (unlikely(vcn < rl[i+1].vcn)) { - if (likely(rl[i].lcn >= (LCN)0)) - return rl[i].lcn + (vcn - rl[i].vcn); - return rl[i].lcn; - } - } - /* - * The terminator element is setup to the correct value, i.e. one of - * LCN_HOLE, LCN_RL_NOT_MAPPED, or LCN_ENOENT. - */ - if (likely(rl[i].lcn < (LCN)0)) - return rl[i].lcn; - /* Just in case... We could replace this with BUG() some day. */ - return LCN_ENOENT; -} - -#ifdef NTFS_RW - -/** - * ntfs_rl_find_vcn_nolock - find a vcn in a runlist - * @rl: runlist to search - * @vcn: vcn to find - * - * Find the virtual cluster number @vcn in the runlist @rl and return the - * address of the runlist element containing the @vcn on success. - * - * Return NULL if @rl is NULL or @vcn is in an unmapped part/out of bounds of - * the runlist. 
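The decompression loop above is dense; the following user-space sketch decodes a tiny, well-formed mapping pairs array the same way, assuming valid input and skipping all of the driver's bounds and corruption checks (decode_mapping_pairs() and get_sle() are illustrative names):

#include <stdio.h>
#include <stdint.h>

#define LCN_HOLE (-1LL)

struct run { long long vcn, lcn, length; };

/* Read nbytes as a little endian signed value, sign-extended from the
 * top byte, exactly as the driver's shift-and-or loop does. */
static long long get_sle(const uint8_t *p, int nbytes)
{
	long long v = (int8_t)p[nbytes - 1];
	int i;

	for (i = nbytes - 2; i >= 0; i--)
		v = (v << 8) | p[i];
	return v;
}

/* Decode a well-formed mapping pairs array; returns the number of runs. */
static int decode_mapping_pairs(const uint8_t *buf, struct run *rl, int max)
{
	long long vcn = 0, lcn = 0;
	int n = 0;

	while (*buf && n < max) {
		int lb = *buf & 0xf;		/* length bytes (low nibble) */
		int cb = (*buf >> 4) & 0xf;	/* lcn-delta bytes (high nibble) */
		long long len = get_sle(buf + 1, lb);

		rl[n].vcn = vcn;
		rl[n].length = len;
		if (cb) {
			lcn += get_sle(buf + 1 + lb, cb);
			rl[n].lcn = lcn;
		} else {
			rl[n].lcn = LCN_HOLE;	/* sparse: no lcn change stored */
		}
		vcn += len;
		buf += 1 + lb + cb;
		n++;
	}
	return n;
}

int main(void)
{
	/* 4 clusters at lcn 0x60, then a 3-cluster sparse hole. */
	const uint8_t mp[] = { 0x11, 0x04, 0x60, 0x01, 0x03, 0x00 };
	struct run rl[8];
	int i, n = decode_mapping_pairs(mp, rl, 8);

	for (i = 0; i < n; i++)
		printf("vcn %lld len %lld lcn %lld\n",
		       rl[i].vcn, rl[i].length, rl[i].lcn);
	return 0;
}

This prints "vcn 0 len 4 lcn 96" followed by "vcn 4 len 3 lcn -1", i.e. four clusters at lcn 0x60 and then a three-cluster hole.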
- * - * Locking: The runlist must be locked on entry. - */ -runlist_element *ntfs_rl_find_vcn_nolock(runlist_element *rl, const VCN vcn) -{ - BUG_ON(vcn < 0); - if (unlikely(!rl || vcn < rl[0].vcn)) - return NULL; - while (likely(rl->length)) { - if (unlikely(vcn < rl[1].vcn)) { - if (likely(rl->lcn >= LCN_HOLE)) - return rl; - return NULL; - } - rl++; - } - if (likely(rl->lcn == LCN_ENOENT)) - return rl; - return NULL; -} - -/** - * ntfs_get_nr_significant_bytes - get number of bytes needed to store a number - * @n: number for which to get the number of bytes for - * - * Return the number of bytes required to store @n unambiguously as - * a signed number. - * - * This is used in the context of the mapping pairs array to determine how - * many bytes will be needed in the array to store a given logical cluster - * number (lcn) or a specific run length. - * - * Return the number of bytes written. This function cannot fail. - */ -static inline int ntfs_get_nr_significant_bytes(const s64 n) -{ - s64 l = n; - int i; - s8 j; - - i = 0; - do { - l >>= 8; - i++; - } while (l != 0 && l != -1); - j = (n >> 8 * (i - 1)) & 0xff; - /* If the sign bit is wrong, we need an extra byte. */ - if ((n < 0 && j >= 0) || (n > 0 && j < 0)) - i++; - return i; -} - -/** - * ntfs_get_size_for_mapping_pairs - get bytes needed for mapping pairs array - * @vol: ntfs volume (needed for the ntfs version) - * @rl: locked runlist to determine the size of the mapping pairs of - * @first_vcn: first vcn which to include in the mapping pairs array - * @last_vcn: last vcn which to include in the mapping pairs array - * - * Walk the locked runlist @rl and calculate the size in bytes of the mapping - * pairs array corresponding to the runlist @rl, starting at vcn @first_vcn and - * finishing with vcn @last_vcn. - * - * A @last_vcn of -1 means end of runlist and in that case the size of the - * mapping pairs array corresponding to the runlist starting at vcn @first_vcn - * and finishing at the end of the runlist is determined. - * - * This for example allows us to allocate a buffer of the right size when - * building the mapping pairs array. - * - * If @rl is NULL, just return 1 (for the single terminator byte). - * - * Return the calculated size in bytes on success. On error, return -errno. - * The following error codes are defined: - * -EINVAL - Run list contains unmapped elements. Make sure to only pass - * fully mapped runlists to this function. - * -EIO - The runlist is corrupt. - * - * Locking: @rl must be locked on entry (either for reading or writing), it - * remains locked throughout, and is left locked upon return. - */ -int ntfs_get_size_for_mapping_pairs(const ntfs_volume *vol, - const runlist_element *rl, const VCN first_vcn, - const VCN last_vcn) -{ - LCN prev_lcn; - int rls; - bool the_end = false; - - BUG_ON(first_vcn < 0); - BUG_ON(last_vcn < -1); - BUG_ON(last_vcn >= 0 && first_vcn > last_vcn); - if (!rl) { - BUG_ON(first_vcn); - BUG_ON(last_vcn > 0); - return 1; - } - /* Skip to runlist element containing @first_vcn. */ - while (rl->length && first_vcn >= rl[1].vcn) - rl++; - if (unlikely((!rl->length && first_vcn > rl->vcn) || - first_vcn < rl->vcn)) - return -EINVAL; - prev_lcn = 0; - /* Always need the termining zero byte. */ - rls = 1; - /* Do the first partial run if present. */ - if (first_vcn > rl->vcn) { - s64 delta, length = rl->length; - - /* We know rl->length != 0 already. 
*/ - if (unlikely(length < 0 || rl->lcn < LCN_HOLE)) - goto err_out; - /* - * If @stop_vcn is given and finishes inside this run, cap the - * run length. - */ - if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) { - s64 s1 = last_vcn + 1; - if (unlikely(rl[1].vcn > s1)) - length = s1 - rl->vcn; - the_end = true; - } - delta = first_vcn - rl->vcn; - /* Header byte + length. */ - rls += 1 + ntfs_get_nr_significant_bytes(length - delta); - /* - * If the logical cluster number (lcn) denotes a hole and we - * are on NTFS 3.0+, we don't store it at all, i.e. we need - * zero space. On earlier NTFS versions we just store the lcn. - * Note: this assumes that on NTFS 1.2-, holes are stored with - * an lcn of -1 and not a delta_lcn of -1 (unless both are -1). - */ - if (likely(rl->lcn >= 0 || vol->major_ver < 3)) { - prev_lcn = rl->lcn; - if (likely(rl->lcn >= 0)) - prev_lcn += delta; - /* Change in lcn. */ - rls += ntfs_get_nr_significant_bytes(prev_lcn); - } - /* Go to next runlist element. */ - rl++; - } - /* Do the full runs. */ - for (; rl->length && !the_end; rl++) { - s64 length = rl->length; - - if (unlikely(length < 0 || rl->lcn < LCN_HOLE)) - goto err_out; - /* - * If @stop_vcn is given and finishes inside this run, cap the - * run length. - */ - if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) { - s64 s1 = last_vcn + 1; - if (unlikely(rl[1].vcn > s1)) - length = s1 - rl->vcn; - the_end = true; - } - /* Header byte + length. */ - rls += 1 + ntfs_get_nr_significant_bytes(length); - /* - * If the logical cluster number (lcn) denotes a hole and we - * are on NTFS 3.0+, we don't store it at all, i.e. we need - * zero space. On earlier NTFS versions we just store the lcn. - * Note: this assumes that on NTFS 1.2-, holes are stored with - * an lcn of -1 and not a delta_lcn of -1 (unless both are -1). - */ - if (likely(rl->lcn >= 0 || vol->major_ver < 3)) { - /* Change in lcn. */ - rls += ntfs_get_nr_significant_bytes(rl->lcn - - prev_lcn); - prev_lcn = rl->lcn; - } - } - return rls; -err_out: - if (rl->lcn == LCN_RL_NOT_MAPPED) - rls = -EINVAL; - else - rls = -EIO; - return rls; -} - -/** - * ntfs_write_significant_bytes - write the significant bytes of a number - * @dst: destination buffer to write to - * @dst_max: pointer to last byte of destination buffer for bounds checking - * @n: number whose significant bytes to write - * - * Store in @dst, the minimum bytes of the number @n which are required to - * identify @n unambiguously as a signed number, taking care not to exceed - * @dest_max, the maximum position within @dst to which we are allowed to - * write. - * - * This is used when building the mapping pairs array of a runlist to compress - * a given logical cluster number (lcn) or a specific run length to the minimum - * size possible. - * - * Return the number of bytes written on success. On error, i.e. the - * destination buffer @dst is too small, return -ENOSPC. - */ -static inline int ntfs_write_significant_bytes(s8 *dst, const s8 *dst_max, - const s64 n) -{ - s64 l = n; - int i; - s8 j; - - i = 0; - do { - if (unlikely(dst > dst_max)) - goto err_out; - *dst++ = l & 0xffll; - l >>= 8; - i++; - } while (l != 0 && l != -1); - j = (n >> 8 * (i - 1)) & 0xff; - /* If the sign bit is wrong, we need an extra byte. 
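 * For example, n = 0x80 was emitted above as the single byte 0x80,
 * which a signed reader would decode as -128; the fix-up below appends
 * a 0x00 sign byte (or a 0xff byte in the mirror case of a negative n
 * whose top emitted byte has a clear sign bit).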
*/ - if (n < 0 && j >= 0) { - if (unlikely(dst > dst_max)) - goto err_out; - i++; - *dst = (s8)-1; - } else if (n > 0 && j < 0) { - if (unlikely(dst > dst_max)) - goto err_out; - i++; - *dst = (s8)0; - } - return i; -err_out: - return -ENOSPC; -} - -/** - * ntfs_mapping_pairs_build - build the mapping pairs array from a runlist - * @vol: ntfs volume (needed for the ntfs version) - * @dst: destination buffer to which to write the mapping pairs array - * @dst_len: size of destination buffer @dst in bytes - * @rl: locked runlist for which to build the mapping pairs array - * @first_vcn: first vcn which to include in the mapping pairs array - * @last_vcn: last vcn which to include in the mapping pairs array - * @stop_vcn: first vcn outside destination buffer on success or -ENOSPC - * - * Create the mapping pairs array from the locked runlist @rl, starting at vcn - * @first_vcn and finishing with vcn @last_vcn and save the array in @dst. - * @dst_len is the size of @dst in bytes and it should be at least equal to the - * value obtained by calling ntfs_get_size_for_mapping_pairs(). - * - * A @last_vcn of -1 means end of runlist and in that case the mapping pairs - * array corresponding to the runlist starting at vcn @first_vcn and finishing - * at the end of the runlist is created. - * - * If @rl is NULL, just write a single terminator byte to @dst. - * - * On success or -ENOSPC error, if @stop_vcn is not NULL, *@stop_vcn is set to - * the first vcn outside the destination buffer. Note that on error, @dst has - * been filled with all the mapping pairs that will fit, thus it can be treated - * as partial success, in that a new attribute extent needs to be created or - * the next extent has to be used and the mapping pairs build has to be - * continued with @first_vcn set to *@stop_vcn. - * - * Return 0 on success and -errno on error. The following error codes are - * defined: - * -EINVAL - Run list contains unmapped elements. Make sure to only pass - * fully mapped runlists to this function. - * -EIO - The runlist is corrupt. - * -ENOSPC - The destination buffer is too small. - * - * Locking: @rl must be locked on entry (either for reading or writing), it - * remains locked throughout, and is left locked upon return. - */ -int ntfs_mapping_pairs_build(const ntfs_volume *vol, s8 *dst, - const int dst_len, const runlist_element *rl, - const VCN first_vcn, const VCN last_vcn, VCN *const stop_vcn) -{ - LCN prev_lcn; - s8 *dst_max, *dst_next; - int err = -ENOSPC; - bool the_end = false; - s8 len_len, lcn_len; - - BUG_ON(first_vcn < 0); - BUG_ON(last_vcn < -1); - BUG_ON(last_vcn >= 0 && first_vcn > last_vcn); - BUG_ON(dst_len < 1); - if (!rl) { - BUG_ON(first_vcn); - BUG_ON(last_vcn > 0); - if (stop_vcn) - *stop_vcn = 0; - /* Terminator byte. */ - *dst = 0; - return 0; - } - /* Skip to runlist element containing @first_vcn. */ - while (rl->length && first_vcn >= rl[1].vcn) - rl++; - if (unlikely((!rl->length && first_vcn > rl->vcn) || - first_vcn < rl->vcn)) - return -EINVAL; - /* - * @dst_max is used for bounds checking in - * ntfs_write_significant_bytes(). - */ - dst_max = dst + dst_len - 1; - prev_lcn = 0; - /* Do the first partial run if present. */ - if (first_vcn > rl->vcn) { - s64 delta, length = rl->length; - - /* We know rl->length != 0 already. */ - if (unlikely(length < 0 || rl->lcn < LCN_HOLE)) - goto err_out; - /* - * If @stop_vcn is given and finishes inside this run, cap the - * run length. 
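 * (E.g. with @last_vcn = 10 and a run covering vcns 8-15, only vcns
 * 8-10 are encoded and the build stops after this run.)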
- */ - if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) { - s64 s1 = last_vcn + 1; - if (unlikely(rl[1].vcn > s1)) - length = s1 - rl->vcn; - the_end = true; - } - delta = first_vcn - rl->vcn; - /* Write length. */ - len_len = ntfs_write_significant_bytes(dst + 1, dst_max, - length - delta); - if (unlikely(len_len < 0)) - goto size_err; - /* - * If the logical cluster number (lcn) denotes a hole and we - * are on NTFS 3.0+, we don't store it at all, i.e. we need - * zero space. On earlier NTFS versions we just write the lcn - * change. FIXME: Do we need to write the lcn change or just - * the lcn in that case? Not sure as I have never seen this - * case on NT4. - We assume that we just need to write the lcn - * change until someone tells us otherwise... (AIA) - */ - if (likely(rl->lcn >= 0 || vol->major_ver < 3)) { - prev_lcn = rl->lcn; - if (likely(rl->lcn >= 0)) - prev_lcn += delta; - /* Write change in lcn. */ - lcn_len = ntfs_write_significant_bytes(dst + 1 + - len_len, dst_max, prev_lcn); - if (unlikely(lcn_len < 0)) - goto size_err; - } else - lcn_len = 0; - dst_next = dst + len_len + lcn_len + 1; - if (unlikely(dst_next > dst_max)) - goto size_err; - /* Update header byte. */ - *dst = lcn_len << 4 | len_len; - /* Position at next mapping pairs array element. */ - dst = dst_next; - /* Go to next runlist element. */ - rl++; - } - /* Do the full runs. */ - for (; rl->length && !the_end; rl++) { - s64 length = rl->length; - - if (unlikely(length < 0 || rl->lcn < LCN_HOLE)) - goto err_out; - /* - * If @stop_vcn is given and finishes inside this run, cap the - * run length. - */ - if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) { - s64 s1 = last_vcn + 1; - if (unlikely(rl[1].vcn > s1)) - length = s1 - rl->vcn; - the_end = true; - } - /* Write length. */ - len_len = ntfs_write_significant_bytes(dst + 1, dst_max, - length); - if (unlikely(len_len < 0)) - goto size_err; - /* - * If the logical cluster number (lcn) denotes a hole and we - * are on NTFS 3.0+, we don't store it at all, i.e. we need - * zero space. On earlier NTFS versions we just write the lcn - * change. FIXME: Do we need to write the lcn change or just - * the lcn in that case? Not sure as I have never seen this - * case on NT4. - We assume that we just need to write the lcn - * change until someone tells us otherwise... (AIA) - */ - if (likely(rl->lcn >= 0 || vol->major_ver < 3)) { - /* Write change in lcn. */ - lcn_len = ntfs_write_significant_bytes(dst + 1 + - len_len, dst_max, rl->lcn - prev_lcn); - if (unlikely(lcn_len < 0)) - goto size_err; - prev_lcn = rl->lcn; - } else - lcn_len = 0; - dst_next = dst + len_len + lcn_len + 1; - if (unlikely(dst_next > dst_max)) - goto size_err; - /* Update header byte. */ - *dst = lcn_len << 4 | len_len; - /* Position at next mapping pairs array element. */ - dst = dst_next; - } - /* Success. */ - err = 0; -size_err: - /* Set stop vcn. */ - if (stop_vcn) - *stop_vcn = rl->vcn; - /* Add terminator byte. */ - *dst = 0; - return err; -err_out: - if (rl->lcn == LCN_RL_NOT_MAPPED) - err = -EINVAL; - else - err = -EIO; - return err; -} - -/** - * ntfs_rl_truncate_nolock - truncate a runlist starting at a specified vcn - * @vol: ntfs volume (needed for error output) - * @runlist: runlist to truncate - * @new_length: the new length of the runlist in VCNs - * - * Truncate the runlist described by @runlist as well as the memory buffer - * holding the runlist elements to a length of @new_length VCNs. 
- * - * If @new_length lies within the runlist, the runlist elements with VCNs of - * @new_length and above are discarded. As a special case if @new_length is - * zero, the runlist is discarded and set to NULL. - * - * If @new_length lies beyond the runlist, a sparse runlist element is added to - * the end of the runlist @runlist or if the last runlist element is a sparse - * one already, this is extended. - * - * Note, no checking is done for unmapped runlist elements. It is assumed that - * the caller has mapped any elements that need to be mapped already. - * - * Return 0 on success and -errno on error. - * - * Locking: The caller must hold @runlist->lock for writing. - */ -int ntfs_rl_truncate_nolock(const ntfs_volume *vol, runlist *const runlist, - const s64 new_length) -{ - runlist_element *rl; - int old_size; - - ntfs_debug("Entering for new_length 0x%llx.", (long long)new_length); - BUG_ON(!runlist); - BUG_ON(new_length < 0); - rl = runlist->rl; - if (!new_length) { - ntfs_debug("Freeing runlist."); - runlist->rl = NULL; - if (rl) - ntfs_free(rl); - return 0; - } - if (unlikely(!rl)) { - /* - * Create a runlist consisting of a sparse runlist element of - * length @new_length followed by a terminator runlist element. - */ - rl = ntfs_malloc_nofs(PAGE_SIZE); - if (unlikely(!rl)) { - ntfs_error(vol->sb, "Not enough memory to allocate " - "runlist element buffer."); - return -ENOMEM; - } - runlist->rl = rl; - rl[1].length = rl->vcn = 0; - rl->lcn = LCN_HOLE; - rl[1].vcn = rl->length = new_length; - rl[1].lcn = LCN_ENOENT; - return 0; - } - BUG_ON(new_length < rl->vcn); - /* Find @new_length in the runlist. */ - while (likely(rl->length && new_length >= rl[1].vcn)) - rl++; - /* - * If not at the end of the runlist we need to shrink it. - * If at the end of the runlist we need to expand it. - */ - if (rl->length) { - runlist_element *trl; - bool is_end; - - ntfs_debug("Shrinking runlist."); - /* Determine the runlist size. */ - trl = rl + 1; - while (likely(trl->length)) - trl++; - old_size = trl - runlist->rl + 1; - /* Truncate the run. */ - rl->length = new_length - rl->vcn; - /* - * If a run was partially truncated, make the following runlist - * element a terminator. - */ - is_end = false; - if (rl->length) { - rl++; - if (!rl->length) - is_end = true; - rl->vcn = new_length; - rl->length = 0; - } - rl->lcn = LCN_ENOENT; - /* Reallocate memory if necessary. */ - if (!is_end) { - int new_size = rl - runlist->rl + 1; - rl = ntfs_rl_realloc(runlist->rl, old_size, new_size); - if (IS_ERR(rl)) - ntfs_warning(vol->sb, "Failed to shrink " - "runlist buffer. This just " - "wastes a bit of memory " - "temporarily so we ignore it " - "and return success."); - else - runlist->rl = rl; - } - } else if (likely(/* !rl->length && */ new_length > rl->vcn)) { - ntfs_debug("Expanding runlist."); - /* - * If there is a previous runlist element and it is a sparse - * one, extend it. Otherwise need to add a new, sparse runlist - * element. - */ - if ((rl > runlist->rl) && ((rl - 1)->lcn == LCN_HOLE)) - (rl - 1)->length = new_length - (rl - 1)->vcn; - else { - /* Determine the runlist size. */ - old_size = rl - runlist->rl + 1; - /* Reallocate memory if necessary. */ - rl = ntfs_rl_realloc(runlist->rl, old_size, - old_size + 1); - if (IS_ERR(rl)) { - ntfs_error(vol->sb, "Failed to expand runlist " - "buffer, aborting."); - return PTR_ERR(rl); - } - runlist->rl = rl; - /* - * Set @rl to the same runlist element in the new - * runlist as before in the old runlist. 
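 * (ntfs_rl_realloc() may have moved the buffer, so the element's old
 * index, old_size - 1, is re-applied to the new base below.)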
- */ - rl += old_size - 1; - /* Add a new, sparse runlist element. */ - rl->lcn = LCN_HOLE; - rl->length = new_length - rl->vcn; - /* Add a new terminator runlist element. */ - rl++; - rl->length = 0; - } - rl->vcn = new_length; - rl->lcn = LCN_ENOENT; - } else /* if (unlikely(!rl->length && new_length == rl->vcn)) */ { - /* Runlist already has same size as requested. */ - rl->lcn = LCN_ENOENT; - } - ntfs_debug("Done."); - return 0; -} - -/** - * ntfs_rl_punch_nolock - punch a hole into a runlist - * @vol: ntfs volume (needed for error output) - * @runlist: runlist to punch a hole into - * @start: starting VCN of the hole to be created - * @length: size of the hole to be created in units of clusters - * - * Punch a hole into the runlist @runlist starting at VCN @start and of size - * @length clusters. - * - * Return 0 on success and -errno on error, in which case @runlist has not been - * modified. - * - * If @start and/or @start + @length are outside the runlist return error code - * -ENOENT. - * - * If the runlist contains unmapped or error elements between @start and @start - * + @length return error code -EINVAL. - * - * Locking: The caller must hold @runlist->lock for writing. - */ -int ntfs_rl_punch_nolock(const ntfs_volume *vol, runlist *const runlist, - const VCN start, const s64 length) -{ - const VCN end = start + length; - s64 delta; - runlist_element *rl, *rl_end, *rl_real_end, *trl; - int old_size; - bool lcn_fixup = false; - - ntfs_debug("Entering for start 0x%llx, length 0x%llx.", - (long long)start, (long long)length); - BUG_ON(!runlist); - BUG_ON(start < 0); - BUG_ON(length < 0); - BUG_ON(end < 0); - rl = runlist->rl; - if (unlikely(!rl)) { - if (likely(!start && !length)) - return 0; - return -ENOENT; - } - /* Find @start in the runlist. */ - while (likely(rl->length && start >= rl[1].vcn)) - rl++; - rl_end = rl; - /* Find @end in the runlist. */ - while (likely(rl_end->length && end >= rl_end[1].vcn)) { - /* Verify there are no unmapped or error elements. */ - if (unlikely(rl_end->lcn < LCN_HOLE)) - return -EINVAL; - rl_end++; - } - /* Check the last element. */ - if (unlikely(rl_end->length && rl_end->lcn < LCN_HOLE)) - return -EINVAL; - /* This covers @start being out of bounds, too. */ - if (!rl_end->length && end > rl_end->vcn) - return -ENOENT; - if (!length) - return 0; - if (!rl->length) - return -ENOENT; - rl_real_end = rl_end; - /* Determine the runlist size. */ - while (likely(rl_real_end->length)) - rl_real_end++; - old_size = rl_real_end - runlist->rl + 1; - /* If @start is in a hole simply extend the hole. */ - if (rl->lcn == LCN_HOLE) { - /* - * If both @start and @end are in the same sparse run, we are - * done. - */ - if (end <= rl[1].vcn) { - ntfs_debug("Done (requested hole is already sparse)."); - return 0; - } -extend_hole: - /* Extend the hole. */ - rl->length = end - rl->vcn; - /* If @end is in a hole, merge it with the current one. */ - if (rl_end->lcn == LCN_HOLE) { - rl_end++; - rl->length = rl_end->vcn - rl->vcn; - } - /* We have done the hole. Now deal with the remaining tail. */ - rl++; - /* Cut out all runlist elements up to @end. */ - if (rl < rl_end) - memmove(rl, rl_end, (rl_real_end - rl_end + 1) * - sizeof(*rl)); - /* Adjust the beginning of the tail if necessary. */ - if (end > rl->vcn) { - delta = end - rl->vcn; - rl->vcn = end; - rl->length -= delta; - /* Only adjust the lcn if it is real. */ - if (rl->lcn >= 0) - rl->lcn += delta; - } -shrink_allocation: - /* Reallocate memory if the allocation changed. 
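 * (A failed shrink is harmless: the oversized buffer is simply kept
 * and only a warning is issued below.)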
*/ - if (rl < rl_end) { - rl = ntfs_rl_realloc(runlist->rl, old_size, - old_size - (rl_end - rl)); - if (IS_ERR(rl)) - ntfs_warning(vol->sb, "Failed to shrink " - "runlist buffer. This just " - "wastes a bit of memory " - "temporarily so we ignore it " - "and return success."); - else - runlist->rl = rl; - } - ntfs_debug("Done (extend hole)."); - return 0; - } - /* - * If @start is at the beginning of a run things are easier as there is - * no need to split the first run. - */ - if (start == rl->vcn) { - /* - * @start is at the beginning of a run. - * - * If the previous run is sparse, extend its hole. - * - * If @end is not in the same run, switch the run to be sparse - * and extend the newly created hole. - * - * Thus both of these cases reduce the problem to the above - * case of "@start is in a hole". - */ - if (rl > runlist->rl && (rl - 1)->lcn == LCN_HOLE) { - rl--; - goto extend_hole; - } - if (end >= rl[1].vcn) { - rl->lcn = LCN_HOLE; - goto extend_hole; - } - /* - * The final case is when @end is in the same run as @start. - * For this need to split the run into two. One run for the - * sparse region between the beginning of the old run, i.e. - * @start, and @end and one for the remaining non-sparse - * region, i.e. between @end and the end of the old run. - */ - trl = ntfs_rl_realloc(runlist->rl, old_size, old_size + 1); - if (IS_ERR(trl)) - goto enomem_out; - old_size++; - if (runlist->rl != trl) { - rl = trl + (rl - runlist->rl); - rl_end = trl + (rl_end - runlist->rl); - rl_real_end = trl + (rl_real_end - runlist->rl); - runlist->rl = trl; - } -split_end: - /* Shift all the runs up by one. */ - memmove(rl + 1, rl, (rl_real_end - rl + 1) * sizeof(*rl)); - /* Finally, setup the two split runs. */ - rl->lcn = LCN_HOLE; - rl->length = length; - rl++; - rl->vcn += length; - /* Only adjust the lcn if it is real. */ - if (rl->lcn >= 0 || lcn_fixup) - rl->lcn += length; - rl->length -= length; - ntfs_debug("Done (split one)."); - return 0; - } - /* - * @start is neither in a hole nor at the beginning of a run. - * - * If @end is in a hole, things are easier as simply truncating the run - * @start is in to end at @start - 1, deleting all runs after that up - * to @end, and finally extending the beginning of the run @end is in - * to be @start is all that is needed. - */ - if (rl_end->lcn == LCN_HOLE) { - /* Truncate the run containing @start. */ - rl->length = start - rl->vcn; - rl++; - /* Cut out all runlist elements up to @end. */ - if (rl < rl_end) - memmove(rl, rl_end, (rl_real_end - rl_end + 1) * - sizeof(*rl)); - /* Extend the beginning of the run @end is in to be @start. */ - rl->vcn = start; - rl->length = rl[1].vcn - start; - goto shrink_allocation; - } - /* - * If @end is not in a hole there are still two cases to distinguish. - * Either @end is or is not in the same run as @start. - * - * The second case is easier as it can be reduced to an already solved - * problem by truncating the run @start is in to end at @start - 1. - * Then, if @end is in the next run need to split the run into a sparse - * run followed by a non-sparse run (already covered above) and if @end - * is not in the next run switching it to be sparse, again reduces the - * problem to the already covered case of "@start is in a hole". - */ - if (end >= rl[1].vcn) { - /* - * If @end is not in the next run, reduce the problem to the - * case of "@start is in a hole". - */ - if (rl[1].length && end >= rl[2].vcn) { - /* Truncate the run containing @start. 
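 * (E.g. a run spanning vcns 10-19 with @start = 14 keeps vcns 10-13
 * here; the newly created hole then begins at vcn 14.)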
*/ - rl->length = start - rl->vcn; - rl++; - rl->vcn = start; - rl->lcn = LCN_HOLE; - goto extend_hole; - } - trl = ntfs_rl_realloc(runlist->rl, old_size, old_size + 1); - if (IS_ERR(trl)) - goto enomem_out; - old_size++; - if (runlist->rl != trl) { - rl = trl + (rl - runlist->rl); - rl_end = trl + (rl_end - runlist->rl); - rl_real_end = trl + (rl_real_end - runlist->rl); - runlist->rl = trl; - } - /* Truncate the run containing @start. */ - rl->length = start - rl->vcn; - rl++; - /* - * @end is in the next run, reduce the problem to the case - * where "@start is at the beginning of a run and @end is in - * the same run as @start". - */ - delta = rl->vcn - start; - rl->vcn = start; - if (rl->lcn >= 0) { - rl->lcn -= delta; - /* Need this in case the lcn just became negative. */ - lcn_fixup = true; - } - rl->length += delta; - goto split_end; - } - /* - * The first case from above, i.e. @end is in the same run as @start. - * We need to split the run into three. One run for the non-sparse - * region between the beginning of the old run and @start, one for the - * sparse region between @start and @end, and one for the remaining - * non-sparse region, i.e. between @end and the end of the old run. - */ - trl = ntfs_rl_realloc(runlist->rl, old_size, old_size + 2); - if (IS_ERR(trl)) - goto enomem_out; - old_size += 2; - if (runlist->rl != trl) { - rl = trl + (rl - runlist->rl); - rl_end = trl + (rl_end - runlist->rl); - rl_real_end = trl + (rl_real_end - runlist->rl); - runlist->rl = trl; - } - /* Shift all the runs up by two. */ - memmove(rl + 2, rl, (rl_real_end - rl + 1) * sizeof(*rl)); - /* Finally, setup the three split runs. */ - rl->length = start - rl->vcn; - rl++; - rl->vcn = start; - rl->lcn = LCN_HOLE; - rl->length = length; - rl++; - delta = end - rl->vcn; - rl->vcn = end; - rl->lcn += delta; - rl->length -= delta; - ntfs_debug("Done (split both)."); - return 0; -enomem_out: - ntfs_error(vol->sb, "Not enough memory to extend runlist buffer."); - return -ENOMEM; -} - -#endif /* NTFS_RW */ diff --git a/fs/ntfs/runlist.h b/fs/ntfs/runlist.h deleted file mode 100644 index 38de0a375f59e..0000000000000 --- a/fs/ntfs/runlist.h +++ /dev/null @@ -1,88 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * runlist.h - Defines for runlist handling in NTFS Linux kernel driver. - * Part of the Linux-NTFS project. - * - * Copyright (c) 2001-2005 Anton Altaparmakov - * Copyright (c) 2002 Richard Russon - */ - -#ifndef _LINUX_NTFS_RUNLIST_H -#define _LINUX_NTFS_RUNLIST_H - -#include "types.h" -#include "layout.h" -#include "volume.h" - -/** - * runlist_element - in memory vcn to lcn mapping array element - * @vcn: starting vcn of the current array element - * @lcn: starting lcn of the current array element - * @length: length in clusters of the current array element - * - * The last vcn (in fact the last vcn + 1) is reached when length == 0. - * - * When lcn == -1 this means that the count vcns starting at vcn are not - * physically allocated (i.e. this is a hole / data is sparse). - */ -typedef struct { /* In memory vcn to lcn mapping structure element. */ - VCN vcn; /* vcn = Starting virtual cluster number. */ - LCN lcn; /* lcn = Starting logical cluster number. */ - s64 length; /* Run length in clusters. 
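 *
 * As an illustrative example (values invented here), a 12-cluster
 * attribute with a 4-cluster hole in the middle maps to:
 *
 *	{ .vcn = 0,  .lcn = 1000,       .length = 4 },
 *	{ .vcn = 4,  .lcn = LCN_HOLE,   .length = 4 },
 *	{ .vcn = 8,  .lcn = 2000,       .length = 4 },
 *	{ .vcn = 12, .lcn = LCN_ENOENT, .length = 0 },	<- terminator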
*/ -} runlist_element; - -/** - * runlist - in memory vcn to lcn mapping array including a read/write lock - * @rl: pointer to an array of runlist elements - * @lock: read/write spinlock for serializing access to @rl - * - */ -typedef struct { - runlist_element *rl; - struct rw_semaphore lock; -} runlist; - -static inline void ntfs_init_runlist(runlist *rl) -{ - rl->rl = NULL; - init_rwsem(&rl->lock); -} - -typedef enum { - LCN_HOLE = -1, /* Keep this as highest value or die! */ - LCN_RL_NOT_MAPPED = -2, - LCN_ENOENT = -3, - LCN_ENOMEM = -4, - LCN_EIO = -5, -} LCN_SPECIAL_VALUES; - -extern runlist_element *ntfs_runlists_merge(runlist_element *drl, - runlist_element *srl); - -extern runlist_element *ntfs_mapping_pairs_decompress(const ntfs_volume *vol, - const ATTR_RECORD *attr, runlist_element *old_rl); - -extern LCN ntfs_rl_vcn_to_lcn(const runlist_element *rl, const VCN vcn); - -#ifdef NTFS_RW - -extern runlist_element *ntfs_rl_find_vcn_nolock(runlist_element *rl, - const VCN vcn); - -extern int ntfs_get_size_for_mapping_pairs(const ntfs_volume *vol, - const runlist_element *rl, const VCN first_vcn, - const VCN last_vcn); - -extern int ntfs_mapping_pairs_build(const ntfs_volume *vol, s8 *dst, - const int dst_len, const runlist_element *rl, - const VCN first_vcn, const VCN last_vcn, VCN *const stop_vcn); - -extern int ntfs_rl_truncate_nolock(const ntfs_volume *vol, - runlist *const runlist, const s64 new_length); - -int ntfs_rl_punch_nolock(const ntfs_volume *vol, runlist *const runlist, - const VCN start, const s64 length); - -#endif /* NTFS_RW */ - -#endif /* _LINUX_NTFS_RUNLIST_H */ diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c deleted file mode 100644 index 56a7d5bd33e4e..0000000000000 --- a/fs/ntfs/super.c +++ /dev/null @@ -1,3202 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * super.c - NTFS kernel super block handling. Part of the Linux-NTFS project. - * - * Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc. - * Copyright (c) 2001,2002 Richard Russon - */ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include -#include -#include -#include -#include -#include /* For bdev_logical_block_size(). */ -#include -#include -#include -#include -#include - -#include "sysctl.h" -#include "logfile.h" -#include "quota.h" -#include "usnjrnl.h" -#include "dir.h" -#include "debug.h" -#include "index.h" -#include "inode.h" -#include "aops.h" -#include "layout.h" -#include "malloc.h" -#include "ntfs.h" - -/* Number of mounted filesystems which have compression enabled. */ -static unsigned long ntfs_nr_compression_users; - -/* A global default upcase table and a corresponding reference count. */ -static ntfschar *default_upcase; -static unsigned long ntfs_nr_upcase_users; - -/* Error constants/strings used in inode.c::ntfs_show_options(). */ -typedef enum { - /* One of these must be present, default is ON_ERRORS_CONTINUE. */ - ON_ERRORS_PANIC = 0x01, - ON_ERRORS_REMOUNT_RO = 0x02, - ON_ERRORS_CONTINUE = 0x04, - /* Optional, can be combined with any of the above. */ - ON_ERRORS_RECOVER = 0x10, -} ON_ERRORS_ACTIONS; - -const option_t on_errors_arr[] = { - { ON_ERRORS_PANIC, "panic" }, - { ON_ERRORS_REMOUNT_RO, "remount-ro", }, - { ON_ERRORS_CONTINUE, "continue", }, - { ON_ERRORS_RECOVER, "recover" }, - { 0, NULL } -}; - -/** - * simple_getbool - convert input string to a boolean value - * @s: input string to convert - * @setval: where to store the output boolean value - * - * Copied from old ntfs driver (which copied from vfat driver). 
- * - * "1", "yes", "true", or an empty string are converted to %true. - * "0", "no", and "false" are converted to %false. - * - * Return: %1 if the string is converted or was empty and *setval contains it; - * %0 if the string was not valid. - */ -static int simple_getbool(char *s, bool *setval) -{ - if (s) { - if (!strcmp(s, "1") || !strcmp(s, "yes") || !strcmp(s, "true")) - *setval = true; - else if (!strcmp(s, "0") || !strcmp(s, "no") || - !strcmp(s, "false")) - *setval = false; - else - return 0; - } else - *setval = true; - return 1; -} - -/** - * parse_options - parse the (re)mount options - * @vol: ntfs volume - * @opt: string containing the (re)mount options - * - * Parse the recognized options in @opt for the ntfs volume described by @vol. - */ -static bool parse_options(ntfs_volume *vol, char *opt) -{ - char *p, *v, *ov; - static char *utf8 = "utf8"; - int errors = 0, sloppy = 0; - kuid_t uid = INVALID_UID; - kgid_t gid = INVALID_GID; - umode_t fmask = (umode_t)-1, dmask = (umode_t)-1; - int mft_zone_multiplier = -1, on_errors = -1; - int show_sys_files = -1, case_sensitive = -1, disable_sparse = -1; - struct nls_table *nls_map = NULL, *old_nls; - - /* I am lazy... (-8 */ -#define NTFS_GETOPT_WITH_DEFAULT(option, variable, default_value) \ - if (!strcmp(p, option)) { \ - if (!v || !*v) \ - variable = default_value; \ - else { \ - variable = simple_strtoul(ov = v, &v, 0); \ - if (*v) \ - goto needs_val; \ - } \ - } -#define NTFS_GETOPT(option, variable) \ - if (!strcmp(p, option)) { \ - if (!v || !*v) \ - goto needs_arg; \ - variable = simple_strtoul(ov = v, &v, 0); \ - if (*v) \ - goto needs_val; \ - } -#define NTFS_GETOPT_UID(option, variable) \ - if (!strcmp(p, option)) { \ - uid_t uid_value; \ - if (!v || !*v) \ - goto needs_arg; \ - uid_value = simple_strtoul(ov = v, &v, 0); \ - if (*v) \ - goto needs_val; \ - variable = make_kuid(current_user_ns(), uid_value); \ - if (!uid_valid(variable)) \ - goto needs_val; \ - } -#define NTFS_GETOPT_GID(option, variable) \ - if (!strcmp(p, option)) { \ - gid_t gid_value; \ - if (!v || !*v) \ - goto needs_arg; \ - gid_value = simple_strtoul(ov = v, &v, 0); \ - if (*v) \ - goto needs_val; \ - variable = make_kgid(current_user_ns(), gid_value); \ - if (!gid_valid(variable)) \ - goto needs_val; \ - } -#define NTFS_GETOPT_OCTAL(option, variable) \ - if (!strcmp(p, option)) { \ - if (!v || !*v) \ - goto needs_arg; \ - variable = simple_strtoul(ov = v, &v, 8); \ - if (*v) \ - goto needs_val; \ - } -#define NTFS_GETOPT_BOOL(option, variable) \ - if (!strcmp(p, option)) { \ - bool val; \ - if (!simple_getbool(v, &val)) \ - goto needs_bool; \ - variable = val; \ - } -#define NTFS_GETOPT_OPTIONS_ARRAY(option, variable, opt_array) \ - if (!strcmp(p, option)) { \ - int _i; \ - if (!v || !*v) \ - goto needs_arg; \ - ov = v; \ - if (variable == -1) \ - variable = 0; \ - for (_i = 0; opt_array[_i].str && *opt_array[_i].str; _i++) \ - if (!strcmp(opt_array[_i].str, v)) { \ - variable |= opt_array[_i].val; \ - break; \ - } \ - if (!opt_array[_i].str || !*opt_array[_i].str) \ - goto needs_val; \ - } - if (!opt || !*opt) - goto no_mount_options; - ntfs_debug("Entering with mount options string: %s", opt); - while ((p = strsep(&opt, ","))) { - if ((v = strchr(p, '='))) - *v++ = 0; - NTFS_GETOPT_UID("uid", uid) - else NTFS_GETOPT_GID("gid", gid) - else NTFS_GETOPT_OCTAL("umask", fmask = dmask) - else NTFS_GETOPT_OCTAL("fmask", fmask) - else NTFS_GETOPT_OCTAL("dmask", dmask) - else NTFS_GETOPT("mft_zone_multiplier", mft_zone_multiplier) - else 
NTFS_GETOPT_WITH_DEFAULT("sloppy", sloppy, true) - else NTFS_GETOPT_BOOL("show_sys_files", show_sys_files) - else NTFS_GETOPT_BOOL("case_sensitive", case_sensitive) - else NTFS_GETOPT_BOOL("disable_sparse", disable_sparse) - else NTFS_GETOPT_OPTIONS_ARRAY("errors", on_errors, - on_errors_arr) - else if (!strcmp(p, "posix") || !strcmp(p, "show_inodes")) - ntfs_warning(vol->sb, "Ignoring obsolete option %s.", - p); - else if (!strcmp(p, "nls") || !strcmp(p, "iocharset")) { - if (!strcmp(p, "iocharset")) - ntfs_warning(vol->sb, "Option iocharset is " - "deprecated. Please use " - "option nls= in " - "the future."); - if (!v || !*v) - goto needs_arg; -use_utf8: - old_nls = nls_map; - nls_map = load_nls(v); - if (!nls_map) { - if (!old_nls) { - ntfs_error(vol->sb, "NLS character set " - "%s not found.", v); - return false; - } - ntfs_error(vol->sb, "NLS character set %s not " - "found. Using previous one %s.", - v, old_nls->charset); - nls_map = old_nls; - } else /* nls_map */ { - unload_nls(old_nls); - } - } else if (!strcmp(p, "utf8")) { - bool val = false; - ntfs_warning(vol->sb, "Option utf8 is no longer " - "supported, using option nls=utf8. Please " - "use option nls=utf8 in the future and " - "make sure utf8 is compiled either as a " - "module or into the kernel."); - if (!v || !*v) - val = true; - else if (!simple_getbool(v, &val)) - goto needs_bool; - if (val) { - v = utf8; - goto use_utf8; - } - } else { - ntfs_error(vol->sb, "Unrecognized mount option %s.", p); - if (errors < INT_MAX) - errors++; - } -#undef NTFS_GETOPT_OPTIONS_ARRAY -#undef NTFS_GETOPT_BOOL -#undef NTFS_GETOPT -#undef NTFS_GETOPT_WITH_DEFAULT - } -no_mount_options: - if (errors && !sloppy) - return false; - if (sloppy) - ntfs_warning(vol->sb, "Sloppy option given. Ignoring " - "unrecognized mount option(s) and continuing."); - /* Keep this first! */ - if (on_errors != -1) { - if (!on_errors) { - ntfs_error(vol->sb, "Invalid errors option argument " - "or bug in options parser."); - return false; - } - } - if (nls_map) { - if (vol->nls_map && vol->nls_map != nls_map) { - ntfs_error(vol->sb, "Cannot change NLS character set " - "on remount."); - return false; - } /* else (!vol->nls_map) */ - ntfs_debug("Using NLS character set %s.", nls_map->charset); - vol->nls_map = nls_map; - } else /* (!nls_map) */ { - if (!vol->nls_map) { - vol->nls_map = load_nls_default(); - if (!vol->nls_map) { - ntfs_error(vol->sb, "Failed to load default " - "NLS character set."); - return false; - } - ntfs_debug("Using default NLS character set (%s).", - vol->nls_map->charset); - } - } - if (mft_zone_multiplier != -1) { - if (vol->mft_zone_multiplier && vol->mft_zone_multiplier != - mft_zone_multiplier) { - ntfs_error(vol->sb, "Cannot change mft_zone_multiplier " - "on remount."); - return false; - } - if (mft_zone_multiplier < 1 || mft_zone_multiplier > 4) { - ntfs_error(vol->sb, "Invalid mft_zone_multiplier. " - "Using default value, i.e. 
1."); - mft_zone_multiplier = 1; - } - vol->mft_zone_multiplier = mft_zone_multiplier; - } - if (!vol->mft_zone_multiplier) - vol->mft_zone_multiplier = 1; - if (on_errors != -1) - vol->on_errors = on_errors; - if (!vol->on_errors || vol->on_errors == ON_ERRORS_RECOVER) - vol->on_errors |= ON_ERRORS_CONTINUE; - if (uid_valid(uid)) - vol->uid = uid; - if (gid_valid(gid)) - vol->gid = gid; - if (fmask != (umode_t)-1) - vol->fmask = fmask; - if (dmask != (umode_t)-1) - vol->dmask = dmask; - if (show_sys_files != -1) { - if (show_sys_files) - NVolSetShowSystemFiles(vol); - else - NVolClearShowSystemFiles(vol); - } - if (case_sensitive != -1) { - if (case_sensitive) - NVolSetCaseSensitive(vol); - else - NVolClearCaseSensitive(vol); - } - if (disable_sparse != -1) { - if (disable_sparse) - NVolClearSparseEnabled(vol); - else { - if (!NVolSparseEnabled(vol) && - vol->major_ver && vol->major_ver < 3) - ntfs_warning(vol->sb, "Not enabling sparse " - "support due to NTFS volume " - "version %i.%i (need at least " - "version 3.0).", vol->major_ver, - vol->minor_ver); - else - NVolSetSparseEnabled(vol); - } - } - return true; -needs_arg: - ntfs_error(vol->sb, "The %s option requires an argument.", p); - return false; -needs_bool: - ntfs_error(vol->sb, "The %s option requires a boolean argument.", p); - return false; -needs_val: - ntfs_error(vol->sb, "Invalid %s option argument: %s", p, ov); - return false; -} - -#ifdef NTFS_RW - -/** - * ntfs_write_volume_flags - write new flags to the volume information flags - * @vol: ntfs volume on which to modify the flags - * @flags: new flags value for the volume information flags - * - * Internal function. You probably want to use ntfs_{set,clear}_volume_flags() - * instead (see below). - * - * Replace the volume information flags on the volume @vol with the value - * supplied in @flags. Note, this overwrites the volume information flags, so - * make sure to combine the flags you want to modify with the old flags and use - * the result when calling ntfs_write_volume_flags(). - * - * Return 0 on success and -errno on error. - */ -static int ntfs_write_volume_flags(ntfs_volume *vol, const VOLUME_FLAGS flags) -{ - ntfs_inode *ni = NTFS_I(vol->vol_ino); - MFT_RECORD *m; - VOLUME_INFORMATION *vi; - ntfs_attr_search_ctx *ctx; - int err; - - ntfs_debug("Entering, old flags = 0x%x, new flags = 0x%x.", - le16_to_cpu(vol->vol_flags), le16_to_cpu(flags)); - if (vol->vol_flags == flags) - goto done; - BUG_ON(!ni); - m = map_mft_record(ni); - if (IS_ERR(m)) { - err = PTR_ERR(m); - goto err_out; - } - ctx = ntfs_attr_get_search_ctx(ni, m); - if (!ctx) { - err = -ENOMEM; - goto put_unm_err_out; - } - err = ntfs_attr_lookup(AT_VOLUME_INFORMATION, NULL, 0, 0, 0, NULL, 0, - ctx); - if (err) - goto put_unm_err_out; - vi = (VOLUME_INFORMATION*)((u8*)ctx->attr + - le16_to_cpu(ctx->attr->data.resident.value_offset)); - vol->vol_flags = vi->flags = flags; - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(ni); -done: - ntfs_debug("Done."); - return 0; -put_unm_err_out: - if (ctx) - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(ni); -err_out: - ntfs_error(vol->sb, "Failed with error code %i.", -err); - return err; -} - -/** - * ntfs_set_volume_flags - set bits in the volume information flags - * @vol: ntfs volume on which to modify the flags - * @flags: flags to set on the volume - * - * Set the bits in @flags in the volume information flags on the volume @vol. 
- * - * Return 0 on success and -errno on error. - */ -static inline int ntfs_set_volume_flags(ntfs_volume *vol, VOLUME_FLAGS flags) -{ - flags &= VOLUME_FLAGS_MASK; - return ntfs_write_volume_flags(vol, vol->vol_flags | flags); -} - -/** - * ntfs_clear_volume_flags - clear bits in the volume information flags - * @vol: ntfs volume on which to modify the flags - * @flags: flags to clear on the volume - * - * Clear the bits in @flags in the volume information flags on the volume @vol. - * - * Return 0 on success and -errno on error. - */ -static inline int ntfs_clear_volume_flags(ntfs_volume *vol, VOLUME_FLAGS flags) -{ - flags &= VOLUME_FLAGS_MASK; - flags = vol->vol_flags & cpu_to_le16(~le16_to_cpu(flags)); - return ntfs_write_volume_flags(vol, flags); -} - -#endif /* NTFS_RW */ - -/** - * ntfs_remount - change the mount options of a mounted ntfs filesystem - * @sb: superblock of mounted ntfs filesystem - * @flags: remount flags - * @opt: remount options string - * - * Change the mount options of an already mounted ntfs filesystem. - * - * NOTE: The VFS sets the @sb->s_flags remount flags to @flags after - * ntfs_remount() returns successfully (i.e. returns 0). Otherwise, - * @sb->s_flags are not changed. - */ -static int ntfs_remount(struct super_block *sb, int *flags, char *opt) -{ - ntfs_volume *vol = NTFS_SB(sb); - - ntfs_debug("Entering with remount options string: %s", opt); - - sync_filesystem(sb); - -#ifndef NTFS_RW - /* For read-only compiled driver, enforce read-only flag. */ - *flags |= SB_RDONLY; -#else /* NTFS_RW */ - /* - * For the read-write compiled driver, if we are remounting read-write, - * make sure there are no volume errors and that no unsupported volume - * flags are set. Also, empty the logfile journal as it would become - * stale as soon as something is written to the volume and mark the - * volume dirty so that chkdsk is run if the volume is not umounted - * cleanly. Finally, mark the quotas out of date so Windows rescans - * the volume on boot and updates them. - * - * When remounting read-only, mark the volume clean if no volume errors - * have occurred. - */ - if (sb_rdonly(sb) && !(*flags & SB_RDONLY)) { - static const char *es = ". Cannot remount read-write."; - - /* Remounting read-write. */ - if (NVolErrors(vol)) { - ntfs_error(sb, "Volume has errors and is read-only%s", - es); - return -EROFS; - } - if (vol->vol_flags & VOLUME_IS_DIRTY) { - ntfs_error(sb, "Volume is dirty and read-only%s", es); - return -EROFS; - } - if (vol->vol_flags & VOLUME_MODIFIED_BY_CHKDSK) { - ntfs_error(sb, "Volume has been modified by chkdsk " - "and is read-only%s", es); - return -EROFS; - } - if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) { - ntfs_error(sb, "Volume has unsupported flags set " - "(0x%x) and is read-only%s", - (unsigned)le16_to_cpu(vol->vol_flags), - es); - return -EROFS; - } - if (ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY)) { - ntfs_error(sb, "Failed to set dirty bit in volume " - "information flags%s", es); - return -EROFS; - } -#if 0 - // TODO: Enable this code once we start modifying anything that - // is different between NTFS 1.2 and 3.x... - /* Set NT4 compatibility flag on newer NTFS version volumes. 
*/ - if ((vol->major_ver > 1)) { - if (ntfs_set_volume_flags(vol, VOLUME_MOUNTED_ON_NT4)) { - ntfs_error(sb, "Failed to set NT4 " - "compatibility flag%s", es); - NVolSetErrors(vol); - return -EROFS; - } - } -#endif - if (!ntfs_empty_logfile(vol->logfile_ino)) { - ntfs_error(sb, "Failed to empty journal $LogFile%s", - es); - NVolSetErrors(vol); - return -EROFS; - } - if (!ntfs_mark_quotas_out_of_date(vol)) { - ntfs_error(sb, "Failed to mark quotas out of date%s", - es); - NVolSetErrors(vol); - return -EROFS; - } - if (!ntfs_stamp_usnjrnl(vol)) { - ntfs_error(sb, "Failed to stamp transaction log " - "($UsnJrnl)%s", es); - NVolSetErrors(vol); - return -EROFS; - } - } else if (!sb_rdonly(sb) && (*flags & SB_RDONLY)) { - /* Remounting read-only. */ - if (!NVolErrors(vol)) { - if (ntfs_clear_volume_flags(vol, VOLUME_IS_DIRTY)) - ntfs_warning(sb, "Failed to clear dirty bit " - "in volume information " - "flags. Run chkdsk."); - } - } -#endif /* NTFS_RW */ - - // TODO: Deal with *flags. - - if (!parse_options(vol, opt)) - return -EINVAL; - - ntfs_debug("Done."); - return 0; -} - -/** - * is_boot_sector_ntfs - check whether a boot sector is a valid NTFS boot sector - * @sb: Super block of the device to which @b belongs. - * @b: Boot sector of device @sb to check. - * @silent: If 'true', all output will be silenced. - * - * is_boot_sector_ntfs() checks whether the boot sector @b is a valid NTFS boot - * sector. Returns 'true' if it is valid and 'false' if not. - * - * @sb is only needed for warning/error output, i.e. it can be NULL when silent - * is 'true'. - */ -static bool is_boot_sector_ntfs(const struct super_block *sb, - const NTFS_BOOT_SECTOR *b, const bool silent) -{ - /* - * Check that checksum == sum of u32 values from b to the checksum - * field. If checksum is zero, no checking is done. We will work when - * the checksum test fails, since some utilities update the boot sector - * ignoring the checksum which leaves the checksum out-of-date. We - * report a warning if this is the case. - */ - if ((void*)b < (void*)&b->checksum && b->checksum && !silent) { - le32 *u; - u32 i; - - for (i = 0, u = (le32*)b; u < (le32*)(&b->checksum); ++u) - i += le32_to_cpup(u); - if (le32_to_cpu(b->checksum) != i) - ntfs_warning(sb, "Invalid boot sector checksum."); - } - /* Check OEMidentifier is "NTFS " */ - if (b->oem_id != magicNTFS) - goto not_ntfs; - /* Check bytes per sector value is between 256 and 4096. */ - if (le16_to_cpu(b->bpb.bytes_per_sector) < 0x100 || - le16_to_cpu(b->bpb.bytes_per_sector) > 0x1000) - goto not_ntfs; - /* Check sectors per cluster value is valid. */ - switch (b->bpb.sectors_per_cluster) { - case 1: case 2: case 4: case 8: case 16: case 32: case 64: case 128: - break; - default: - goto not_ntfs; - } - /* Check the cluster size is not above the maximum (64kiB). */ - if ((u32)le16_to_cpu(b->bpb.bytes_per_sector) * - b->bpb.sectors_per_cluster > NTFS_MAX_CLUSTER_SIZE) - goto not_ntfs; - /* Check reserved/unused fields are really zero. */ - if (le16_to_cpu(b->bpb.reserved_sectors) || - le16_to_cpu(b->bpb.root_entries) || - le16_to_cpu(b->bpb.sectors) || - le16_to_cpu(b->bpb.sectors_per_fat) || - le32_to_cpu(b->bpb.large_sectors) || b->bpb.fats) - goto not_ntfs; - /* Check clusters per file mft record value is valid. 
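 * (Positive values are a cluster count per mft record; bytes 0xe1-0xf7
 * are negative two's-complement exponents n meaning a record size of
 * 2^(-n) bytes, e.g. 0xf6 = -10 -> 1024 bytes.)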
*/ - if ((u8)b->clusters_per_mft_record < 0xe1 || - (u8)b->clusters_per_mft_record > 0xf7) - switch (b->clusters_per_mft_record) { - case 1: case 2: case 4: case 8: case 16: case 32: case 64: - break; - default: - goto not_ntfs; - } - /* Check clusters per index block value is valid. */ - if ((u8)b->clusters_per_index_record < 0xe1 || - (u8)b->clusters_per_index_record > 0xf7) - switch (b->clusters_per_index_record) { - case 1: case 2: case 4: case 8: case 16: case 32: case 64: - break; - default: - goto not_ntfs; - } - /* - * Check for valid end of sector marker. We will work without it, but - * many BIOSes will refuse to boot from a bootsector if the magic is - * incorrect, so we emit a warning. - */ - if (!silent && b->end_of_sector_marker != cpu_to_le16(0xaa55)) - ntfs_warning(sb, "Invalid end of sector marker."); - return true; -not_ntfs: - return false; -} - -/** - * read_ntfs_boot_sector - read the NTFS boot sector of a device - * @sb: super block of device to read the boot sector from - * @silent: if true, suppress all output - * - * Reads the boot sector from the device and validates it. If that fails, tries - * to read the backup boot sector, first from the end of the device a-la NT4 and - * later and then from the middle of the device a-la NT3.51 and before. - * - * If a valid boot sector is found but it is not the primary boot sector, we - * repair the primary boot sector silently (unless the device is read-only or - * the primary boot sector is not accessible). - * - * NOTE: To call this function, @sb must have the fields s_dev, the ntfs super - * block (u.ntfs_sb), nr_blocks and the device flags (s_flags) initialized - * to their respective values. - * - * Return the unlocked buffer head containing the boot sector or NULL on error. - */ -static struct buffer_head *read_ntfs_boot_sector(struct super_block *sb, - const int silent) -{ - const char *read_err_str = "Unable to read %s boot sector."; - struct buffer_head *bh_primary, *bh_backup; - sector_t nr_blocks = NTFS_SB(sb)->nr_blocks; - - /* Try to read primary boot sector. */ - if ((bh_primary = sb_bread(sb, 0))) { - if (is_boot_sector_ntfs(sb, (NTFS_BOOT_SECTOR*) - bh_primary->b_data, silent)) - return bh_primary; - if (!silent) - ntfs_error(sb, "Primary boot sector is invalid."); - } else if (!silent) - ntfs_error(sb, read_err_str, "primary"); - if (!(NTFS_SB(sb)->on_errors & ON_ERRORS_RECOVER)) { - if (bh_primary) - brelse(bh_primary); - if (!silent) - ntfs_error(sb, "Mount option errors=recover not used. " - "Aborting without trying to recover."); - return NULL; - } - /* Try to read NT4+ backup boot sector. */ - if ((bh_backup = sb_bread(sb, nr_blocks - 1))) { - if (is_boot_sector_ntfs(sb, (NTFS_BOOT_SECTOR*) - bh_backup->b_data, silent)) - goto hotfix_primary_boot_sector; - brelse(bh_backup); - } else if (!silent) - ntfs_error(sb, read_err_str, "backup"); - /* Try to read NT3.51- backup boot sector. */ - if ((bh_backup = sb_bread(sb, nr_blocks >> 1))) { - if (is_boot_sector_ntfs(sb, (NTFS_BOOT_SECTOR*) - bh_backup->b_data, silent)) - goto hotfix_primary_boot_sector; - if (!silent) - ntfs_error(sb, "Could not find a valid backup boot " - "sector."); - brelse(bh_backup); - } else if (!silent) - ntfs_error(sb, read_err_str, "backup"); - /* We failed. Cleanup and return. 
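 * (Neither sector 0 nor either backup location, i.e. the last sector
 * or the middle of the device, held a valid boot sector.)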
*/ - if (bh_primary) - brelse(bh_primary); - return NULL; -hotfix_primary_boot_sector: - if (bh_primary) { - /* - * If we managed to read sector zero and the volume is not - * read-only, copy the found, valid backup boot sector to the - * primary boot sector. Note we only copy the actual boot - * sector structure, not the actual whole device sector as that - * may be bigger and would potentially damage the $Boot system - * file (FIXME: Would be nice to know if the backup boot sector - * on a large sector device contains the whole boot loader or - * just the first 512 bytes). - */ - if (!sb_rdonly(sb)) { - ntfs_warning(sb, "Hot-fix: Recovering invalid primary " - "boot sector from backup copy."); - memcpy(bh_primary->b_data, bh_backup->b_data, - NTFS_BLOCK_SIZE); - mark_buffer_dirty(bh_primary); - sync_dirty_buffer(bh_primary); - if (buffer_uptodate(bh_primary)) { - brelse(bh_backup); - return bh_primary; - } - ntfs_error(sb, "Hot-fix: Device write error while " - "recovering primary boot sector."); - } else { - ntfs_warning(sb, "Hot-fix: Recovery of primary boot " - "sector failed: Read-only mount."); - } - brelse(bh_primary); - } - ntfs_warning(sb, "Using backup boot sector."); - return bh_backup; -} - -/** - * parse_ntfs_boot_sector - parse the boot sector and store the data in @vol - * @vol: volume structure to initialise with data from boot sector - * @b: boot sector to parse - * - * Parse the ntfs boot sector @b and store all imporant information therein in - * the ntfs super block @vol. Return 'true' on success and 'false' on error. - */ -static bool parse_ntfs_boot_sector(ntfs_volume *vol, const NTFS_BOOT_SECTOR *b) -{ - unsigned int sectors_per_cluster_bits, nr_hidden_sects; - int clusters_per_mft_record, clusters_per_index_record; - s64 ll; - - vol->sector_size = le16_to_cpu(b->bpb.bytes_per_sector); - vol->sector_size_bits = ffs(vol->sector_size) - 1; - ntfs_debug("vol->sector_size = %i (0x%x)", vol->sector_size, - vol->sector_size); - ntfs_debug("vol->sector_size_bits = %i (0x%x)", vol->sector_size_bits, - vol->sector_size_bits); - if (vol->sector_size < vol->sb->s_blocksize) { - ntfs_error(vol->sb, "Sector size (%i) is smaller than the " - "device block size (%lu). This is not " - "supported. Sorry.", vol->sector_size, - vol->sb->s_blocksize); - return false; - } - ntfs_debug("sectors_per_cluster = 0x%x", b->bpb.sectors_per_cluster); - sectors_per_cluster_bits = ffs(b->bpb.sectors_per_cluster) - 1; - ntfs_debug("sectors_per_cluster_bits = 0x%x", - sectors_per_cluster_bits); - nr_hidden_sects = le32_to_cpu(b->bpb.hidden_sectors); - ntfs_debug("number of hidden sectors = 0x%x", nr_hidden_sects); - vol->cluster_size = vol->sector_size << sectors_per_cluster_bits; - vol->cluster_size_mask = vol->cluster_size - 1; - vol->cluster_size_bits = ffs(vol->cluster_size) - 1; - ntfs_debug("vol->cluster_size = %i (0x%x)", vol->cluster_size, - vol->cluster_size); - ntfs_debug("vol->cluster_size_mask = 0x%x", vol->cluster_size_mask); - ntfs_debug("vol->cluster_size_bits = %i", vol->cluster_size_bits); - if (vol->cluster_size < vol->sector_size) { - ntfs_error(vol->sb, "Cluster size (%i) is smaller than the " - "sector size (%i). This is not supported. 
" - "Sorry.", vol->cluster_size, vol->sector_size); - return false; - } - clusters_per_mft_record = b->clusters_per_mft_record; - ntfs_debug("clusters_per_mft_record = %i (0x%x)", - clusters_per_mft_record, clusters_per_mft_record); - if (clusters_per_mft_record > 0) - vol->mft_record_size = vol->cluster_size << - (ffs(clusters_per_mft_record) - 1); - else - /* - * When mft_record_size < cluster_size, clusters_per_mft_record - * = -log2(mft_record_size) bytes. mft_record_size normaly is - * 1024 bytes, which is encoded as 0xF6 (-10 in decimal). - */ - vol->mft_record_size = 1 << -clusters_per_mft_record; - vol->mft_record_size_mask = vol->mft_record_size - 1; - vol->mft_record_size_bits = ffs(vol->mft_record_size) - 1; - ntfs_debug("vol->mft_record_size = %i (0x%x)", vol->mft_record_size, - vol->mft_record_size); - ntfs_debug("vol->mft_record_size_mask = 0x%x", - vol->mft_record_size_mask); - ntfs_debug("vol->mft_record_size_bits = %i (0x%x)", - vol->mft_record_size_bits, vol->mft_record_size_bits); - /* - * We cannot support mft record sizes above the PAGE_SIZE since - * we store $MFT/$DATA, the table of mft records in the page cache. - */ - if (vol->mft_record_size > PAGE_SIZE) { - ntfs_error(vol->sb, "Mft record size (%i) exceeds the " - "PAGE_SIZE on your system (%lu). " - "This is not supported. Sorry.", - vol->mft_record_size, PAGE_SIZE); - return false; - } - /* We cannot support mft record sizes below the sector size. */ - if (vol->mft_record_size < vol->sector_size) { - ntfs_error(vol->sb, "Mft record size (%i) is smaller than the " - "sector size (%i). This is not supported. " - "Sorry.", vol->mft_record_size, - vol->sector_size); - return false; - } - clusters_per_index_record = b->clusters_per_index_record; - ntfs_debug("clusters_per_index_record = %i (0x%x)", - clusters_per_index_record, clusters_per_index_record); - if (clusters_per_index_record > 0) - vol->index_record_size = vol->cluster_size << - (ffs(clusters_per_index_record) - 1); - else - /* - * When index_record_size < cluster_size, - * clusters_per_index_record = -log2(index_record_size) bytes. - * index_record_size normaly equals 4096 bytes, which is - * encoded as 0xF4 (-12 in decimal). - */ - vol->index_record_size = 1 << -clusters_per_index_record; - vol->index_record_size_mask = vol->index_record_size - 1; - vol->index_record_size_bits = ffs(vol->index_record_size) - 1; - ntfs_debug("vol->index_record_size = %i (0x%x)", - vol->index_record_size, vol->index_record_size); - ntfs_debug("vol->index_record_size_mask = 0x%x", - vol->index_record_size_mask); - ntfs_debug("vol->index_record_size_bits = %i (0x%x)", - vol->index_record_size_bits, - vol->index_record_size_bits); - /* We cannot support index record sizes below the sector size. */ - if (vol->index_record_size < vol->sector_size) { - ntfs_error(vol->sb, "Index record size (%i) is smaller than " - "the sector size (%i). This is not " - "supported. Sorry.", vol->index_record_size, - vol->sector_size); - return false; - } - /* - * Get the size of the volume in clusters and check for 64-bit-ness. - * Windows currently only uses 32 bits to save the clusters so we do - * the same as it is much faster on 32-bit CPUs. - */ - ll = sle64_to_cpu(b->number_of_sectors) >> sectors_per_cluster_bits; - if ((u64)ll >= 1ULL << 32) { - ntfs_error(vol->sb, "Cannot handle 64-bit clusters. 
Sorry."); - return false; - } - vol->nr_clusters = ll; - ntfs_debug("vol->nr_clusters = 0x%llx", (long long)vol->nr_clusters); - /* - * On an architecture where unsigned long is 32-bits, we restrict the - * volume size to 2TiB (2^41). On a 64-bit architecture, the compiler - * will hopefully optimize the whole check away. - */ - if (sizeof(unsigned long) < 8) { - if ((ll << vol->cluster_size_bits) >= (1ULL << 41)) { - ntfs_error(vol->sb, "Volume size (%lluTiB) is too " - "large for this architecture. " - "Maximum supported is 2TiB. Sorry.", - (unsigned long long)ll >> (40 - - vol->cluster_size_bits)); - return false; - } - } - ll = sle64_to_cpu(b->mft_lcn); - if (ll >= vol->nr_clusters) { - ntfs_error(vol->sb, "MFT LCN (%lli, 0x%llx) is beyond end of " - "volume. Weird.", (unsigned long long)ll, - (unsigned long long)ll); - return false; - } - vol->mft_lcn = ll; - ntfs_debug("vol->mft_lcn = 0x%llx", (long long)vol->mft_lcn); - ll = sle64_to_cpu(b->mftmirr_lcn); - if (ll >= vol->nr_clusters) { - ntfs_error(vol->sb, "MFTMirr LCN (%lli, 0x%llx) is beyond end " - "of volume. Weird.", (unsigned long long)ll, - (unsigned long long)ll); - return false; - } - vol->mftmirr_lcn = ll; - ntfs_debug("vol->mftmirr_lcn = 0x%llx", (long long)vol->mftmirr_lcn); -#ifdef NTFS_RW - /* - * Work out the size of the mft mirror in number of mft records. If the - * cluster size is less than or equal to the size taken by four mft - * records, the mft mirror stores the first four mft records. If the - * cluster size is bigger than the size taken by four mft records, the - * mft mirror contains as many mft records as will fit into one - * cluster. - */ - if (vol->cluster_size <= (4 << vol->mft_record_size_bits)) - vol->mftmirr_size = 4; - else - vol->mftmirr_size = vol->cluster_size >> - vol->mft_record_size_bits; - ntfs_debug("vol->mftmirr_size = %i", vol->mftmirr_size); -#endif /* NTFS_RW */ - vol->serial_no = le64_to_cpu(b->volume_serial_number); - ntfs_debug("vol->serial_no = 0x%llx", - (unsigned long long)vol->serial_no); - return true; -} - -/** - * ntfs_setup_allocators - initialize the cluster and mft allocators - * @vol: volume structure for which to setup the allocators - * - * Setup the cluster (lcn) and mft allocators to the starting values. - */ -static void ntfs_setup_allocators(ntfs_volume *vol) -{ -#ifdef NTFS_RW - LCN mft_zone_size, mft_lcn; -#endif /* NTFS_RW */ - - ntfs_debug("vol->mft_zone_multiplier = 0x%x", - vol->mft_zone_multiplier); -#ifdef NTFS_RW - /* Determine the size of the MFT zone. */ - mft_zone_size = vol->nr_clusters; - switch (vol->mft_zone_multiplier) { /* % of volume size in clusters */ - case 4: - mft_zone_size >>= 1; /* 50% */ - break; - case 3: - mft_zone_size = (mft_zone_size + - (mft_zone_size >> 1)) >> 2; /* 37.5% */ - break; - case 2: - mft_zone_size >>= 2; /* 25% */ - break; - /* case 1: */ - default: - mft_zone_size >>= 3; /* 12.5% */ - break; - } - /* Setup the mft zone. */ - vol->mft_zone_start = vol->mft_zone_pos = vol->mft_lcn; - ntfs_debug("vol->mft_zone_pos = 0x%llx", - (unsigned long long)vol->mft_zone_pos); - /* - * Calculate the mft_lcn for an unmodified NTFS volume (see mkntfs - * source) and if the actual mft_lcn is in the expected place or even - * further to the front of the volume, extend the mft_zone to cover the - * beginning of the volume as well. This is in order to protect the - * area reserved for the mft bitmap as well within the mft_zone itself. 
- * On non-standard volumes we do not protect it as the overhead would - * be higher than the speed increase we would get by doing it. - */ - mft_lcn = (8192 + 2 * vol->cluster_size - 1) / vol->cluster_size; - if (mft_lcn * vol->cluster_size < 16 * 1024) - mft_lcn = (16 * 1024 + vol->cluster_size - 1) / - vol->cluster_size; - if (vol->mft_zone_start <= mft_lcn) - vol->mft_zone_start = 0; - ntfs_debug("vol->mft_zone_start = 0x%llx", - (unsigned long long)vol->mft_zone_start); - /* - * Need to cap the mft zone on non-standard volumes so that it does - * not point outside the boundaries of the volume. We do this by - * halving the zone size until we are inside the volume. - */ - vol->mft_zone_end = vol->mft_lcn + mft_zone_size; - while (vol->mft_zone_end >= vol->nr_clusters) { - mft_zone_size >>= 1; - vol->mft_zone_end = vol->mft_lcn + mft_zone_size; - } - ntfs_debug("vol->mft_zone_end = 0x%llx", - (unsigned long long)vol->mft_zone_end); - /* - * Set the current position within each data zone to the start of the - * respective zone. - */ - vol->data1_zone_pos = vol->mft_zone_end; - ntfs_debug("vol->data1_zone_pos = 0x%llx", - (unsigned long long)vol->data1_zone_pos); - vol->data2_zone_pos = 0; - ntfs_debug("vol->data2_zone_pos = 0x%llx", - (unsigned long long)vol->data2_zone_pos); - - /* Set the mft data allocation position to mft record 24. */ - vol->mft_data_pos = 24; - ntfs_debug("vol->mft_data_pos = 0x%llx", - (unsigned long long)vol->mft_data_pos); -#endif /* NTFS_RW */ -} - -#ifdef NTFS_RW - -/** - * load_and_init_mft_mirror - load and setup the mft mirror inode for a volume - * @vol: ntfs super block describing device whose mft mirror to load - * - * Return 'true' on success or 'false' on error. - */ -static bool load_and_init_mft_mirror(ntfs_volume *vol) -{ - struct inode *tmp_ino; - ntfs_inode *tmp_ni; - - ntfs_debug("Entering."); - /* Get mft mirror inode. */ - tmp_ino = ntfs_iget(vol->sb, FILE_MFTMirr); - if (IS_ERR(tmp_ino) || is_bad_inode(tmp_ino)) { - if (!IS_ERR(tmp_ino)) - iput(tmp_ino); - /* Caller will display error message. */ - return false; - } - /* - * Re-initialize some specifics about $MFTMirr's inode as - * ntfs_read_inode() will have set up the default ones. - */ - /* Set uid and gid to root. */ - tmp_ino->i_uid = GLOBAL_ROOT_UID; - tmp_ino->i_gid = GLOBAL_ROOT_GID; - /* Regular file. No access for anyone. */ - tmp_ino->i_mode = S_IFREG; - /* No VFS initiated operations allowed for $MFTMirr. */ - tmp_ino->i_op = &ntfs_empty_inode_ops; - tmp_ino->i_fop = &ntfs_empty_file_ops; - /* Put in our special address space operations. */ - tmp_ino->i_mapping->a_ops = &ntfs_mst_aops; - tmp_ni = NTFS_I(tmp_ino); - /* The $MFTMirr, like the $MFT is multi sector transfer protected. */ - NInoSetMstProtected(tmp_ni); - NInoSetSparseDisabled(tmp_ni); - /* - * Set up our little cheat allowing us to reuse the async read io - * completion handler for directories. - */ - tmp_ni->itype.index.block_size = vol->mft_record_size; - tmp_ni->itype.index.block_size_bits = vol->mft_record_size_bits; - vol->mftmirr_ino = tmp_ino; - ntfs_debug("Done."); - return true; -} - -/** - * check_mft_mirror - compare contents of the mft mirror with the mft - * @vol: ntfs super block describing device whose mft mirror to check - * - * Return 'true' on success or 'false' on error. - * - * Note, this function also results in the mft mirror runlist being completely - * mapped into memory. The mft mirror write code requires this and will BUG() - * should it find an unmapped runlist element. 
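 *
 * [Editor's note: a worked example for the zone sizing above, not
 *  part of the original source. With the default multiplier (12.5%)
 *  on a volume of 1,000,000 clusters, mft_zone_size = 1000000 >> 3 =
 *  125000 clusters. Should mft_lcn + 125000 reach past nr_clusters,
 *  the capping loop halves the zone (62500, 31250, ...) until
 *  mft_zone_end fits inside the volume.]
 *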
- */ -static bool check_mft_mirror(ntfs_volume *vol) -{ - struct super_block *sb = vol->sb; - ntfs_inode *mirr_ni; - struct page *mft_page, *mirr_page; - u8 *kmft, *kmirr; - runlist_element *rl, rl2[2]; - pgoff_t index; - int mrecs_per_page, i; - - ntfs_debug("Entering."); - /* Compare contents of $MFT and $MFTMirr. */ - mrecs_per_page = PAGE_SIZE / vol->mft_record_size; - BUG_ON(!mrecs_per_page); - BUG_ON(!vol->mftmirr_size); - mft_page = mirr_page = NULL; - kmft = kmirr = NULL; - index = i = 0; - do { - u32 bytes; - - /* Switch pages if necessary. */ - if (!(i % mrecs_per_page)) { - if (index) { - ntfs_unmap_page(mft_page); - ntfs_unmap_page(mirr_page); - } - /* Get the $MFT page. */ - mft_page = ntfs_map_page(vol->mft_ino->i_mapping, - index); - if (IS_ERR(mft_page)) { - ntfs_error(sb, "Failed to read $MFT."); - return false; - } - kmft = page_address(mft_page); - /* Get the $MFTMirr page. */ - mirr_page = ntfs_map_page(vol->mftmirr_ino->i_mapping, - index); - if (IS_ERR(mirr_page)) { - ntfs_error(sb, "Failed to read $MFTMirr."); - goto mft_unmap_out; - } - kmirr = page_address(mirr_page); - ++index; - } - /* Do not check the record if it is not in use. */ - if (((MFT_RECORD*)kmft)->flags & MFT_RECORD_IN_USE) { - /* Make sure the record is ok. */ - if (ntfs_is_baad_recordp((le32*)kmft)) { - ntfs_error(sb, "Incomplete multi sector " - "transfer detected in mft " - "record %i.", i); -mm_unmap_out: - ntfs_unmap_page(mirr_page); -mft_unmap_out: - ntfs_unmap_page(mft_page); - return false; - } - } - /* Do not check the mirror record if it is not in use. */ - if (((MFT_RECORD*)kmirr)->flags & MFT_RECORD_IN_USE) { - if (ntfs_is_baad_recordp((le32*)kmirr)) { - ntfs_error(sb, "Incomplete multi sector " - "transfer detected in mft " - "mirror record %i.", i); - goto mm_unmap_out; - } - } - /* Get the amount of data in the current record. */ - bytes = le32_to_cpu(((MFT_RECORD*)kmft)->bytes_in_use); - if (bytes < sizeof(MFT_RECORD_OLD) || - bytes > vol->mft_record_size || - ntfs_is_baad_recordp((le32*)kmft)) { - bytes = le32_to_cpu(((MFT_RECORD*)kmirr)->bytes_in_use); - if (bytes < sizeof(MFT_RECORD_OLD) || - bytes > vol->mft_record_size || - ntfs_is_baad_recordp((le32*)kmirr)) - bytes = vol->mft_record_size; - } - /* Compare the two records. */ - if (memcmp(kmft, kmirr, bytes)) { - ntfs_error(sb, "$MFT and $MFTMirr (record %i) do not " - "match. Run ntfsfix or chkdsk.", i); - goto mm_unmap_out; - } - kmft += vol->mft_record_size; - kmirr += vol->mft_record_size; - } while (++i < vol->mftmirr_size); - /* Release the last pages. */ - ntfs_unmap_page(mft_page); - ntfs_unmap_page(mirr_page); - - /* Construct the mft mirror runlist by hand. */ - rl2[0].vcn = 0; - rl2[0].lcn = vol->mftmirr_lcn; - rl2[0].length = (vol->mftmirr_size * vol->mft_record_size + - vol->cluster_size - 1) / vol->cluster_size; - rl2[1].vcn = rl2[0].length; - rl2[1].lcn = LCN_ENOENT; - rl2[1].length = 0; - /* - * Because we have just read all of the mft mirror, we know we have - * mapped the full runlist for it. - */ - mirr_ni = NTFS_I(vol->mftmirr_ino); - down_read(&mirr_ni->runlist.lock); - rl = mirr_ni->runlist.rl; - /* Compare the two runlists. They must be identical. */ - i = 0; - do { - if (rl2[i].vcn != rl[i].vcn || rl2[i].lcn != rl[i].lcn || - rl2[i].length != rl[i].length) { - ntfs_error(sb, "$MFTMirr location mismatch. 
" - "Run chkdsk."); - up_read(&mirr_ni->runlist.lock); - return false; - } - } while (rl2[i++].length); - up_read(&mirr_ni->runlist.lock); - ntfs_debug("Done."); - return true; -} - -/** - * load_and_check_logfile - load and check the logfile inode for a volume - * @vol: ntfs super block describing device whose logfile to load - * - * Return 'true' on success or 'false' on error. - */ -static bool load_and_check_logfile(ntfs_volume *vol, - RESTART_PAGE_HEADER **rp) -{ - struct inode *tmp_ino; - - ntfs_debug("Entering."); - tmp_ino = ntfs_iget(vol->sb, FILE_LogFile); - if (IS_ERR(tmp_ino) || is_bad_inode(tmp_ino)) { - if (!IS_ERR(tmp_ino)) - iput(tmp_ino); - /* Caller will display error message. */ - return false; - } - if (!ntfs_check_logfile(tmp_ino, rp)) { - iput(tmp_ino); - /* ntfs_check_logfile() will have displayed error output. */ - return false; - } - NInoSetSparseDisabled(NTFS_I(tmp_ino)); - vol->logfile_ino = tmp_ino; - ntfs_debug("Done."); - return true; -} - -#define NTFS_HIBERFIL_HEADER_SIZE 4096 - -/** - * check_windows_hibernation_status - check if Windows is suspended on a volume - * @vol: ntfs super block of device to check - * - * Check if Windows is hibernated on the ntfs volume @vol. This is done by - * looking for the file hiberfil.sys in the root directory of the volume. If - * the file is not present Windows is definitely not suspended. - * - * If hiberfil.sys exists and is less than 4kiB in size it means Windows is - * definitely suspended (this volume is not the system volume). Caveat: on a - * system with many volumes it is possible that the < 4kiB check is bogus but - * for now this should do fine. - * - * If hiberfil.sys exists and is larger than 4kiB in size, we need to read the - * hiberfil header (which is the first 4kiB). If this begins with "hibr", - * Windows is definitely suspended. If it is completely full of zeroes, - * Windows is definitely not hibernated. Any other case is treated as if - * Windows is suspended. This caters for the above mentioned caveat of a - * system with many volumes where no "hibr" magic would be present and there is - * no zero header. - * - * Return 0 if Windows is not hibernated on the volume, >0 if Windows is - * hibernated on the volume, and -errno on error. - */ -static int check_windows_hibernation_status(ntfs_volume *vol) -{ - MFT_REF mref; - struct inode *vi; - struct page *page; - u32 *kaddr, *kend; - ntfs_name *name = NULL; - int ret = 1; - static const ntfschar hiberfil[13] = { cpu_to_le16('h'), - cpu_to_le16('i'), cpu_to_le16('b'), - cpu_to_le16('e'), cpu_to_le16('r'), - cpu_to_le16('f'), cpu_to_le16('i'), - cpu_to_le16('l'), cpu_to_le16('.'), - cpu_to_le16('s'), cpu_to_le16('y'), - cpu_to_le16('s'), 0 }; - - ntfs_debug("Entering."); - /* - * Find the inode number for the hibernation file by looking up the - * filename hiberfil.sys in the root directory. - */ - inode_lock(vol->root_ino); - mref = ntfs_lookup_inode_by_name(NTFS_I(vol->root_ino), hiberfil, 12, - &name); - inode_unlock(vol->root_ino); - if (IS_ERR_MREF(mref)) { - ret = MREF_ERR(mref); - /* If the file does not exist, Windows is not hibernated. */ - if (ret == -ENOENT) { - ntfs_debug("hiberfil.sys not present. Windows is not " - "hibernated on the volume."); - return 0; - } - /* A real error occurred. */ - ntfs_error(vol->sb, "Failed to find inode number for " - "hiberfil.sys."); - return ret; - } - /* We do not care for the type of match that was found. */ - kfree(name); - /* Get the inode. 
*/ - vi = ntfs_iget(vol->sb, MREF(mref)); - if (IS_ERR(vi) || is_bad_inode(vi)) { - if (!IS_ERR(vi)) - iput(vi); - ntfs_error(vol->sb, "Failed to load hiberfil.sys."); - return IS_ERR(vi) ? PTR_ERR(vi) : -EIO; - } - if (unlikely(i_size_read(vi) < NTFS_HIBERFIL_HEADER_SIZE)) { - ntfs_debug("hiberfil.sys is smaller than 4kiB (0x%llx). " - "Windows is hibernated on the volume. This " - "is not the system volume.", i_size_read(vi)); - goto iput_out; - } - page = ntfs_map_page(vi->i_mapping, 0); - if (IS_ERR(page)) { - ntfs_error(vol->sb, "Failed to read from hiberfil.sys."); - ret = PTR_ERR(page); - goto iput_out; - } - kaddr = (u32*)page_address(page); - if (*(le32*)kaddr == cpu_to_le32(0x72626968)/*'hibr'*/) { - ntfs_debug("Magic \"hibr\" found in hiberfil.sys. Windows is " - "hibernated on the volume. This is the " - "system volume."); - goto unm_iput_out; - } - kend = kaddr + NTFS_HIBERFIL_HEADER_SIZE/sizeof(*kaddr); - do { - if (unlikely(*kaddr)) { - ntfs_debug("hiberfil.sys is larger than 4kiB " - "(0x%llx), does not contain the " - "\"hibr\" magic, and does not have a " - "zero header. Windows is hibernated " - "on the volume. This is not the " - "system volume.", i_size_read(vi)); - goto unm_iput_out; - } - } while (++kaddr < kend); - ntfs_debug("hiberfil.sys contains a zero header. Windows is not " - "hibernated on the volume. This is the system " - "volume."); - ret = 0; -unm_iput_out: - ntfs_unmap_page(page); -iput_out: - iput(vi); - return ret; -} - -/** - * load_and_init_quota - load and setup the quota file for a volume if present - * @vol: ntfs super block describing device whose quota file to load - * - * Return 'true' on success or 'false' on error. If $Quota is not present, we - * leave vol->quota_ino as NULL and return success. - */ -static bool load_and_init_quota(ntfs_volume *vol) -{ - MFT_REF mref; - struct inode *tmp_ino; - ntfs_name *name = NULL; - static const ntfschar Quota[7] = { cpu_to_le16('$'), - cpu_to_le16('Q'), cpu_to_le16('u'), - cpu_to_le16('o'), cpu_to_le16('t'), - cpu_to_le16('a'), 0 }; - static ntfschar Q[3] = { cpu_to_le16('$'), - cpu_to_le16('Q'), 0 }; - - ntfs_debug("Entering."); - /* - * Find the inode number for the quota file by looking up the filename - * $Quota in the extended system files directory $Extend. - */ - inode_lock(vol->extend_ino); - mref = ntfs_lookup_inode_by_name(NTFS_I(vol->extend_ino), Quota, 6, - &name); - inode_unlock(vol->extend_ino); - if (IS_ERR_MREF(mref)) { - /* - * If the file does not exist, quotas are disabled and have - * never been enabled on this volume, just return success. - */ - if (MREF_ERR(mref) == -ENOENT) { - ntfs_debug("$Quota not present. Volume does not have " - "quotas enabled."); - /* - * No need to try to set quotas out of date if they are - * not enabled. - */ - NVolSetQuotaOutOfDate(vol); - return true; - } - /* A real error occurred. */ - ntfs_error(vol->sb, "Failed to find inode number for $Quota."); - return false; - } - /* We do not care for the type of match that was found. */ - kfree(name); - /* Get the inode. */ - tmp_ino = ntfs_iget(vol->sb, MREF(mref)); - if (IS_ERR(tmp_ino) || is_bad_inode(tmp_ino)) { - if (!IS_ERR(tmp_ino)) - iput(tmp_ino); - ntfs_error(vol->sb, "Failed to load $Quota."); - return false; - } - vol->quota_ino = tmp_ino; - /* Get the $Q index allocation attribute. 
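 *
 * [Editor's note: a minimal sketch of the decision logic above, not
 *  part of the original source; the helper name is made up. The
 *  first 4kiB of hiberfil.sys decides: the "hibr" magic means
 *  hibernated, an all-zero header means not hibernated, and any
 *  other content is treated as hibernated to err on the safe side.]
 *
 *	static int hiberfil_header_state(const le32 *kaddr)
 *	{
 *		const le32 *kend = kaddr +
 *				NTFS_HIBERFIL_HEADER_SIZE / sizeof(*kaddr);
 *
 *		if (*kaddr == cpu_to_le32(0x72626968))	// "hibr"
 *			return 1;	// hibernated, system volume
 *		do {
 *			if (*kaddr)
 *				return 1;	// unknown, assume hibernated
 *		} while (++kaddr < kend);
 *		return 0;	// zero header, not hibernated
 *	}
 *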
*/ - tmp_ino = ntfs_index_iget(vol->quota_ino, Q, 2); - if (IS_ERR(tmp_ino)) { - ntfs_error(vol->sb, "Failed to load $Quota/$Q index."); - return false; - } - vol->quota_q_ino = tmp_ino; - ntfs_debug("Done."); - return true; -} - -/** - * load_and_init_usnjrnl - load and setup the transaction log if present - * @vol: ntfs super block describing device whose usnjrnl file to load - * - * Return 'true' on success or 'false' on error. - * - * If $UsnJrnl is not present or in the process of being disabled, we set - * NVolUsnJrnlStamped() and return success. - * - * If the $UsnJrnl $DATA/$J attribute has a size equal to the lowest valid usn, - * i.e. transaction logging has only just been enabled or the journal has been - * stamped and nothing has been logged since, we also set NVolUsnJrnlStamped() - * and return success. - */ -static bool load_and_init_usnjrnl(ntfs_volume *vol) -{ - MFT_REF mref; - struct inode *tmp_ino; - ntfs_inode *tmp_ni; - struct page *page; - ntfs_name *name = NULL; - USN_HEADER *uh; - static const ntfschar UsnJrnl[9] = { cpu_to_le16('$'), - cpu_to_le16('U'), cpu_to_le16('s'), - cpu_to_le16('n'), cpu_to_le16('J'), - cpu_to_le16('r'), cpu_to_le16('n'), - cpu_to_le16('l'), 0 }; - static ntfschar Max[5] = { cpu_to_le16('$'), - cpu_to_le16('M'), cpu_to_le16('a'), - cpu_to_le16('x'), 0 }; - static ntfschar J[3] = { cpu_to_le16('$'), - cpu_to_le16('J'), 0 }; - - ntfs_debug("Entering."); - /* - * Find the inode number for the transaction log file by looking up the - * filename $UsnJrnl in the extended system files directory $Extend. - */ - inode_lock(vol->extend_ino); - mref = ntfs_lookup_inode_by_name(NTFS_I(vol->extend_ino), UsnJrnl, 8, - &name); - inode_unlock(vol->extend_ino); - if (IS_ERR_MREF(mref)) { - /* - * If the file does not exist, transaction logging is disabled, - * just return success. - */ - if (MREF_ERR(mref) == -ENOENT) { - ntfs_debug("$UsnJrnl not present. Volume does not " - "have transaction logging enabled."); -not_enabled: - /* - * No need to try to stamp the transaction log if - * transaction logging is not enabled. - */ - NVolSetUsnJrnlStamped(vol); - return true; - } - /* A real error occurred. */ - ntfs_error(vol->sb, "Failed to find inode number for " - "$UsnJrnl."); - return false; - } - /* We do not care for the type of match that was found. */ - kfree(name); - /* Get the inode. */ - tmp_ino = ntfs_iget(vol->sb, MREF(mref)); - if (IS_ERR(tmp_ino) || unlikely(is_bad_inode(tmp_ino))) { - if (!IS_ERR(tmp_ino)) - iput(tmp_ino); - ntfs_error(vol->sb, "Failed to load $UsnJrnl."); - return false; - } - vol->usnjrnl_ino = tmp_ino; - /* - * If the transaction log is in the process of being deleted, we can - * ignore it. - */ - if (unlikely(vol->vol_flags & VOLUME_DELETE_USN_UNDERWAY)) { - ntfs_debug("$UsnJrnl in the process of being disabled. " - "Volume does not have transaction logging " - "enabled."); - goto not_enabled; - } - /* Get the $DATA/$Max attribute. */ - tmp_ino = ntfs_attr_iget(vol->usnjrnl_ino, AT_DATA, Max, 4); - if (IS_ERR(tmp_ino)) { - ntfs_error(vol->sb, "Failed to load $UsnJrnl/$DATA/$Max " - "attribute."); - return false; - } - vol->usnjrnl_max_ino = tmp_ino; - if (unlikely(i_size_read(tmp_ino) < sizeof(USN_HEADER))) { - ntfs_error(vol->sb, "Found corrupt $UsnJrnl/$DATA/$Max " - "attribute (size is 0x%llx but should be at " - "least 0x%zx bytes).", i_size_read(tmp_ino), - sizeof(USN_HEADER)); - return false; - } - /* Get the $DATA/$J attribute. 
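 *
 * [Editor's note: the lookup pattern shared by $Quota and $UsnJrnl
 *  above, reduced to its shape; a sketch, not part of the original
 *  source, and "$Foo" is a made-up name. File names are spelled out
 *  as little-endian ntfschar arrays and resolved under the lock of
 *  the $Extend directory.]
 *
 *	static const ntfschar Foo[5] = { cpu_to_le16('$'),
 *			cpu_to_le16('F'), cpu_to_le16('o'),
 *			cpu_to_le16('o'), 0 };
 *	ntfs_name *match = NULL;
 *	MFT_REF mref;
 *
 *	inode_lock(vol->extend_ino);
 *	mref = ntfs_lookup_inode_by_name(NTFS_I(vol->extend_ino),
 *			Foo, 4, &match);
 *	inode_unlock(vol->extend_ino);
 *	// IS_ERR_MREF(mref) with -ENOENT: feature absent, not an error.
 *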
*/ - tmp_ino = ntfs_attr_iget(vol->usnjrnl_ino, AT_DATA, J, 2); - if (IS_ERR(tmp_ino)) { - ntfs_error(vol->sb, "Failed to load $UsnJrnl/$DATA/$J " - "attribute."); - return false; - } - vol->usnjrnl_j_ino = tmp_ino; - /* Verify $J is non-resident and sparse. */ - tmp_ni = NTFS_I(vol->usnjrnl_j_ino); - if (unlikely(!NInoNonResident(tmp_ni) || !NInoSparse(tmp_ni))) { - ntfs_error(vol->sb, "$UsnJrnl/$DATA/$J attribute is resident " - "and/or not sparse."); - return false; - } - /* Read the USN_HEADER from $DATA/$Max. */ - page = ntfs_map_page(vol->usnjrnl_max_ino->i_mapping, 0); - if (IS_ERR(page)) { - ntfs_error(vol->sb, "Failed to read from $UsnJrnl/$DATA/$Max " - "attribute."); - return false; - } - uh = (USN_HEADER*)page_address(page); - /* Sanity check the $Max. */ - if (unlikely(sle64_to_cpu(uh->allocation_delta) > - sle64_to_cpu(uh->maximum_size))) { - ntfs_error(vol->sb, "Allocation delta (0x%llx) exceeds " - "maximum size (0x%llx). $UsnJrnl is corrupt.", - (long long)sle64_to_cpu(uh->allocation_delta), - (long long)sle64_to_cpu(uh->maximum_size)); - ntfs_unmap_page(page); - return false; - } - /* - * If the transaction log has been stamped and nothing has been written - * to it since, we do not need to stamp it. - */ - if (unlikely(sle64_to_cpu(uh->lowest_valid_usn) >= - i_size_read(vol->usnjrnl_j_ino))) { - if (likely(sle64_to_cpu(uh->lowest_valid_usn) == - i_size_read(vol->usnjrnl_j_ino))) { - ntfs_unmap_page(page); - ntfs_debug("$UsnJrnl is enabled but nothing has been " - "logged since it was last stamped. " - "Treating this as if the volume does " - "not have transaction logging " - "enabled."); - goto not_enabled; - } - ntfs_error(vol->sb, "$UsnJrnl has lowest valid usn (0x%llx) " - "which is out of bounds (0x%llx). $UsnJrnl " - "is corrupt.", - (long long)sle64_to_cpu(uh->lowest_valid_usn), - i_size_read(vol->usnjrnl_j_ino)); - ntfs_unmap_page(page); - return false; - } - ntfs_unmap_page(page); - ntfs_debug("Done."); - return true; -} - -/** - * load_and_init_attrdef - load the attribute definitions table for a volume - * @vol: ntfs super block describing device whose attrdef to load - * - * Return 'true' on success or 'false' on error. - */ -static bool load_and_init_attrdef(ntfs_volume *vol) -{ - loff_t i_size; - struct super_block *sb = vol->sb; - struct inode *ino; - struct page *page; - pgoff_t index, max_index; - unsigned int size; - - ntfs_debug("Entering."); - /* Read attrdef table and setup vol->attrdef and vol->attrdef_size. */ - ino = ntfs_iget(sb, FILE_AttrDef); - if (IS_ERR(ino) || is_bad_inode(ino)) { - if (!IS_ERR(ino)) - iput(ino); - goto failed; - } - NInoSetSparseDisabled(NTFS_I(ino)); - /* The size of FILE_AttrDef must be above 0 and fit inside 31 bits. */ - i_size = i_size_read(ino); - if (i_size <= 0 || i_size > 0x7fffffff) - goto iput_failed; - vol->attrdef = (ATTR_DEF*)ntfs_malloc_nofs(i_size); - if (!vol->attrdef) - goto iput_failed; - index = 0; - max_index = i_size >> PAGE_SHIFT; - size = PAGE_SIZE; - while (index < max_index) { - /* Read the attrdef table and copy it into the linear buffer. 
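 *
 * [Editor's note: the lowest_valid_usn check above implements a
 *  three-way decision, summarized here; not part of the original
 *  source. With lowest = sle64_to_cpu(uh->lowest_valid_usn) and
 *  j_size = i_size_read(vol->usnjrnl_j_ino):
 *  lowest < j_size means live records (stamp the journal later),
 *  lowest == j_size means stamped and idle (treat as not enabled),
 *  and lowest > j_size means $UsnJrnl is corrupt.]
 *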
*/ -read_partial_attrdef_page: - page = ntfs_map_page(ino->i_mapping, index); - if (IS_ERR(page)) - goto free_iput_failed; - memcpy((u8*)vol->attrdef + (index++ << PAGE_SHIFT), - page_address(page), size); - ntfs_unmap_page(page); - } - if (size == PAGE_SIZE) { - size = i_size & ~PAGE_MASK; - if (size) - goto read_partial_attrdef_page; - } - vol->attrdef_size = i_size; - ntfs_debug("Read %llu bytes from $AttrDef.", i_size); - iput(ino); - return true; -free_iput_failed: - ntfs_free(vol->attrdef); - vol->attrdef = NULL; -iput_failed: - iput(ino); -failed: - ntfs_error(sb, "Failed to initialize attribute definition table."); - return false; -} - -#endif /* NTFS_RW */ - -/** - * load_and_init_upcase - load the upcase table for an ntfs volume - * @vol: ntfs super block describing device whose upcase to load - * - * Return 'true' on success or 'false' on error. - */ -static bool load_and_init_upcase(ntfs_volume *vol) -{ - loff_t i_size; - struct super_block *sb = vol->sb; - struct inode *ino; - struct page *page; - pgoff_t index, max_index; - unsigned int size; - int i, max; - - ntfs_debug("Entering."); - /* Read upcase table and setup vol->upcase and vol->upcase_len. */ - ino = ntfs_iget(sb, FILE_UpCase); - if (IS_ERR(ino) || is_bad_inode(ino)) { - if (!IS_ERR(ino)) - iput(ino); - goto upcase_failed; - } - /* - * The upcase size must not be above 64k Unicode characters, must not - * be zero and must be a multiple of sizeof(ntfschar). - */ - i_size = i_size_read(ino); - if (!i_size || i_size & (sizeof(ntfschar) - 1) || - i_size > 64ULL * 1024 * sizeof(ntfschar)) - goto iput_upcase_failed; - vol->upcase = (ntfschar*)ntfs_malloc_nofs(i_size); - if (!vol->upcase) - goto iput_upcase_failed; - index = 0; - max_index = i_size >> PAGE_SHIFT; - size = PAGE_SIZE; - while (index < max_index) { - /* Read the upcase table and copy it into the linear buffer. */ -read_partial_upcase_page: - page = ntfs_map_page(ino->i_mapping, index); - if (IS_ERR(page)) - goto iput_upcase_failed; - memcpy((char*)vol->upcase + (index++ << PAGE_SHIFT), - page_address(page), size); - ntfs_unmap_page(page); - } - if (size == PAGE_SIZE) { - size = i_size & ~PAGE_MASK; - if (size) - goto read_partial_upcase_page; - } - vol->upcase_len = i_size >> UCHAR_T_SIZE_BITS; - ntfs_debug("Read %llu bytes from $UpCase (expected %zu bytes).", - i_size, 64 * 1024 * sizeof(ntfschar)); - iput(ino); - mutex_lock(&ntfs_lock); - if (!default_upcase) { - ntfs_debug("Using volume specified $UpCase since default is " - "not present."); - mutex_unlock(&ntfs_lock); - return true; - } - max = default_upcase_len; - if (max > vol->upcase_len) - max = vol->upcase_len; - for (i = 0; i < max; i++) - if (vol->upcase[i] != default_upcase[i]) - break; - if (i == max) { - ntfs_free(vol->upcase); - vol->upcase = default_upcase; - vol->upcase_len = max; - ntfs_nr_upcase_users++; - mutex_unlock(&ntfs_lock); - ntfs_debug("Volume specified $UpCase matches default. Using " - "default."); - return true; - } - mutex_unlock(&ntfs_lock); - ntfs_debug("Using volume specified $UpCase since it does not match " - "the default."); - return true; -iput_upcase_failed: - iput(ino); - ntfs_free(vol->upcase); - vol->upcase = NULL; -upcase_failed: - mutex_lock(&ntfs_lock); - if (default_upcase) { - vol->upcase = default_upcase; - vol->upcase_len = default_upcase_len; - ntfs_nr_upcase_users++; - mutex_unlock(&ntfs_lock); - ntfs_error(sb, "Failed to load $UpCase from the volume. 
Using " - "default."); - return true; - } - mutex_unlock(&ntfs_lock); - ntfs_error(sb, "Failed to initialize upcase table."); - return false; -} - -/* - * The lcn and mft bitmap inodes are NTFS-internal inodes with - * their own special locking rules: - */ -static struct lock_class_key - lcnbmp_runlist_lock_key, lcnbmp_mrec_lock_key, - mftbmp_runlist_lock_key, mftbmp_mrec_lock_key; - -/** - * load_system_files - open the system files using normal functions - * @vol: ntfs super block describing device whose system files to load - * - * Open the system files with normal access functions and complete setting up - * the ntfs super block @vol. - * - * Return 'true' on success or 'false' on error. - */ -static bool load_system_files(ntfs_volume *vol) -{ - struct super_block *sb = vol->sb; - MFT_RECORD *m; - VOLUME_INFORMATION *vi; - ntfs_attr_search_ctx *ctx; -#ifdef NTFS_RW - RESTART_PAGE_HEADER *rp; - int err; -#endif /* NTFS_RW */ - - ntfs_debug("Entering."); -#ifdef NTFS_RW - /* Get mft mirror inode compare the contents of $MFT and $MFTMirr. */ - if (!load_and_init_mft_mirror(vol) || !check_mft_mirror(vol)) { - static const char *es1 = "Failed to load $MFTMirr"; - static const char *es2 = "$MFTMirr does not match $MFT"; - static const char *es3 = ". Run ntfsfix and/or chkdsk."; - - /* If a read-write mount, convert it to a read-only mount. */ - if (!sb_rdonly(sb)) { - if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | - ON_ERRORS_CONTINUE))) { - ntfs_error(sb, "%s and neither on_errors=" - "continue nor on_errors=" - "remount-ro was specified%s", - !vol->mftmirr_ino ? es1 : es2, - es3); - goto iput_mirr_err_out; - } - sb->s_flags |= SB_RDONLY; - ntfs_error(sb, "%s. Mounting read-only%s", - !vol->mftmirr_ino ? es1 : es2, es3); - } else - ntfs_warning(sb, "%s. Will not be able to remount " - "read-write%s", - !vol->mftmirr_ino ? es1 : es2, es3); - /* This will prevent a read-write remount. */ - NVolSetErrors(vol); - } -#endif /* NTFS_RW */ - /* Get mft bitmap attribute inode. */ - vol->mftbmp_ino = ntfs_attr_iget(vol->mft_ino, AT_BITMAP, NULL, 0); - if (IS_ERR(vol->mftbmp_ino)) { - ntfs_error(sb, "Failed to load $MFT/$BITMAP attribute."); - goto iput_mirr_err_out; - } - lockdep_set_class(&NTFS_I(vol->mftbmp_ino)->runlist.lock, - &mftbmp_runlist_lock_key); - lockdep_set_class(&NTFS_I(vol->mftbmp_ino)->mrec_lock, - &mftbmp_mrec_lock_key); - /* Read upcase table and setup @vol->upcase and @vol->upcase_len. */ - if (!load_and_init_upcase(vol)) - goto iput_mftbmp_err_out; -#ifdef NTFS_RW - /* - * Read attribute definitions table and setup @vol->attrdef and - * @vol->attrdef_size. - */ - if (!load_and_init_attrdef(vol)) - goto iput_upcase_err_out; -#endif /* NTFS_RW */ - /* - * Get the cluster allocation bitmap inode and verify the size, no - * need for any locking at this stage as we are already running - * exclusively as we are mount in progress task. 
- */ - vol->lcnbmp_ino = ntfs_iget(sb, FILE_Bitmap); - if (IS_ERR(vol->lcnbmp_ino) || is_bad_inode(vol->lcnbmp_ino)) { - if (!IS_ERR(vol->lcnbmp_ino)) - iput(vol->lcnbmp_ino); - goto bitmap_failed; - } - lockdep_set_class(&NTFS_I(vol->lcnbmp_ino)->runlist.lock, - &lcnbmp_runlist_lock_key); - lockdep_set_class(&NTFS_I(vol->lcnbmp_ino)->mrec_lock, - &lcnbmp_mrec_lock_key); - - NInoSetSparseDisabled(NTFS_I(vol->lcnbmp_ino)); - if ((vol->nr_clusters + 7) >> 3 > i_size_read(vol->lcnbmp_ino)) { - iput(vol->lcnbmp_ino); -bitmap_failed: - ntfs_error(sb, "Failed to load $Bitmap."); - goto iput_attrdef_err_out; - } - /* - * Get the volume inode and setup our cache of the volume flags and - * version. - */ - vol->vol_ino = ntfs_iget(sb, FILE_Volume); - if (IS_ERR(vol->vol_ino) || is_bad_inode(vol->vol_ino)) { - if (!IS_ERR(vol->vol_ino)) - iput(vol->vol_ino); -volume_failed: - ntfs_error(sb, "Failed to load $Volume."); - goto iput_lcnbmp_err_out; - } - m = map_mft_record(NTFS_I(vol->vol_ino)); - if (IS_ERR(m)) { -iput_volume_failed: - iput(vol->vol_ino); - goto volume_failed; - } - if (!(ctx = ntfs_attr_get_search_ctx(NTFS_I(vol->vol_ino), m))) { - ntfs_error(sb, "Failed to get attribute search context."); - goto get_ctx_vol_failed; - } - if (ntfs_attr_lookup(AT_VOLUME_INFORMATION, NULL, 0, 0, 0, NULL, 0, - ctx) || ctx->attr->non_resident || ctx->attr->flags) { -err_put_vol: - ntfs_attr_put_search_ctx(ctx); -get_ctx_vol_failed: - unmap_mft_record(NTFS_I(vol->vol_ino)); - goto iput_volume_failed; - } - vi = (VOLUME_INFORMATION*)((char*)ctx->attr + - le16_to_cpu(ctx->attr->data.resident.value_offset)); - /* Some bounds checks. */ - if ((u8*)vi < (u8*)ctx->attr || (u8*)vi + - le32_to_cpu(ctx->attr->data.resident.value_length) > - (u8*)ctx->attr + le32_to_cpu(ctx->attr->length)) - goto err_put_vol; - /* Copy the volume flags and version to the ntfs_volume structure. */ - vol->vol_flags = vi->flags; - vol->major_ver = vi->major_ver; - vol->minor_ver = vi->minor_ver; - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(NTFS_I(vol->vol_ino)); - pr_info("volume version %i.%i.\n", vol->major_ver, - vol->minor_ver); - if (vol->major_ver < 3 && NVolSparseEnabled(vol)) { - ntfs_warning(vol->sb, "Disabling sparse support due to NTFS " - "volume version %i.%i (need at least version " - "3.0).", vol->major_ver, vol->minor_ver); - NVolClearSparseEnabled(vol); - } -#ifdef NTFS_RW - /* Make sure that no unsupported volume flags are set. */ - if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) { - static const char *es1a = "Volume is dirty"; - static const char *es1b = "Volume has been modified by chkdsk"; - static const char *es1c = "Volume has unsupported flags set"; - static const char *es2a = ". Run chkdsk and mount in Windows."; - static const char *es2b = ". Mount in Windows."; - const char *es1, *es2; - - es2 = es2a; - if (vol->vol_flags & VOLUME_IS_DIRTY) - es1 = es1a; - else if (vol->vol_flags & VOLUME_MODIFIED_BY_CHKDSK) { - es1 = es1b; - es2 = es2b; - } else { - es1 = es1c; - ntfs_warning(sb, "Unsupported volume flags 0x%x " - "encountered.", - (unsigned)le16_to_cpu(vol->vol_flags)); - } - /* If a read-write mount, convert it to a read-only mount. */ - if (!sb_rdonly(sb)) { - if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | - ON_ERRORS_CONTINUE))) { - ntfs_error(sb, "%s and neither on_errors=" - "continue nor on_errors=" - "remount-ro was specified%s", - es1, es2); - goto iput_vol_err_out; - } - sb->s_flags |= SB_RDONLY; - ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); - } else - ntfs_warning(sb, "%s. 
Will not be able to remount " - "read-write%s", es1, es2); - /* - * Do not set NVolErrors() because ntfs_remount() re-checks the - * flags which we need to do in case any flags have changed. - */ - } - /* - * Get the inode for the logfile, check it and determine if the volume - * was shutdown cleanly. - */ - rp = NULL; - if (!load_and_check_logfile(vol, &rp) || - !ntfs_is_logfile_clean(vol->logfile_ino, rp)) { - static const char *es1a = "Failed to load $LogFile"; - static const char *es1b = "$LogFile is not clean"; - static const char *es2 = ". Mount in Windows."; - const char *es1; - - es1 = !vol->logfile_ino ? es1a : es1b; - /* If a read-write mount, convert it to a read-only mount. */ - if (!sb_rdonly(sb)) { - if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | - ON_ERRORS_CONTINUE))) { - ntfs_error(sb, "%s and neither on_errors=" - "continue nor on_errors=" - "remount-ro was specified%s", - es1, es2); - if (vol->logfile_ino) { - BUG_ON(!rp); - ntfs_free(rp); - } - goto iput_logfile_err_out; - } - sb->s_flags |= SB_RDONLY; - ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); - } else - ntfs_warning(sb, "%s. Will not be able to remount " - "read-write%s", es1, es2); - /* This will prevent a read-write remount. */ - NVolSetErrors(vol); - } - ntfs_free(rp); -#endif /* NTFS_RW */ - /* Get the root directory inode so we can do path lookups. */ - vol->root_ino = ntfs_iget(sb, FILE_root); - if (IS_ERR(vol->root_ino) || is_bad_inode(vol->root_ino)) { - if (!IS_ERR(vol->root_ino)) - iput(vol->root_ino); - ntfs_error(sb, "Failed to load root directory."); - goto iput_logfile_err_out; - } -#ifdef NTFS_RW - /* - * Check if Windows is suspended to disk on the target volume. If it - * is hibernated, we must not write *anything* to the disk so set - * NVolErrors() without setting the dirty volume flag and mount - * read-only. This will prevent read-write remounting and it will also - * prevent all writes. - */ - err = check_windows_hibernation_status(vol); - if (unlikely(err)) { - static const char *es1a = "Failed to determine if Windows is " - "hibernated"; - static const char *es1b = "Windows is hibernated"; - static const char *es2 = ". Run chkdsk."; - const char *es1; - - es1 = err < 0 ? es1a : es1b; - /* If a read-write mount, convert it to a read-only mount. */ - if (!sb_rdonly(sb)) { - if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | - ON_ERRORS_CONTINUE))) { - ntfs_error(sb, "%s and neither on_errors=" - "continue nor on_errors=" - "remount-ro was specified%s", - es1, es2); - goto iput_root_err_out; - } - sb->s_flags |= SB_RDONLY; - ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); - } else - ntfs_warning(sb, "%s. Will not be able to remount " - "read-write%s", es1, es2); - /* This will prevent a read-write remount. */ - NVolSetErrors(vol); - } - /* If (still) a read-write mount, mark the volume dirty. */ - if (!sb_rdonly(sb) && ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY)) { - static const char *es1 = "Failed to set dirty bit in volume " - "information flags"; - static const char *es2 = ". Run chkdsk."; - - /* Convert to a read-only mount. */ - if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | - ON_ERRORS_CONTINUE))) { - ntfs_error(sb, "%s and neither on_errors=continue nor " - "on_errors=remount-ro was specified%s", - es1, es2); - goto iput_root_err_out; - } - ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); - sb->s_flags |= SB_RDONLY; - /* - * Do not set NVolErrors() because ntfs_remount() might manage - * to set the dirty flag in which case all would be well. 
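 *
 * [Editor's note: the recurring on_errors fallback used throughout
 *  load_system_files(), reduced to its shape; a sketch, not part of
 *  the original source, with the error/warning messages omitted.]
 *
 *	if (!sb_rdonly(sb)) {
 *		if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
 *				ON_ERRORS_CONTINUE)))
 *			goto abort_mount;	// neither option given
 *		sb->s_flags |= SB_RDONLY;	// degrade to read-only
 *	}
 *	NVolSetErrors(vol);	// forbid a later read-write remount
 *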
- */ - } -#if 0 - // TODO: Enable this code once we start modifying anything that is - // different between NTFS 1.2 and 3.x... - /* - * If (still) a read-write mount, set the NT4 compatibility flag on - * newer NTFS version volumes. - */ - if (!(sb->s_flags & SB_RDONLY) && (vol->major_ver > 1) && - ntfs_set_volume_flags(vol, VOLUME_MOUNTED_ON_NT4)) { - static const char *es1 = "Failed to set NT4 compatibility flag"; - static const char *es2 = ". Run chkdsk."; - - /* Convert to a read-only mount. */ - if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | - ON_ERRORS_CONTINUE))) { - ntfs_error(sb, "%s and neither on_errors=continue nor " - "on_errors=remount-ro was specified%s", - es1, es2); - goto iput_root_err_out; - } - ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); - sb->s_flags |= SB_RDONLY; - NVolSetErrors(vol); - } -#endif - /* If (still) a read-write mount, empty the logfile. */ - if (!sb_rdonly(sb) && !ntfs_empty_logfile(vol->logfile_ino)) { - static const char *es1 = "Failed to empty $LogFile"; - static const char *es2 = ". Mount in Windows."; - - /* Convert to a read-only mount. */ - if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | - ON_ERRORS_CONTINUE))) { - ntfs_error(sb, "%s and neither on_errors=continue nor " - "on_errors=remount-ro was specified%s", - es1, es2); - goto iput_root_err_out; - } - ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); - sb->s_flags |= SB_RDONLY; - NVolSetErrors(vol); - } -#endif /* NTFS_RW */ - /* If on NTFS versions before 3.0, we are done. */ - if (unlikely(vol->major_ver < 3)) - return true; - /* NTFS 3.0+ specific initialization. */ - /* Get the security descriptors inode. */ - vol->secure_ino = ntfs_iget(sb, FILE_Secure); - if (IS_ERR(vol->secure_ino) || is_bad_inode(vol->secure_ino)) { - if (!IS_ERR(vol->secure_ino)) - iput(vol->secure_ino); - ntfs_error(sb, "Failed to load $Secure."); - goto iput_root_err_out; - } - // TODO: Initialize security. - /* Get the extended system files' directory inode. */ - vol->extend_ino = ntfs_iget(sb, FILE_Extend); - if (IS_ERR(vol->extend_ino) || is_bad_inode(vol->extend_ino) || - !S_ISDIR(vol->extend_ino->i_mode)) { - if (!IS_ERR(vol->extend_ino)) - iput(vol->extend_ino); - ntfs_error(sb, "Failed to load $Extend."); - goto iput_sec_err_out; - } -#ifdef NTFS_RW - /* Find the quota file, load it if present, and set it up. */ - if (!load_and_init_quota(vol)) { - static const char *es1 = "Failed to load $Quota"; - static const char *es2 = ". Run chkdsk."; - - /* If a read-write mount, convert it to a read-only mount. */ - if (!sb_rdonly(sb)) { - if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | - ON_ERRORS_CONTINUE))) { - ntfs_error(sb, "%s and neither on_errors=" - "continue nor on_errors=" - "remount-ro was specified%s", - es1, es2); - goto iput_quota_err_out; - } - sb->s_flags |= SB_RDONLY; - ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); - } else - ntfs_warning(sb, "%s. Will not be able to remount " - "read-write%s", es1, es2); - /* This will prevent a read-write remount. */ - NVolSetErrors(vol); - } - /* If (still) a read-write mount, mark the quotas out of date. */ - if (!sb_rdonly(sb) && !ntfs_mark_quotas_out_of_date(vol)) { - static const char *es1 = "Failed to mark quotas out of date"; - static const char *es2 = ". Run chkdsk."; - - /* Convert to a read-only mount. 
*/ - if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | - ON_ERRORS_CONTINUE))) { - ntfs_error(sb, "%s and neither on_errors=continue nor " - "on_errors=remount-ro was specified%s", - es1, es2); - goto iput_quota_err_out; - } - ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); - sb->s_flags |= SB_RDONLY; - NVolSetErrors(vol); - } - /* - * Find the transaction log file ($UsnJrnl), load it if present, check - * it, and set it up. - */ - if (!load_and_init_usnjrnl(vol)) { - static const char *es1 = "Failed to load $UsnJrnl"; - static const char *es2 = ". Run chkdsk."; - - /* If a read-write mount, convert it to a read-only mount. */ - if (!sb_rdonly(sb)) { - if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | - ON_ERRORS_CONTINUE))) { - ntfs_error(sb, "%s and neither on_errors=" - "continue nor on_errors=" - "remount-ro was specified%s", - es1, es2); - goto iput_usnjrnl_err_out; - } - sb->s_flags |= SB_RDONLY; - ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); - } else - ntfs_warning(sb, "%s. Will not be able to remount " - "read-write%s", es1, es2); - /* This will prevent a read-write remount. */ - NVolSetErrors(vol); - } - /* If (still) a read-write mount, stamp the transaction log. */ - if (!sb_rdonly(sb) && !ntfs_stamp_usnjrnl(vol)) { - static const char *es1 = "Failed to stamp transaction log " - "($UsnJrnl)"; - static const char *es2 = ". Run chkdsk."; - - /* Convert to a read-only mount. */ - if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | - ON_ERRORS_CONTINUE))) { - ntfs_error(sb, "%s and neither on_errors=continue nor " - "on_errors=remount-ro was specified%s", - es1, es2); - goto iput_usnjrnl_err_out; - } - ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); - sb->s_flags |= SB_RDONLY; - NVolSetErrors(vol); - } -#endif /* NTFS_RW */ - return true; -#ifdef NTFS_RW -iput_usnjrnl_err_out: - iput(vol->usnjrnl_j_ino); - iput(vol->usnjrnl_max_ino); - iput(vol->usnjrnl_ino); -iput_quota_err_out: - iput(vol->quota_q_ino); - iput(vol->quota_ino); - iput(vol->extend_ino); -#endif /* NTFS_RW */ -iput_sec_err_out: - iput(vol->secure_ino); -iput_root_err_out: - iput(vol->root_ino); -iput_logfile_err_out: -#ifdef NTFS_RW - iput(vol->logfile_ino); -iput_vol_err_out: -#endif /* NTFS_RW */ - iput(vol->vol_ino); -iput_lcnbmp_err_out: - iput(vol->lcnbmp_ino); -iput_attrdef_err_out: - vol->attrdef_size = 0; - if (vol->attrdef) { - ntfs_free(vol->attrdef); - vol->attrdef = NULL; - } -#ifdef NTFS_RW -iput_upcase_err_out: -#endif /* NTFS_RW */ - vol->upcase_len = 0; - mutex_lock(&ntfs_lock); - if (vol->upcase == default_upcase) { - ntfs_nr_upcase_users--; - vol->upcase = NULL; - } - mutex_unlock(&ntfs_lock); - if (vol->upcase) { - ntfs_free(vol->upcase); - vol->upcase = NULL; - } -iput_mftbmp_err_out: - iput(vol->mftbmp_ino); -iput_mirr_err_out: -#ifdef NTFS_RW - iput(vol->mftmirr_ino); -#endif /* NTFS_RW */ - return false; -} - -/** - * ntfs_put_super - called by the vfs to unmount a volume - * @sb: vfs superblock of volume to unmount - * - * ntfs_put_super() is called by the VFS (from fs/super.c::do_umount()) when - * the volume is being unmounted (umount system call has been invoked) and it - * releases all inodes and memory belonging to the NTFS specific part of the - * super block. - */ -static void ntfs_put_super(struct super_block *sb) -{ - ntfs_volume *vol = NTFS_SB(sb); - - ntfs_debug("Entering."); - -#ifdef NTFS_RW - /* - * Commit all inodes while they are still open in case some of them - * cause others to be dirtied. - */ - ntfs_commit_inode(vol->vol_ino); - - /* NTFS 3.0+ specific. 
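 *
 * [Editor's note: the error ladder that ends load_system_files()
 *  above releases resources in strict reverse order of acquisition;
 *  a failure at step N jumps to the label that undoes step N-1 and
 *  falls through every earlier label. A sketch of the shape, not
 *  part of the original source:]
 *
 *	if (!step1())
 *		goto out;
 *	if (!step2())
 *		goto undo1;
 *	if (!step3())
 *		goto undo2;
 *	return true;
 * undo2:
 *	undo_step2();
 * undo1:
 *	undo_step1();
 * out:
 *	return false;
 *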
*/ - if (vol->major_ver >= 3) { - if (vol->usnjrnl_j_ino) - ntfs_commit_inode(vol->usnjrnl_j_ino); - if (vol->usnjrnl_max_ino) - ntfs_commit_inode(vol->usnjrnl_max_ino); - if (vol->usnjrnl_ino) - ntfs_commit_inode(vol->usnjrnl_ino); - if (vol->quota_q_ino) - ntfs_commit_inode(vol->quota_q_ino); - if (vol->quota_ino) - ntfs_commit_inode(vol->quota_ino); - if (vol->extend_ino) - ntfs_commit_inode(vol->extend_ino); - if (vol->secure_ino) - ntfs_commit_inode(vol->secure_ino); - } - - ntfs_commit_inode(vol->root_ino); - - down_write(&vol->lcnbmp_lock); - ntfs_commit_inode(vol->lcnbmp_ino); - up_write(&vol->lcnbmp_lock); - - down_write(&vol->mftbmp_lock); - ntfs_commit_inode(vol->mftbmp_ino); - up_write(&vol->mftbmp_lock); - - if (vol->logfile_ino) - ntfs_commit_inode(vol->logfile_ino); - - if (vol->mftmirr_ino) - ntfs_commit_inode(vol->mftmirr_ino); - ntfs_commit_inode(vol->mft_ino); - - /* - * If a read-write mount and no volume errors have occurred, mark the - * volume clean. Also, re-commit all affected inodes. - */ - if (!sb_rdonly(sb)) { - if (!NVolErrors(vol)) { - if (ntfs_clear_volume_flags(vol, VOLUME_IS_DIRTY)) - ntfs_warning(sb, "Failed to clear dirty bit " - "in volume information " - "flags. Run chkdsk."); - ntfs_commit_inode(vol->vol_ino); - ntfs_commit_inode(vol->root_ino); - if (vol->mftmirr_ino) - ntfs_commit_inode(vol->mftmirr_ino); - ntfs_commit_inode(vol->mft_ino); - } else { - ntfs_warning(sb, "Volume has errors. Leaving volume " - "marked dirty. Run chkdsk."); - } - } -#endif /* NTFS_RW */ - - iput(vol->vol_ino); - vol->vol_ino = NULL; - - /* NTFS 3.0+ specific clean up. */ - if (vol->major_ver >= 3) { -#ifdef NTFS_RW - if (vol->usnjrnl_j_ino) { - iput(vol->usnjrnl_j_ino); - vol->usnjrnl_j_ino = NULL; - } - if (vol->usnjrnl_max_ino) { - iput(vol->usnjrnl_max_ino); - vol->usnjrnl_max_ino = NULL; - } - if (vol->usnjrnl_ino) { - iput(vol->usnjrnl_ino); - vol->usnjrnl_ino = NULL; - } - if (vol->quota_q_ino) { - iput(vol->quota_q_ino); - vol->quota_q_ino = NULL; - } - if (vol->quota_ino) { - iput(vol->quota_ino); - vol->quota_ino = NULL; - } -#endif /* NTFS_RW */ - if (vol->extend_ino) { - iput(vol->extend_ino); - vol->extend_ino = NULL; - } - if (vol->secure_ino) { - iput(vol->secure_ino); - vol->secure_ino = NULL; - } - } - - iput(vol->root_ino); - vol->root_ino = NULL; - - down_write(&vol->lcnbmp_lock); - iput(vol->lcnbmp_ino); - vol->lcnbmp_ino = NULL; - up_write(&vol->lcnbmp_lock); - - down_write(&vol->mftbmp_lock); - iput(vol->mftbmp_ino); - vol->mftbmp_ino = NULL; - up_write(&vol->mftbmp_lock); - -#ifdef NTFS_RW - if (vol->logfile_ino) { - iput(vol->logfile_ino); - vol->logfile_ino = NULL; - } - if (vol->mftmirr_ino) { - /* Re-commit the mft mirror and mft just in case. */ - ntfs_commit_inode(vol->mftmirr_ino); - ntfs_commit_inode(vol->mft_ino); - iput(vol->mftmirr_ino); - vol->mftmirr_ino = NULL; - } - /* - * We should have no dirty inodes left, due to - * mft.c::ntfs_mft_writepage() cleaning all the dirty pages as - * the underlying mft records are written out and cleaned. - */ - ntfs_commit_inode(vol->mft_ino); - write_inode_now(vol->mft_ino, 1); -#endif /* NTFS_RW */ - - iput(vol->mft_ino); - vol->mft_ino = NULL; - - /* Throw away the table of attribute definitions. */ - vol->attrdef_size = 0; - if (vol->attrdef) { - ntfs_free(vol->attrdef); - vol->attrdef = NULL; - } - vol->upcase_len = 0; - /* - * Destroy the global default upcase table if necessary. Also decrease - * the number of upcase users if we are a user. 
- */ - mutex_lock(&ntfs_lock); - if (vol->upcase == default_upcase) { - ntfs_nr_upcase_users--; - vol->upcase = NULL; - } - if (!ntfs_nr_upcase_users && default_upcase) { - ntfs_free(default_upcase); - default_upcase = NULL; - } - if (vol->cluster_size <= 4096 && !--ntfs_nr_compression_users) - free_compression_buffers(); - mutex_unlock(&ntfs_lock); - if (vol->upcase) { - ntfs_free(vol->upcase); - vol->upcase = NULL; - } - - unload_nls(vol->nls_map); - - sb->s_fs_info = NULL; - kfree(vol); -} - -/** - * get_nr_free_clusters - return the number of free clusters on a volume - * @vol: ntfs volume for which to obtain free cluster count - * - * Calculate the number of free clusters on the mounted NTFS volume @vol. We - * actually calculate the number of clusters in use instead because this - * allows us to not care about partial pages as these will be just zero filled - * and hence not be counted as allocated clusters. - * - * The only particularity is that clusters beyond the end of the logical ntfs - * volume will be marked as allocated to prevent errors which means we have to - * discount those at the end. This is important as the cluster bitmap always - * has a size in multiples of 8 bytes, i.e. up to 63 clusters could be outside - * the logical volume and marked in use when they are not as they do not exist. - * - * If any pages cannot be read we assume all clusters in the erroring pages are - * in use. This means we return an underestimate on errors which is better than - * an overestimate. - */ -static s64 get_nr_free_clusters(ntfs_volume *vol) -{ - s64 nr_free = vol->nr_clusters; - struct address_space *mapping = vol->lcnbmp_ino->i_mapping; - struct page *page; - pgoff_t index, max_index; - - ntfs_debug("Entering."); - /* Serialize accesses to the cluster bitmap. */ - down_read(&vol->lcnbmp_lock); - /* - * Convert the number of bits into bytes rounded up, then convert into - * multiples of PAGE_SIZE, rounding up so that if we have one - * full and one partial page max_index = 2. - */ - max_index = (((vol->nr_clusters + 7) >> 3) + PAGE_SIZE - 1) >> - PAGE_SHIFT; - /* Use multiples of 4 bytes, thus max_size is PAGE_SIZE / 4. */ - ntfs_debug("Reading $Bitmap, max_index = 0x%lx, max_size = 0x%lx.", - max_index, PAGE_SIZE / 4); - for (index = 0; index < max_index; index++) { - unsigned long *kaddr; - - /* - * Read the page from page cache, getting it from backing store - * if necessary, and increment the use count. - */ - page = read_mapping_page(mapping, index, NULL); - /* Ignore pages which errored synchronously. */ - if (IS_ERR(page)) { - ntfs_debug("read_mapping_page() error. Skipping " - "page (index 0x%lx).", index); - nr_free -= PAGE_SIZE * 8; - continue; - } - kaddr = kmap_atomic(page); - /* - * Subtract the number of set bits. If this - * is the last page and it is partial we don't really care as - * it just means we do a little extra work but it won't affect - * the result as all out of range bytes are set to zero by - * ntfs_readpage(). - */ - nr_free -= bitmap_weight(kaddr, - PAGE_SIZE * BITS_PER_BYTE); - kunmap_atomic(kaddr); - put_page(page); - } - ntfs_debug("Finished reading $Bitmap, last index = 0x%lx.", index - 1); - /* - * Fixup for eventual bits outside logical ntfs volume (see function - * description above). - */ - if (vol->nr_clusters & 63) - nr_free += 64 - (vol->nr_clusters & 63); - up_read(&vol->lcnbmp_lock); - /* If errors occurred we may well have gone below zero, fix this. 
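 *
 * [Editor's note: a worked example for the fixup above, not part of
 *  the original source. The cluster bitmap is sized in whole 8-byte
 *  words, so a 1000-cluster volume carries 1024 bitmap bits; the 24
 *  bits past the end are written as "in use" and were subtracted by
 *  bitmap_weight(), so 64 - (1000 & 63) = 64 - 40 = 24 clusters are
 *  added back to nr_free.]
 *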
*/ - if (nr_free < 0) - nr_free = 0; - ntfs_debug("Exiting."); - return nr_free; -} - -/** - * __get_nr_free_mft_records - return the number of free inodes on a volume - * @vol: ntfs volume for which to obtain free inode count - * @nr_free: number of mft records in filesystem - * @max_index: maximum number of pages containing set bits - * - * Calculate the number of free mft records (inodes) on the mounted NTFS - * volume @vol. We actually calculate the number of mft records in use instead - * because this allows us to not care about partial pages as these will be just - * zero filled and hence not be counted as allocated mft record. - * - * If any pages cannot be read we assume all mft records in the erroring pages - * are in use. This means we return an underestimate on errors which is better - * than an overestimate. - * - * NOTE: Caller must hold mftbmp_lock rw_semaphore for reading or writing. - */ -static unsigned long __get_nr_free_mft_records(ntfs_volume *vol, - s64 nr_free, const pgoff_t max_index) -{ - struct address_space *mapping = vol->mftbmp_ino->i_mapping; - struct page *page; - pgoff_t index; - - ntfs_debug("Entering."); - /* Use multiples of 4 bytes, thus max_size is PAGE_SIZE / 4. */ - ntfs_debug("Reading $MFT/$BITMAP, max_index = 0x%lx, max_size = " - "0x%lx.", max_index, PAGE_SIZE / 4); - for (index = 0; index < max_index; index++) { - unsigned long *kaddr; - - /* - * Read the page from page cache, getting it from backing store - * if necessary, and increment the use count. - */ - page = read_mapping_page(mapping, index, NULL); - /* Ignore pages which errored synchronously. */ - if (IS_ERR(page)) { - ntfs_debug("read_mapping_page() error. Skipping " - "page (index 0x%lx).", index); - nr_free -= PAGE_SIZE * 8; - continue; - } - kaddr = kmap_atomic(page); - /* - * Subtract the number of set bits. If this - * is the last page and it is partial we don't really care as - * it just means we do a little extra work but it won't affect - * the result as all out of range bytes are set to zero by - * ntfs_readpage(). - */ - nr_free -= bitmap_weight(kaddr, - PAGE_SIZE * BITS_PER_BYTE); - kunmap_atomic(kaddr); - put_page(page); - } - ntfs_debug("Finished reading $MFT/$BITMAP, last index = 0x%lx.", - index - 1); - /* If errors occurred we may well have gone below zero, fix this. */ - if (nr_free < 0) - nr_free = 0; - ntfs_debug("Exiting."); - return nr_free; -} - -/** - * ntfs_statfs - return information about mounted NTFS volume - * @dentry: dentry from mounted volume - * @sfs: statfs structure in which to return the information - * - * Return information about the mounted NTFS volume @dentry in the statfs structure - * pointed to by @sfs (this is initialized with zeros before ntfs_statfs is - * called). We interpret the values to be correct of the moment in time at - * which we are called. Most values are variable otherwise and this isn't just - * the free values but the totals as well. For example we can increase the - * total number of file nodes if we run out and we can keep doing this until - * there is no more space on the volume left at all. - * - * Called from vfs_statfs which is used to handle the statfs, fstatfs, and - * ustat system calls. - * - * Return 0 on success or -errno on error. 
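 *
 * [Editor's note: a worked example for the max_index computation in
 *  ntfs_statfs() below; not part of the original source. With 4kiB
 *  pages and 100,000 initialized mft records: (100000 + 7) >> 3 =
 *  12500 bitmap bytes, and (12500 + 4095) >> 12 = 4 pages, i.e.
 *  three full pages plus the partial fourth still count.]
 *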
- */ -static int ntfs_statfs(struct dentry *dentry, struct kstatfs *sfs) -{ - struct super_block *sb = dentry->d_sb; - s64 size; - ntfs_volume *vol = NTFS_SB(sb); - ntfs_inode *mft_ni = NTFS_I(vol->mft_ino); - pgoff_t max_index; - unsigned long flags; - - ntfs_debug("Entering."); - /* Type of filesystem. */ - sfs->f_type = NTFS_SB_MAGIC; - /* Optimal transfer block size. */ - sfs->f_bsize = PAGE_SIZE; - /* - * Total data blocks in filesystem in units of f_bsize and since - * inodes are also stored in data blocs ($MFT is a file) this is just - * the total clusters. - */ - sfs->f_blocks = vol->nr_clusters << vol->cluster_size_bits >> - PAGE_SHIFT; - /* Free data blocks in filesystem in units of f_bsize. */ - size = get_nr_free_clusters(vol) << vol->cluster_size_bits >> - PAGE_SHIFT; - if (size < 0LL) - size = 0LL; - /* Free blocks avail to non-superuser, same as above on NTFS. */ - sfs->f_bavail = sfs->f_bfree = size; - /* Serialize accesses to the inode bitmap. */ - down_read(&vol->mftbmp_lock); - read_lock_irqsave(&mft_ni->size_lock, flags); - size = i_size_read(vol->mft_ino) >> vol->mft_record_size_bits; - /* - * Convert the maximum number of set bits into bytes rounded up, then - * convert into multiples of PAGE_SIZE, rounding up so that if we - * have one full and one partial page max_index = 2. - */ - max_index = ((((mft_ni->initialized_size >> vol->mft_record_size_bits) - + 7) >> 3) + PAGE_SIZE - 1) >> PAGE_SHIFT; - read_unlock_irqrestore(&mft_ni->size_lock, flags); - /* Number of inodes in filesystem (at this point in time). */ - sfs->f_files = size; - /* Free inodes in fs (based on current total count). */ - sfs->f_ffree = __get_nr_free_mft_records(vol, size, max_index); - up_read(&vol->mftbmp_lock); - /* - * File system id. This is extremely *nix flavour dependent and even - * within Linux itself all fs do their own thing. I interpret this to - * mean a unique id associated with the mounted fs and not the id - * associated with the filesystem driver, the latter is already given - * by the filesystem type in sfs->f_type. Thus we use the 64-bit - * volume serial number splitting it into two 32-bit parts. We enter - * the least significant 32-bits in f_fsid[0] and the most significant - * 32-bits in f_fsid[1]. - */ - sfs->f_fsid = u64_to_fsid(vol->serial_no); - /* Maximum length of filenames. */ - sfs->f_namelen = NTFS_MAX_NAME_LEN; - return 0; -} - -#ifdef NTFS_RW -static int ntfs_write_inode(struct inode *vi, struct writeback_control *wbc) -{ - return __ntfs_write_inode(vi, wbc->sync_mode == WB_SYNC_ALL); -} -#endif - -/* - * The complete super operations. - */ -static const struct super_operations ntfs_sops = { - .alloc_inode = ntfs_alloc_big_inode, /* VFS: Allocate new inode. */ - .free_inode = ntfs_free_big_inode, /* VFS: Deallocate inode. */ -#ifdef NTFS_RW - .write_inode = ntfs_write_inode, /* VFS: Write dirty inode to - disk. */ -#endif /* NTFS_RW */ - .put_super = ntfs_put_super, /* Syscall: umount. */ - .statfs = ntfs_statfs, /* Syscall: statfs */ - .remount_fs = ntfs_remount, /* Syscall: mount -o remount. */ - .evict_inode = ntfs_evict_big_inode, /* VFS: Called when an inode is - removed from memory. */ - .show_options = ntfs_show_options, /* Show mount options in - proc. 
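 *
 * [Editor's note: a worked example for the unit conversion in
 *  ntfs_statfs() above; not part of the original source. f_bsize is
 *  PAGE_SIZE, so cluster counts are rescaled to pages: with 4kiB
 *  pages and 4kiB clusters the two shifts cancel and f_blocks equals
 *  nr_clusters, while 512-byte clusters give f_blocks =
 *  nr_clusters << 9 >> 12 = nr_clusters / 8.]
 *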
*/ -}; - -/** - * ntfs_fill_super - mount an ntfs filesystem - * @sb: super block of ntfs filesystem to mount - * @opt: string containing the mount options - * @silent: silence error output - * - * ntfs_fill_super() is called by the VFS to mount the device described by @sb - * with the mount otions in @data with the NTFS filesystem. - * - * If @silent is true, remain silent even if errors are detected. This is used - * during bootup, when the kernel tries to mount the root filesystem with all - * registered filesystems one after the other until one succeeds. This implies - * that all filesystems except the correct one will quite correctly and - * expectedly return an error, but nobody wants to see error messages when in - * fact this is what is supposed to happen. - * - * NOTE: @sb->s_flags contains the mount options flags. - */ -static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent) -{ - ntfs_volume *vol; - struct buffer_head *bh; - struct inode *tmp_ino; - int blocksize, result; - - /* - * We do a pretty difficult piece of bootstrap by reading the - * MFT (and other metadata) from disk into memory. We'll only - * release this metadata during umount, so the locking patterns - * observed during bootstrap do not count. So turn off the - * observation of locking patterns (strictly for this context - * only) while mounting NTFS. [The validator is still active - * otherwise, even for this context: it will for example record - * lock class registrations.] - */ - lockdep_off(); - ntfs_debug("Entering."); -#ifndef NTFS_RW - sb->s_flags |= SB_RDONLY; -#endif /* ! NTFS_RW */ - /* Allocate a new ntfs_volume and place it in sb->s_fs_info. */ - sb->s_fs_info = kmalloc(sizeof(ntfs_volume), GFP_NOFS); - vol = NTFS_SB(sb); - if (!vol) { - if (!silent) - ntfs_error(sb, "Allocation of NTFS volume structure " - "failed. Aborting mount..."); - lockdep_on(); - return -ENOMEM; - } - /* Initialize ntfs_volume structure. */ - *vol = (ntfs_volume) { - .sb = sb, - /* - * Default is group and other don't have any access to files or - * directories while owner has full access. Further, files by - * default are not executable but directories are of course - * browseable. - */ - .fmask = 0177, - .dmask = 0077, - }; - init_rwsem(&vol->mftbmp_lock); - init_rwsem(&vol->lcnbmp_lock); - - /* By default, enable sparse support. */ - NVolSetSparseEnabled(vol); - - /* Important to get the mount options dealt with now. */ - if (!parse_options(vol, (char*)opt)) - goto err_out_now; - - /* We support sector sizes up to the PAGE_SIZE. */ - if (bdev_logical_block_size(sb->s_bdev) > PAGE_SIZE) { - if (!silent) - ntfs_error(sb, "Device has unsupported sector size " - "(%i). The maximum supported sector " - "size on this architecture is %lu " - "bytes.", - bdev_logical_block_size(sb->s_bdev), - PAGE_SIZE); - goto err_out_now; - } - /* - * Setup the device access block size to NTFS_BLOCK_SIZE or the hard - * sector size, whichever is bigger. - */ - blocksize = sb_min_blocksize(sb, NTFS_BLOCK_SIZE); - if (blocksize < NTFS_BLOCK_SIZE) { - if (!silent) - ntfs_error(sb, "Unable to set device block size."); - goto err_out_now; - } - BUG_ON(blocksize != sb->s_blocksize); - ntfs_debug("Set device block size to %i bytes (block size bits %i).", - blocksize, sb->s_blocksize_bits); - /* Determine the size of the device in units of block_size bytes. 
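 *
 * [Editor's note: a summary sketch of the block-size negotiation
 *  performed here and again after boot-sector parsing below; not
 *  part of the original source.]
 *
 *	blocksize = sb_min_blocksize(sb, NTFS_BLOCK_SIZE);
 *	// now >= NTFS_BLOCK_SIZE (512) and >= the device block size
 *
 *	// ... read and parse the boot sector ...
 *
 *	if (vol->sector_size > blocksize) {
 *		blocksize = sb_set_blocksize(sb, vol->sector_size);
 *		vol->nr_blocks = sb_bdev_nr_blocks(sb);	// new units
 *	}
 *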
*/ - vol->nr_blocks = sb_bdev_nr_blocks(sb); - if (!vol->nr_blocks) { - if (!silent) - ntfs_error(sb, "Unable to determine device size."); - goto err_out_now; - } - /* Read the boot sector and return unlocked buffer head to it. */ - if (!(bh = read_ntfs_boot_sector(sb, silent))) { - if (!silent) - ntfs_error(sb, "Not an NTFS volume."); - goto err_out_now; - } - /* - * Extract the data from the boot sector and setup the ntfs volume - * using it. - */ - result = parse_ntfs_boot_sector(vol, (NTFS_BOOT_SECTOR*)bh->b_data); - brelse(bh); - if (!result) { - if (!silent) - ntfs_error(sb, "Unsupported NTFS filesystem."); - goto err_out_now; - } - /* - * If the boot sector indicates a sector size bigger than the current - * device block size, switch the device block size to the sector size. - * TODO: It may be possible to support this case even when the set - * below fails, we would just be breaking up the i/o for each sector - * into multiple blocks for i/o purposes but otherwise it should just - * work. However it is safer to leave disabled until someone hits this - * error message and then we can get them to try it without the setting - * so we know for sure that it works. - */ - if (vol->sector_size > blocksize) { - blocksize = sb_set_blocksize(sb, vol->sector_size); - if (blocksize != vol->sector_size) { - if (!silent) - ntfs_error(sb, "Unable to set device block " - "size to sector size (%i).", - vol->sector_size); - goto err_out_now; - } - BUG_ON(blocksize != sb->s_blocksize); - vol->nr_blocks = sb_bdev_nr_blocks(sb); - ntfs_debug("Changed device block size to %i bytes (block size " - "bits %i) to match volume sector size.", - blocksize, sb->s_blocksize_bits); - } - /* Initialize the cluster and mft allocators. */ - ntfs_setup_allocators(vol); - /* Setup remaining fields in the super block. */ - sb->s_magic = NTFS_SB_MAGIC; - /* - * Ntfs allows 63 bits for the file size, i.e. correct would be: - * sb->s_maxbytes = ~0ULL >> 1; - * But the kernel uses a long as the page cache page index which on - * 32-bit architectures is only 32-bits. MAX_LFS_FILESIZE is kernel - * defined to the maximum the page cache page index can cope with - * without overflowing the index or to 2^63 - 1, whichever is smaller. - */ - sb->s_maxbytes = MAX_LFS_FILESIZE; - /* Ntfs measures time in 100ns intervals. */ - sb->s_time_gran = 100; - /* - * Now load the metadata required for the page cache and our address - * space operations to function. We do this by setting up a specialised - * read_inode method and then just calling the normal iget() to obtain - * the inode for $MFT which is sufficient to allow our normal inode - * operations and associated address space operations to function. - */ - sb->s_op = &ntfs_sops; - tmp_ino = new_inode(sb); - if (!tmp_ino) { - if (!silent) - ntfs_error(sb, "Failed to load essential metadata."); - goto err_out_now; - } - tmp_ino->i_ino = FILE_MFT; - insert_inode_hash(tmp_ino); - if (ntfs_read_inode_mount(tmp_ino) < 0) { - if (!silent) - ntfs_error(sb, "Failed to load essential metadata."); - goto iput_tmp_ino_err_out_now; - } - mutex_lock(&ntfs_lock); - /* - * The current mount is a compression user if the cluster size is - * less than or equal 4kiB. 
- */ - if (vol->cluster_size <= 4096 && !ntfs_nr_compression_users++) { - result = allocate_compression_buffers(); - if (result) { - ntfs_error(NULL, "Failed to allocate buffers " - "for compression engine."); - ntfs_nr_compression_users--; - mutex_unlock(&ntfs_lock); - goto iput_tmp_ino_err_out_now; - } - } - /* - * Generate the global default upcase table if necessary. Also - * temporarily increment the number of upcase users to avoid race - * conditions with concurrent (u)mounts. - */ - if (!default_upcase) - default_upcase = generate_default_upcase(); - ntfs_nr_upcase_users++; - mutex_unlock(&ntfs_lock); - /* - * From now on, ignore @silent parameter. If we fail below this line, - * it will be due to a corrupt fs or a system error, so we report it. - */ - /* - * Open the system files with normal access functions and complete - * setting up the ntfs super block. - */ - if (!load_system_files(vol)) { - ntfs_error(sb, "Failed to load system files."); - goto unl_upcase_iput_tmp_ino_err_out_now; - } - - /* We grab a reference, simulating an ntfs_iget(). */ - ihold(vol->root_ino); - if ((sb->s_root = d_make_root(vol->root_ino))) { - ntfs_debug("Exiting, status successful."); - /* Release the default upcase if it has no users. */ - mutex_lock(&ntfs_lock); - if (!--ntfs_nr_upcase_users && default_upcase) { - ntfs_free(default_upcase); - default_upcase = NULL; - } - mutex_unlock(&ntfs_lock); - sb->s_export_op = &ntfs_export_ops; - lockdep_on(); - return 0; - } - ntfs_error(sb, "Failed to allocate root directory."); - /* Clean up after the successful load_system_files() call from above. */ - // TODO: Use ntfs_put_super() instead of repeating all this code... - // FIXME: Should mark the volume clean as the error is most likely - // -ENOMEM. - iput(vol->vol_ino); - vol->vol_ino = NULL; - /* NTFS 3.0+ specific clean up. */ - if (vol->major_ver >= 3) { -#ifdef NTFS_RW - if (vol->usnjrnl_j_ino) { - iput(vol->usnjrnl_j_ino); - vol->usnjrnl_j_ino = NULL; - } - if (vol->usnjrnl_max_ino) { - iput(vol->usnjrnl_max_ino); - vol->usnjrnl_max_ino = NULL; - } - if (vol->usnjrnl_ino) { - iput(vol->usnjrnl_ino); - vol->usnjrnl_ino = NULL; - } - if (vol->quota_q_ino) { - iput(vol->quota_q_ino); - vol->quota_q_ino = NULL; - } - if (vol->quota_ino) { - iput(vol->quota_ino); - vol->quota_ino = NULL; - } -#endif /* NTFS_RW */ - if (vol->extend_ino) { - iput(vol->extend_ino); - vol->extend_ino = NULL; - } - if (vol->secure_ino) { - iput(vol->secure_ino); - vol->secure_ino = NULL; - } - } - iput(vol->root_ino); - vol->root_ino = NULL; - iput(vol->lcnbmp_ino); - vol->lcnbmp_ino = NULL; - iput(vol->mftbmp_ino); - vol->mftbmp_ino = NULL; -#ifdef NTFS_RW - if (vol->logfile_ino) { - iput(vol->logfile_ino); - vol->logfile_ino = NULL; - } - if (vol->mftmirr_ino) { - iput(vol->mftmirr_ino); - vol->mftmirr_ino = NULL; - } -#endif /* NTFS_RW */ - /* Throw away the table of attribute definitions. */ - vol->attrdef_size = 0; - if (vol->attrdef) { - ntfs_free(vol->attrdef); - vol->attrdef = NULL; - } - vol->upcase_len = 0; - mutex_lock(&ntfs_lock); - if (vol->upcase == default_upcase) { - ntfs_nr_upcase_users--; - vol->upcase = NULL; - } - mutex_unlock(&ntfs_lock); - if (vol->upcase) { - ntfs_free(vol->upcase); - vol->upcase = NULL; - } - if (vol->nls_map) { - unload_nls(vol->nls_map); - vol->nls_map = NULL; - } - /* Error exit code path. */ -unl_upcase_iput_tmp_ino_err_out_now: - /* - * Decrease the number of upcase users and destroy the global default - * upcase table if necessary. 
- */ - mutex_lock(&ntfs_lock); - if (!--ntfs_nr_upcase_users && default_upcase) { - ntfs_free(default_upcase); - default_upcase = NULL; - } - if (vol->cluster_size <= 4096 && !--ntfs_nr_compression_users) - free_compression_buffers(); - mutex_unlock(&ntfs_lock); -iput_tmp_ino_err_out_now: - iput(tmp_ino); - if (vol->mft_ino && vol->mft_ino != tmp_ino) - iput(vol->mft_ino); - vol->mft_ino = NULL; - /* Errors at this stage are irrelevant. */ -err_out_now: - sb->s_fs_info = NULL; - kfree(vol); - ntfs_debug("Failed, returning -EINVAL."); - lockdep_on(); - return -EINVAL; -} - -/* - * This is a slab cache to optimize allocations and deallocations of Unicode - * strings of the maximum length allowed by NTFS, which is NTFS_MAX_NAME_LEN - * (255) Unicode characters + a terminating NULL Unicode character. - */ -struct kmem_cache *ntfs_name_cache; - -/* Slab caches for efficient allocation/deallocation of inodes. */ -struct kmem_cache *ntfs_inode_cache; -struct kmem_cache *ntfs_big_inode_cache; - -/* Init once constructor for the inode slab cache. */ -static void ntfs_big_inode_init_once(void *foo) -{ - ntfs_inode *ni = (ntfs_inode *)foo; - - inode_init_once(VFS_I(ni)); -} - -/* - * Slab caches to optimize allocations and deallocations of attribute search - * contexts and index contexts, respectively. - */ -struct kmem_cache *ntfs_attr_ctx_cache; -struct kmem_cache *ntfs_index_ctx_cache; - -/* Driver wide mutex. */ -DEFINE_MUTEX(ntfs_lock); - -static struct dentry *ntfs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) -{ - return mount_bdev(fs_type, flags, dev_name, data, ntfs_fill_super); -} - -static struct file_system_type ntfs_fs_type = { - .owner = THIS_MODULE, - .name = "ntfs", - .mount = ntfs_mount, - .kill_sb = kill_block_super, - .fs_flags = FS_REQUIRES_DEV, -}; -MODULE_ALIAS_FS("ntfs"); - -/* Stable names for the slab caches. */ -static const char ntfs_index_ctx_cache_name[] = "ntfs_index_ctx_cache"; -static const char ntfs_attr_ctx_cache_name[] = "ntfs_attr_ctx_cache"; -static const char ntfs_name_cache_name[] = "ntfs_name_cache"; -static const char ntfs_inode_cache_name[] = "ntfs_inode_cache"; -static const char ntfs_big_inode_cache_name[] = "ntfs_big_inode_cache"; - -static int __init init_ntfs_fs(void) -{ - int err = 0; - - /* This may be ugly but it results in pretty output so who cares. 
(-8 */ - pr_info("driver " NTFS_VERSION " [Flags: R/" -#ifdef NTFS_RW - "W" -#else - "O" -#endif -#ifdef DEBUG - " DEBUG" -#endif -#ifdef MODULE - " MODULE" -#endif - "].\n"); - - ntfs_debug("Debug messages are enabled."); - - ntfs_index_ctx_cache = kmem_cache_create(ntfs_index_ctx_cache_name, - sizeof(ntfs_index_context), 0 /* offset */, - SLAB_HWCACHE_ALIGN, NULL /* ctor */); - if (!ntfs_index_ctx_cache) { - pr_crit("Failed to create %s!\n", ntfs_index_ctx_cache_name); - goto ictx_err_out; - } - ntfs_attr_ctx_cache = kmem_cache_create(ntfs_attr_ctx_cache_name, - sizeof(ntfs_attr_search_ctx), 0 /* offset */, - SLAB_HWCACHE_ALIGN, NULL /* ctor */); - if (!ntfs_attr_ctx_cache) { - pr_crit("NTFS: Failed to create %s!\n", - ntfs_attr_ctx_cache_name); - goto actx_err_out; - } - - ntfs_name_cache = kmem_cache_create(ntfs_name_cache_name, - (NTFS_MAX_NAME_LEN+1) * sizeof(ntfschar), 0, - SLAB_HWCACHE_ALIGN, NULL); - if (!ntfs_name_cache) { - pr_crit("Failed to create %s!\n", ntfs_name_cache_name); - goto name_err_out; - } - - ntfs_inode_cache = kmem_cache_create(ntfs_inode_cache_name, - sizeof(ntfs_inode), 0, - SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL); - if (!ntfs_inode_cache) { - pr_crit("Failed to create %s!\n", ntfs_inode_cache_name); - goto inode_err_out; - } - - ntfs_big_inode_cache = kmem_cache_create(ntfs_big_inode_cache_name, - sizeof(big_ntfs_inode), 0, - SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD| - SLAB_ACCOUNT, ntfs_big_inode_init_once); - if (!ntfs_big_inode_cache) { - pr_crit("Failed to create %s!\n", ntfs_big_inode_cache_name); - goto big_inode_err_out; - } - - /* Register the ntfs sysctls. */ - err = ntfs_sysctl(1); - if (err) { - pr_crit("Failed to register NTFS sysctls!\n"); - goto sysctl_err_out; - } - - err = register_filesystem(&ntfs_fs_type); - if (!err) { - ntfs_debug("NTFS driver registered successfully."); - return 0; /* Success! */ - } - pr_crit("Failed to register NTFS filesystem driver!\n"); - - /* Unregister the ntfs sysctls. */ - ntfs_sysctl(0); -sysctl_err_out: - kmem_cache_destroy(ntfs_big_inode_cache); -big_inode_err_out: - kmem_cache_destroy(ntfs_inode_cache); -inode_err_out: - kmem_cache_destroy(ntfs_name_cache); -name_err_out: - kmem_cache_destroy(ntfs_attr_ctx_cache); -actx_err_out: - kmem_cache_destroy(ntfs_index_ctx_cache); -ictx_err_out: - if (!err) { - pr_crit("Aborting NTFS filesystem driver registration...\n"); - err = -ENOMEM; - } - return err; -} - -static void __exit exit_ntfs_fs(void) -{ - ntfs_debug("Unregistering NTFS driver."); - - unregister_filesystem(&ntfs_fs_type); - - /* - * Make sure all delayed rcu free inodes are flushed before we - * destroy cache. - */ - rcu_barrier(); - kmem_cache_destroy(ntfs_big_inode_cache); - kmem_cache_destroy(ntfs_inode_cache); - kmem_cache_destroy(ntfs_name_cache); - kmem_cache_destroy(ntfs_attr_ctx_cache); - kmem_cache_destroy(ntfs_index_ctx_cache); - /* Unregister the ntfs sysctls. 
*/ - ntfs_sysctl(0); -} - -MODULE_AUTHOR("Anton Altaparmakov "); -MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc."); -MODULE_VERSION(NTFS_VERSION); -MODULE_LICENSE("GPL"); -#ifdef DEBUG -module_param(debug_msgs, bint, 0); -MODULE_PARM_DESC(debug_msgs, "Enable debug messages."); -#endif - -module_init(init_ntfs_fs) -module_exit(exit_ntfs_fs) diff --git a/fs/ntfs/sysctl.c b/fs/ntfs/sysctl.c deleted file mode 100644 index 4e980170d86a4..0000000000000 --- a/fs/ntfs/sysctl.c +++ /dev/null @@ -1,58 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * sysctl.c - Code for sysctl handling in NTFS Linux kernel driver. Part of - * the Linux-NTFS project. Adapted from the old NTFS driver, - * Copyright (C) 1997 Martin von Löwis, Régis Duchesne - * - * Copyright (c) 2002-2005 Anton Altaparmakov - */ - -#ifdef DEBUG - -#include - -#ifdef CONFIG_SYSCTL - -#include -#include - -#include "sysctl.h" -#include "debug.h" - -/* Definition of the ntfs sysctl. */ -static struct ctl_table ntfs_sysctls[] = { - { - .procname = "ntfs-debug", - .data = &debug_msgs, /* Data pointer and size. */ - .maxlen = sizeof(debug_msgs), - .mode = 0644, /* Mode, proc handler. */ - .proc_handler = proc_dointvec - }, -}; - -/* Storage for the sysctls header. */ -static struct ctl_table_header *sysctls_root_table; - -/** - * ntfs_sysctl - add or remove the debug sysctl - * @add: add (1) or remove (0) the sysctl - * - * Add or remove the debug sysctl. Return 0 on success or -errno on error. - */ -int ntfs_sysctl(int add) -{ - if (add) { - BUG_ON(sysctls_root_table); - sysctls_root_table = register_sysctl("fs", ntfs_sysctls); - if (!sysctls_root_table) - return -ENOMEM; - } else { - BUG_ON(!sysctls_root_table); - unregister_sysctl_table(sysctls_root_table); - sysctls_root_table = NULL; - } - return 0; -} - -#endif /* CONFIG_SYSCTL */ -#endif /* DEBUG */ diff --git a/fs/ntfs/sysctl.h b/fs/ntfs/sysctl.h deleted file mode 100644 index 96bb2299d2d55..0000000000000 --- a/fs/ntfs/sysctl.h +++ /dev/null @@ -1,27 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * sysctl.h - Defines for sysctl handling in NTFS Linux kernel driver. Part of - * the Linux-NTFS project. Adapted from the old NTFS driver, - * Copyright (C) 1997 Martin von Löwis, Régis Duchesne - * - * Copyright (c) 2002-2004 Anton Altaparmakov - */ - -#ifndef _LINUX_NTFS_SYSCTL_H -#define _LINUX_NTFS_SYSCTL_H - - -#if defined(DEBUG) && defined(CONFIG_SYSCTL) - -extern int ntfs_sysctl(int add); - -#else - -/* Just return success. */ -static inline int ntfs_sysctl(int add) -{ - return 0; -} - -#endif /* DEBUG && CONFIG_SYSCTL */ -#endif /* _LINUX_NTFS_SYSCTL_H */ diff --git a/fs/ntfs/time.h b/fs/ntfs/time.h deleted file mode 100644 index 6b63261300ccb..0000000000000 --- a/fs/ntfs/time.h +++ /dev/null @@ -1,89 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * time.h - NTFS time conversion functions. Part of the Linux-NTFS project. - * - * Copyright (c) 2001-2005 Anton Altaparmakov - */ - -#ifndef _LINUX_NTFS_TIME_H -#define _LINUX_NTFS_TIME_H - -#include /* For current_kernel_time(). */ -#include /* For do_div(). */ - -#include "endian.h" - -#define NTFS_TIME_OFFSET ((s64)(369 * 365 + 89) * 24 * 3600 * 10000000) - -/** - * utc2ntfs - convert Linux UTC time to NTFS time - * @ts: Linux UTC time to convert to NTFS time - * - * Convert the Linux UTC time @ts to its corresponding NTFS time and return - * that in little endian format. 
- * - * Linux stores time in a struct timespec64 consisting of a time64_t tv_sec - * and a long tv_nsec where tv_sec is the number of 1-second intervals since - * 1st January 1970, 00:00:00 UTC and tv_nsec is the number of 1-nano-second - * intervals since the value of tv_sec. - * - * NTFS uses Microsoft's standard time format which is stored in a s64 and is - * measured as the number of 100-nano-second intervals since 1st January 1601, - * 00:00:00 UTC. - */ -static inline sle64 utc2ntfs(const struct timespec64 ts) -{ - /* - * Convert the seconds to 100ns intervals, add the nano-seconds - * converted to 100ns intervals, and then add the NTFS time offset. - */ - return cpu_to_sle64((s64)ts.tv_sec * 10000000 + ts.tv_nsec / 100 + - NTFS_TIME_OFFSET); -} - -/** - * get_current_ntfs_time - get the current time in little endian NTFS format - * - * Get the current time from the Linux kernel, convert it to its corresponding - * NTFS time and return that in little endian format. - */ -static inline sle64 get_current_ntfs_time(void) -{ - struct timespec64 ts; - - ktime_get_coarse_real_ts64(&ts); - return utc2ntfs(ts); -} - -/** - * ntfs2utc - convert NTFS time to Linux time - * @time: NTFS time (little endian) to convert to Linux UTC - * - * Convert the little endian NTFS time @time to its corresponding Linux UTC - * time and return that in cpu format. - * - * Linux stores time in a struct timespec64 consisting of a time64_t tv_sec - * and a long tv_nsec where tv_sec is the number of 1-second intervals since - * 1st January 1970, 00:00:00 UTC and tv_nsec is the number of 1-nano-second - * intervals since the value of tv_sec. - * - * NTFS uses Microsoft's standard time format which is stored in a s64 and is - * measured as the number of 100 nano-second intervals since 1st January 1601, - * 00:00:00 UTC. - */ -static inline struct timespec64 ntfs2utc(const sle64 time) -{ - struct timespec64 ts; - - /* Subtract the NTFS time offset. */ - u64 t = (u64)(sle64_to_cpu(time) - NTFS_TIME_OFFSET); - /* - * Convert the time to 1-second intervals and the remainder to - * 1-nano-second intervals. - */ - ts.tv_nsec = do_div(t, 10000000) * 100; - ts.tv_sec = t; - return ts; -} - -#endif /* _LINUX_NTFS_TIME_H */ diff --git a/fs/ntfs/types.h b/fs/ntfs/types.h deleted file mode 100644 index 9a47859e7a065..0000000000000 --- a/fs/ntfs/types.h +++ /dev/null @@ -1,55 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * types.h - Defines for NTFS Linux kernel driver specific types. - * Part of the Linux-NTFS project. - * - * Copyright (c) 2001-2005 Anton Altaparmakov - */ - -#ifndef _LINUX_NTFS_TYPES_H -#define _LINUX_NTFS_TYPES_H - -#include - -typedef __le16 le16; -typedef __le32 le32; -typedef __le64 le64; -typedef __u16 __bitwise sle16; -typedef __u32 __bitwise sle32; -typedef __u64 __bitwise sle64; - -/* 2-byte Unicode character type. */ -typedef le16 ntfschar; -#define UCHAR_T_SIZE_BITS 1 - -/* - * Clusters are signed 64-bit values on NTFS volumes. We define two types, LCN - * and VCN, to allow for type checking and better code readability. - */ -typedef s64 VCN; -typedef sle64 leVCN; -typedef s64 LCN; -typedef sle64 leLCN; - -/* - * The NTFS journal $LogFile uses log sequence numbers which are signed 64-bit - * values. We define our own type LSN, to allow for type checking and better - * code readability. - */ -typedef s64 LSN; -typedef sle64 leLSN; - -/* - * The NTFS transaction log $UsnJrnl uses usn which are signed 64-bit values. 
- * We define our own type USN, to allow for type checking and better code - * readability. - */ -typedef s64 USN; -typedef sle64 leUSN; - -typedef enum { - CASE_SENSITIVE = 0, - IGNORE_CASE = 1, -} IGNORE_CASE_BOOL; - -#endif /* _LINUX_NTFS_TYPES_H */ diff --git a/fs/ntfs/unistr.c b/fs/ntfs/unistr.c deleted file mode 100644 index a6b6c64f14a93..0000000000000 --- a/fs/ntfs/unistr.c +++ /dev/null @@ -1,384 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * unistr.c - NTFS Unicode string handling. Part of the Linux-NTFS project. - * - * Copyright (c) 2001-2006 Anton Altaparmakov - */ - -#include - -#include "types.h" -#include "debug.h" -#include "ntfs.h" - -/* - * IMPORTANT - * ========= - * - * All these routines assume that the Unicode characters are in little endian - * encoding inside the strings!!! - */ - -/* - * This is used by the name collation functions to quickly determine what - * characters are (in)valid. - */ -static const u8 legal_ansi_char_array[0x40] = { - 0x00, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, - 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, - - 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, - 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, - - 0x17, 0x07, 0x18, 0x17, 0x17, 0x17, 0x17, 0x17, - 0x17, 0x17, 0x18, 0x16, 0x16, 0x17, 0x07, 0x00, - - 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, - 0x17, 0x17, 0x04, 0x16, 0x18, 0x16, 0x18, 0x18, -}; - -/** - * ntfs_are_names_equal - compare two Unicode names for equality - * @s1: name to compare to @s2 - * @s1_len: length in Unicode characters of @s1 - * @s2: name to compare to @s1 - * @s2_len: length in Unicode characters of @s2 - * @ic: ignore case bool - * @upcase: upcase table (only if @ic == IGNORE_CASE) - * @upcase_size: length in Unicode characters of @upcase (if present) - * - * Compare the names @s1 and @s2 and return 'true' (1) if the names are - * identical, or 'false' (0) if they are not identical. If @ic is IGNORE_CASE, - * the @upcase table is used to performa a case insensitive comparison. - */ -bool ntfs_are_names_equal(const ntfschar *s1, size_t s1_len, - const ntfschar *s2, size_t s2_len, const IGNORE_CASE_BOOL ic, - const ntfschar *upcase, const u32 upcase_size) -{ - if (s1_len != s2_len) - return false; - if (ic == CASE_SENSITIVE) - return !ntfs_ucsncmp(s1, s2, s1_len); - return !ntfs_ucsncasecmp(s1, s2, s1_len, upcase, upcase_size); -} - -/** - * ntfs_collate_names - collate two Unicode names - * @name1: first Unicode name to compare - * @name2: second Unicode name to compare - * @err_val: if @name1 contains an invalid character return this value - * @ic: either CASE_SENSITIVE or IGNORE_CASE - * @upcase: upcase table (ignored if @ic is CASE_SENSITIVE) - * @upcase_len: upcase table size (ignored if @ic is CASE_SENSITIVE) - * - * ntfs_collate_names collates two Unicode names and returns: - * - * -1 if the first name collates before the second one, - * 0 if the names match, - * 1 if the second name collates before the first one, or - * @err_val if an invalid character is found in @name1 during the comparison. - * - * The following characters are considered invalid: '"', '*', '<', '>' and '?'. 
- */ -int ntfs_collate_names(const ntfschar *name1, const u32 name1_len, - const ntfschar *name2, const u32 name2_len, - const int err_val, const IGNORE_CASE_BOOL ic, - const ntfschar *upcase, const u32 upcase_len) -{ - u32 cnt, min_len; - u16 c1, c2; - - min_len = name1_len; - if (name1_len > name2_len) - min_len = name2_len; - for (cnt = 0; cnt < min_len; ++cnt) { - c1 = le16_to_cpu(*name1++); - c2 = le16_to_cpu(*name2++); - if (ic) { - if (c1 < upcase_len) - c1 = le16_to_cpu(upcase[c1]); - if (c2 < upcase_len) - c2 = le16_to_cpu(upcase[c2]); - } - if (c1 < 64 && legal_ansi_char_array[c1] & 8) - return err_val; - if (c1 < c2) - return -1; - if (c1 > c2) - return 1; - } - if (name1_len < name2_len) - return -1; - if (name1_len == name2_len) - return 0; - /* name1_len > name2_len */ - c1 = le16_to_cpu(*name1); - if (c1 < 64 && legal_ansi_char_array[c1] & 8) - return err_val; - return 1; -} - -/** - * ntfs_ucsncmp - compare two little endian Unicode strings - * @s1: first string - * @s2: second string - * @n: maximum unicode characters to compare - * - * Compare the first @n characters of the Unicode strings @s1 and @s2, - * The strings in little endian format and appropriate le16_to_cpu() - * conversion is performed on non-little endian machines. - * - * The function returns an integer less than, equal to, or greater than zero - * if @s1 (or the first @n Unicode characters thereof) is found, respectively, - * to be less than, to match, or be greater than @s2. - */ -int ntfs_ucsncmp(const ntfschar *s1, const ntfschar *s2, size_t n) -{ - u16 c1, c2; - size_t i; - - for (i = 0; i < n; ++i) { - c1 = le16_to_cpu(s1[i]); - c2 = le16_to_cpu(s2[i]); - if (c1 < c2) - return -1; - if (c1 > c2) - return 1; - if (!c1) - break; - } - return 0; -} - -/** - * ntfs_ucsncasecmp - compare two little endian Unicode strings, ignoring case - * @s1: first string - * @s2: second string - * @n: maximum unicode characters to compare - * @upcase: upcase table - * @upcase_size: upcase table size in Unicode characters - * - * Compare the first @n characters of the Unicode strings @s1 and @s2, - * ignoring case. The strings in little endian format and appropriate - * le16_to_cpu() conversion is performed on non-little endian machines. - * - * Each character is uppercased using the @upcase table before the comparison. - * - * The function returns an integer less than, equal to, or greater than zero - * if @s1 (or the first @n Unicode characters thereof) is found, respectively, - * to be less than, to match, or be greater than @s2. 
- */ -int ntfs_ucsncasecmp(const ntfschar *s1, const ntfschar *s2, size_t n, - const ntfschar *upcase, const u32 upcase_size) -{ - size_t i; - u16 c1, c2; - - for (i = 0; i < n; ++i) { - if ((c1 = le16_to_cpu(s1[i])) < upcase_size) - c1 = le16_to_cpu(upcase[c1]); - if ((c2 = le16_to_cpu(s2[i])) < upcase_size) - c2 = le16_to_cpu(upcase[c2]); - if (c1 < c2) - return -1; - if (c1 > c2) - return 1; - if (!c1) - break; - } - return 0; -} - -void ntfs_upcase_name(ntfschar *name, u32 name_len, const ntfschar *upcase, - const u32 upcase_len) -{ - u32 i; - u16 u; - - for (i = 0; i < name_len; i++) - if ((u = le16_to_cpu(name[i])) < upcase_len) - name[i] = upcase[u]; -} - -void ntfs_file_upcase_value(FILE_NAME_ATTR *file_name_attr, - const ntfschar *upcase, const u32 upcase_len) -{ - ntfs_upcase_name((ntfschar*)&file_name_attr->file_name, - file_name_attr->file_name_length, upcase, upcase_len); -} - -int ntfs_file_compare_values(FILE_NAME_ATTR *file_name_attr1, - FILE_NAME_ATTR *file_name_attr2, - const int err_val, const IGNORE_CASE_BOOL ic, - const ntfschar *upcase, const u32 upcase_len) -{ - return ntfs_collate_names((ntfschar*)&file_name_attr1->file_name, - file_name_attr1->file_name_length, - (ntfschar*)&file_name_attr2->file_name, - file_name_attr2->file_name_length, - err_val, ic, upcase, upcase_len); -} - -/** - * ntfs_nlstoucs - convert NLS string to little endian Unicode string - * @vol: ntfs volume which we are working with - * @ins: input NLS string buffer - * @ins_len: length of input string in bytes - * @outs: on return contains the allocated output Unicode string buffer - * - * Convert the input string @ins, which is in whatever format the loaded NLS - * map dictates, into a little endian, 2-byte Unicode string. - * - * This function allocates the string and the caller is responsible for - * calling kmem_cache_free(ntfs_name_cache, *@outs); when finished with it. - * - * On success the function returns the number of Unicode characters written to - * the output string *@outs (>= 0), not counting the terminating Unicode NULL - * character. *@outs is set to the allocated output string buffer. - * - * On error, a negative number corresponding to the error code is returned. In - * that case the output string is not allocated. Both *@outs and *@outs_len - * are then undefined. - * - * This might look a bit odd due to fast path optimization... - */ -int ntfs_nlstoucs(const ntfs_volume *vol, const char *ins, - const int ins_len, ntfschar **outs) -{ - struct nls_table *nls = vol->nls_map; - ntfschar *ucs; - wchar_t wc; - int i, o, wc_len; - - /* We do not trust outside sources. 
*/ - if (likely(ins)) { - ucs = kmem_cache_alloc(ntfs_name_cache, GFP_NOFS); - if (likely(ucs)) { - for (i = o = 0; i < ins_len; i += wc_len) { - wc_len = nls->char2uni(ins + i, ins_len - i, - &wc); - if (likely(wc_len >= 0 && - o < NTFS_MAX_NAME_LEN)) { - if (likely(wc)) { - ucs[o++] = cpu_to_le16(wc); - continue; - } /* else if (!wc) */ - break; - } /* else if (wc_len < 0 || - o >= NTFS_MAX_NAME_LEN) */ - goto name_err; - } - ucs[o] = 0; - *outs = ucs; - return o; - } /* else if (!ucs) */ - ntfs_error(vol->sb, "Failed to allocate buffer for converted " - "name from ntfs_name_cache."); - return -ENOMEM; - } /* else if (!ins) */ - ntfs_error(vol->sb, "Received NULL pointer."); - return -EINVAL; -name_err: - kmem_cache_free(ntfs_name_cache, ucs); - if (wc_len < 0) { - ntfs_error(vol->sb, "Name using character set %s contains " - "characters that cannot be converted to " - "Unicode.", nls->charset); - i = -EILSEQ; - } else /* if (o >= NTFS_MAX_NAME_LEN) */ { - ntfs_error(vol->sb, "Name is too long (maximum length for a " - "name on NTFS is %d Unicode characters.", - NTFS_MAX_NAME_LEN); - i = -ENAMETOOLONG; - } - return i; -} - -/** - * ntfs_ucstonls - convert little endian Unicode string to NLS string - * @vol: ntfs volume which we are working with - * @ins: input Unicode string buffer - * @ins_len: length of input string in Unicode characters - * @outs: on return contains the (allocated) output NLS string buffer - * @outs_len: length of output string buffer in bytes - * - * Convert the input little endian, 2-byte Unicode string @ins, of length - * @ins_len into the string format dictated by the loaded NLS. - * - * If *@outs is NULL, this function allocates the string and the caller is - * responsible for calling kfree(*@outs); when finished with it. In this case - * @outs_len is ignored and can be 0. - * - * On success the function returns the number of bytes written to the output - * string *@outs (>= 0), not counting the terminating NULL byte. If the output - * string buffer was allocated, *@outs is set to it. - * - * On error, a negative number corresponding to the error code is returned. In - * that case the output string is not allocated. The contents of *@outs are - * then undefined. - * - * This might look a bit odd due to fast path optimization... - */ -int ntfs_ucstonls(const ntfs_volume *vol, const ntfschar *ins, - const int ins_len, unsigned char **outs, int outs_len) -{ - struct nls_table *nls = vol->nls_map; - unsigned char *ns; - int i, o, ns_len, wc; - - /* We don't trust outside sources. */ - if (ins) { - ns = *outs; - ns_len = outs_len; - if (ns && !ns_len) { - wc = -ENAMETOOLONG; - goto conversion_err; - } - if (!ns) { - ns_len = ins_len * NLS_MAX_CHARSET_SIZE; - ns = kmalloc(ns_len + 1, GFP_NOFS); - if (!ns) - goto mem_err_out; - } - for (i = o = 0; i < ins_len; i++) { -retry: wc = nls->uni2char(le16_to_cpu(ins[i]), ns + o, - ns_len - o); - if (wc > 0) { - o += wc; - continue; - } else if (!wc) - break; - else if (wc == -ENAMETOOLONG && ns != *outs) { - unsigned char *tc; - /* Grow in multiples of 64 bytes. */ - tc = kmalloc((ns_len + 64) & - ~63, GFP_NOFS); - if (tc) { - memcpy(tc, ns, ns_len); - ns_len = ((ns_len + 64) & ~63) - 1; - kfree(ns); - ns = tc; - goto retry; - } /* No memory so goto conversion_error; */ - } /* wc < 0, real error. 
*/ - goto conversion_err; - } - ns[o] = 0; - *outs = ns; - return o; - } /* else (!ins) */ - ntfs_error(vol->sb, "Received NULL pointer."); - return -EINVAL; -conversion_err: - ntfs_error(vol->sb, "Unicode name contains characters that cannot be " - "converted to character set %s. You might want to " - "try to use the mount option nls=utf8.", nls->charset); - if (ns != *outs) - kfree(ns); - if (wc != -ENAMETOOLONG) - wc = -EILSEQ; - return wc; -mem_err_out: - ntfs_error(vol->sb, "Failed to allocate name!"); - return -ENOMEM; -} diff --git a/fs/ntfs/upcase.c b/fs/ntfs/upcase.c deleted file mode 100644 index 4ebe84a78dea5..0000000000000 --- a/fs/ntfs/upcase.c +++ /dev/null @@ -1,73 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * upcase.c - Generate the full NTFS Unicode upcase table in little endian. - * Part of the Linux-NTFS project. - * - * Copyright (c) 2001 Richard Russon - * Copyright (c) 2001-2006 Anton Altaparmakov - */ - -#include "malloc.h" -#include "ntfs.h" - -ntfschar *generate_default_upcase(void) -{ - static const int uc_run_table[][3] = { /* Start, End, Add */ - {0x0061, 0x007B, -32}, {0x0451, 0x045D, -80}, {0x1F70, 0x1F72, 74}, - {0x00E0, 0x00F7, -32}, {0x045E, 0x0460, -80}, {0x1F72, 0x1F76, 86}, - {0x00F8, 0x00FF, -32}, {0x0561, 0x0587, -48}, {0x1F76, 0x1F78, 100}, - {0x0256, 0x0258, -205}, {0x1F00, 0x1F08, 8}, {0x1F78, 0x1F7A, 128}, - {0x028A, 0x028C, -217}, {0x1F10, 0x1F16, 8}, {0x1F7A, 0x1F7C, 112}, - {0x03AC, 0x03AD, -38}, {0x1F20, 0x1F28, 8}, {0x1F7C, 0x1F7E, 126}, - {0x03AD, 0x03B0, -37}, {0x1F30, 0x1F38, 8}, {0x1FB0, 0x1FB2, 8}, - {0x03B1, 0x03C2, -32}, {0x1F40, 0x1F46, 8}, {0x1FD0, 0x1FD2, 8}, - {0x03C2, 0x03C3, -31}, {0x1F51, 0x1F52, 8}, {0x1FE0, 0x1FE2, 8}, - {0x03C3, 0x03CC, -32}, {0x1F53, 0x1F54, 8}, {0x1FE5, 0x1FE6, 7}, - {0x03CC, 0x03CD, -64}, {0x1F55, 0x1F56, 8}, {0x2170, 0x2180, -16}, - {0x03CD, 0x03CF, -63}, {0x1F57, 0x1F58, 8}, {0x24D0, 0x24EA, -26}, - {0x0430, 0x0450, -32}, {0x1F60, 0x1F68, 8}, {0xFF41, 0xFF5B, -32}, - {0} - }; - - static const int uc_dup_table[][2] = { /* Start, End */ - {0x0100, 0x012F}, {0x01A0, 0x01A6}, {0x03E2, 0x03EF}, {0x04CB, 0x04CC}, - {0x0132, 0x0137}, {0x01B3, 0x01B7}, {0x0460, 0x0481}, {0x04D0, 0x04EB}, - {0x0139, 0x0149}, {0x01CD, 0x01DD}, {0x0490, 0x04BF}, {0x04EE, 0x04F5}, - {0x014A, 0x0178}, {0x01DE, 0x01EF}, {0x04BF, 0x04BF}, {0x04F8, 0x04F9}, - {0x0179, 0x017E}, {0x01F4, 0x01F5}, {0x04C1, 0x04C4}, {0x1E00, 0x1E95}, - {0x018B, 0x018B}, {0x01FA, 0x0218}, {0x04C7, 0x04C8}, {0x1EA0, 0x1EF9}, - {0} - }; - - static const int uc_word_table[][2] = { /* Offset, Value */ - {0x00FF, 0x0178}, {0x01AD, 0x01AC}, {0x01F3, 0x01F1}, {0x0269, 0x0196}, - {0x0183, 0x0182}, {0x01B0, 0x01AF}, {0x0253, 0x0181}, {0x026F, 0x019C}, - {0x0185, 0x0184}, {0x01B9, 0x01B8}, {0x0254, 0x0186}, {0x0272, 0x019D}, - {0x0188, 0x0187}, {0x01BD, 0x01BC}, {0x0259, 0x018F}, {0x0275, 0x019F}, - {0x018C, 0x018B}, {0x01C6, 0x01C4}, {0x025B, 0x0190}, {0x0283, 0x01A9}, - {0x0192, 0x0191}, {0x01C9, 0x01C7}, {0x0260, 0x0193}, {0x0288, 0x01AE}, - {0x0199, 0x0198}, {0x01CC, 0x01CA}, {0x0263, 0x0194}, {0x0292, 0x01B7}, - {0x01A8, 0x01A7}, {0x01DD, 0x018E}, {0x0268, 0x0197}, - {0} - }; - - int i, r; - ntfschar *uc; - - uc = ntfs_malloc_nofs(default_upcase_len * sizeof(ntfschar)); - if (!uc) - return uc; - memset(uc, 0, default_upcase_len * sizeof(ntfschar)); - /* Generate the little endian Unicode upcase table used by ntfs. 
*/ - for (i = 0; i < default_upcase_len; i++) - uc[i] = cpu_to_le16(i); - for (r = 0; uc_run_table[r][0]; r++) - for (i = uc_run_table[r][0]; i < uc_run_table[r][1]; i++) - le16_add_cpu(&uc[i], uc_run_table[r][2]); - for (r = 0; uc_dup_table[r][0]; r++) - for (i = uc_dup_table[r][0]; i < uc_dup_table[r][1]; i += 2) - le16_add_cpu(&uc[i + 1], -1); - for (r = 0; uc_word_table[r][0]; r++) - uc[uc_word_table[r][0]] = cpu_to_le16(uc_word_table[r][1]); - return uc; -} diff --git a/fs/ntfs/usnjrnl.c b/fs/ntfs/usnjrnl.c deleted file mode 100644 index 9097a0b4ef253..0000000000000 --- a/fs/ntfs/usnjrnl.c +++ /dev/null @@ -1,70 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * usnjrnl.h - NTFS kernel transaction log ($UsnJrnl) handling. Part of the - * Linux-NTFS project. - * - * Copyright (c) 2005 Anton Altaparmakov - */ - -#ifdef NTFS_RW - -#include -#include -#include - -#include "aops.h" -#include "debug.h" -#include "endian.h" -#include "time.h" -#include "types.h" -#include "usnjrnl.h" -#include "volume.h" - -/** - * ntfs_stamp_usnjrnl - stamp the transaction log ($UsnJrnl) on an ntfs volume - * @vol: ntfs volume on which to stamp the transaction log - * - * Stamp the transaction log ($UsnJrnl) on the ntfs volume @vol and return - * 'true' on success and 'false' on error. - * - * This function assumes that the transaction log has already been loaded and - * consistency checked by a call to fs/ntfs/super.c::load_and_init_usnjrnl(). - */ -bool ntfs_stamp_usnjrnl(ntfs_volume *vol) -{ - ntfs_debug("Entering."); - if (likely(!NVolUsnJrnlStamped(vol))) { - sle64 stamp; - struct page *page; - USN_HEADER *uh; - - page = ntfs_map_page(vol->usnjrnl_max_ino->i_mapping, 0); - if (IS_ERR(page)) { - ntfs_error(vol->sb, "Failed to read from " - "$UsnJrnl/$DATA/$Max attribute."); - return false; - } - uh = (USN_HEADER*)page_address(page); - stamp = get_current_ntfs_time(); - ntfs_debug("Stamping transaction log ($UsnJrnl): old " - "journal_id 0x%llx, old lowest_valid_usn " - "0x%llx, new journal_id 0x%llx, new " - "lowest_valid_usn 0x%llx.", - (long long)sle64_to_cpu(uh->journal_id), - (long long)sle64_to_cpu(uh->lowest_valid_usn), - (long long)sle64_to_cpu(stamp), - i_size_read(vol->usnjrnl_j_ino)); - uh->lowest_valid_usn = - cpu_to_sle64(i_size_read(vol->usnjrnl_j_ino)); - uh->journal_id = stamp; - flush_dcache_page(page); - set_page_dirty(page); - ntfs_unmap_page(page); - /* Set the flag so we do not have to do it again on remount. */ - NVolSetUsnJrnlStamped(vol); - } - ntfs_debug("Done."); - return true; -} - -#endif /* NTFS_RW */ diff --git a/fs/ntfs/usnjrnl.h b/fs/ntfs/usnjrnl.h deleted file mode 100644 index 85f531b593955..0000000000000 --- a/fs/ntfs/usnjrnl.h +++ /dev/null @@ -1,191 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * usnjrnl.h - Defines for NTFS kernel transaction log ($UsnJrnl) handling. - * Part of the Linux-NTFS project. - * - * Copyright (c) 2005 Anton Altaparmakov - */ - -#ifndef _LINUX_NTFS_USNJRNL_H -#define _LINUX_NTFS_USNJRNL_H - -#ifdef NTFS_RW - -#include "types.h" -#include "endian.h" -#include "layout.h" -#include "volume.h" - -/* - * Transaction log ($UsnJrnl) organization: - * - * The transaction log records whenever a file is modified in any way. So for - * example it will record that file "blah" was written to at a particular time - * but not what was written. If will record that a file was deleted or - * created, that a file was truncated, etc. See below for all the reason - * codes used. 
- * - * The transaction log is in the $Extend directory which is in the root - * directory of each volume. If it is not present it means transaction - * logging is disabled. If it is present it means transaction logging is - * either enabled or in the process of being disabled in which case we can - * ignore it as it will go away as soon as Windows gets its hands on it. - * - * To determine whether the transaction logging is enabled or in the process - * of being disabled, need to check the volume flags in the - * $VOLUME_INFORMATION attribute in the $Volume system file (which is present - * in the root directory and has a fixed mft record number, see layout.h). - * If the flag VOLUME_DELETE_USN_UNDERWAY is set it means the transaction log - * is in the process of being disabled and if this flag is clear it means the - * transaction log is enabled. - * - * The transaction log consists of two parts; the $DATA/$Max attribute as well - * as the $DATA/$J attribute. $Max is a header describing the transaction - * log whilst $J is the transaction log data itself as a sequence of variable - * sized USN_RECORDs (see below for all the structures). - * - * We do not care about transaction logging at this point in time but we still - * need to let windows know that the transaction log is out of date. To do - * this we need to stamp the transaction log. This involves setting the - * lowest_valid_usn field in the $DATA/$Max attribute to the usn to be used - * for the next added USN_RECORD to the $DATA/$J attribute as well as - * generating a new journal_id in $DATA/$Max. - * - * The journal_id is as of the current version (2.0) of the transaction log - * simply the 64-bit timestamp of when the journal was either created or last - * stamped. - * - * To determine the next usn there are two ways. The first is to parse - * $DATA/$J and to find the last USN_RECORD in it and to add its record_length - * to its usn (which is the byte offset in the $DATA/$J attribute). The - * second is simply to take the data size of the attribute. Since the usns - * are simply byte offsets into $DATA/$J, this is exactly the next usn. For - * obvious reasons we use the second method as it is much simpler and faster. - * - * As an aside, note that to actually disable the transaction log, one would - * need to set the VOLUME_DELETE_USN_UNDERWAY flag (see above), then go - * through all the mft records on the volume and set the usn field in their - * $STANDARD_INFORMATION attribute to zero. Once that is done, one would need - * to delete the transaction log file, i.e. \$Extent\$UsnJrnl, and finally, - * one would need to clear the VOLUME_DELETE_USN_UNDERWAY flag. - * - * Note that if a volume is unmounted whilst the transaction log is being - * disabled, the process will continue the next time the volume is mounted. - * This is why we can safely mount read-write when we see a transaction log - * in the process of being deleted. - */ - -/* Some $UsnJrnl related constants. */ -#define UsnJrnlMajorVer 2 -#define UsnJrnlMinorVer 0 - -/* - * $DATA/$Max attribute. This is (always?) resident and has a fixed size of - * 32 bytes. It contains the header describing the transaction log. - */ -typedef struct { -/*Ofs*/ -/* 0*/sle64 maximum_size; /* The maximum on-disk size of the $DATA/$J - attribute. */ -/* 8*/sle64 allocation_delta; /* Number of bytes by which to increase the - size of the $DATA/$J attribute. */ -/*0x10*/sle64 journal_id; /* Current id of the transaction log. 
*/ -/*0x18*/leUSN lowest_valid_usn; /* Lowest valid usn in $DATA/$J for the - current journal_id. */ -/* sizeof() = 32 (0x20) bytes */ -} __attribute__ ((__packed__)) USN_HEADER; - -/* - * Reason flags (32-bit). Cumulative flags describing the change(s) to the - * file since it was last opened. I think the names speak for themselves but - * if you disagree check out the descriptions in the Linux NTFS project NTFS - * documentation: http://www.linux-ntfs.org/ - */ -enum { - USN_REASON_DATA_OVERWRITE = cpu_to_le32(0x00000001), - USN_REASON_DATA_EXTEND = cpu_to_le32(0x00000002), - USN_REASON_DATA_TRUNCATION = cpu_to_le32(0x00000004), - USN_REASON_NAMED_DATA_OVERWRITE = cpu_to_le32(0x00000010), - USN_REASON_NAMED_DATA_EXTEND = cpu_to_le32(0x00000020), - USN_REASON_NAMED_DATA_TRUNCATION= cpu_to_le32(0x00000040), - USN_REASON_FILE_CREATE = cpu_to_le32(0x00000100), - USN_REASON_FILE_DELETE = cpu_to_le32(0x00000200), - USN_REASON_EA_CHANGE = cpu_to_le32(0x00000400), - USN_REASON_SECURITY_CHANGE = cpu_to_le32(0x00000800), - USN_REASON_RENAME_OLD_NAME = cpu_to_le32(0x00001000), - USN_REASON_RENAME_NEW_NAME = cpu_to_le32(0x00002000), - USN_REASON_INDEXABLE_CHANGE = cpu_to_le32(0x00004000), - USN_REASON_BASIC_INFO_CHANGE = cpu_to_le32(0x00008000), - USN_REASON_HARD_LINK_CHANGE = cpu_to_le32(0x00010000), - USN_REASON_COMPRESSION_CHANGE = cpu_to_le32(0x00020000), - USN_REASON_ENCRYPTION_CHANGE = cpu_to_le32(0x00040000), - USN_REASON_OBJECT_ID_CHANGE = cpu_to_le32(0x00080000), - USN_REASON_REPARSE_POINT_CHANGE = cpu_to_le32(0x00100000), - USN_REASON_STREAM_CHANGE = cpu_to_le32(0x00200000), - USN_REASON_CLOSE = cpu_to_le32(0x80000000), -}; - -typedef le32 USN_REASON_FLAGS; - -/* - * Source info flags (32-bit). Information about the source of the change(s) - * to the file. For detailed descriptions of what these mean, see the Linux - * NTFS project NTFS documentation: - * http://www.linux-ntfs.org/ - */ -enum { - USN_SOURCE_DATA_MANAGEMENT = cpu_to_le32(0x00000001), - USN_SOURCE_AUXILIARY_DATA = cpu_to_le32(0x00000002), - USN_SOURCE_REPLICATION_MANAGEMENT = cpu_to_le32(0x00000004), -}; - -typedef le32 USN_SOURCE_INFO_FLAGS; - -/* - * $DATA/$J attribute. This is always non-resident, is marked as sparse, and - * is of variabled size. It consists of a sequence of variable size - * USN_RECORDS. The minimum allocated_size is allocation_delta as - * specified in $DATA/$Max. When the maximum_size specified in $DATA/$Max is - * exceeded by more than allocation_delta bytes, allocation_delta bytes are - * allocated and appended to the $DATA/$J attribute and an equal number of - * bytes at the beginning of the attribute are freed and made sparse. Note the - * making sparse only happens at volume checkpoints and hence the actual - * $DATA/$J size can exceed maximum_size + allocation_delta temporarily. - */ -typedef struct { -/*Ofs*/ -/* 0*/le32 length; /* Byte size of this record (8-byte - aligned). */ -/* 4*/le16 major_ver; /* Major version of the transaction log used - for this record. */ -/* 6*/le16 minor_ver; /* Minor version of the transaction log used - for this record. */ -/* 8*/leMFT_REF mft_reference;/* The mft reference of the file (or - directory) described by this record. */ -/*0x10*/leMFT_REF parent_directory;/* The mft reference of the parent - directory of the file described by this - record. */ -/*0x18*/leUSN usn; /* The usn of this record. Equals the offset - within the $DATA/$J attribute. */ -/*0x20*/sle64 time; /* Time when this record was created. 
*/ -/*0x28*/USN_REASON_FLAGS reason;/* Reason flags (see above). */ -/*0x2c*/USN_SOURCE_INFO_FLAGS source_info;/* Source info flags (see above). */ -/*0x30*/le32 security_id; /* File security_id copied from - $STANDARD_INFORMATION. */ -/*0x34*/FILE_ATTR_FLAGS file_attributes; /* File attributes copied from - $STANDARD_INFORMATION or $FILE_NAME (not - sure which). */ -/*0x38*/le16 file_name_size; /* Size of the file name in bytes. */ -/*0x3a*/le16 file_name_offset; /* Offset to the file name in bytes from the - start of this record. */ -/*0x3c*/ntfschar file_name[0]; /* Use when creating only. When reading use - file_name_offset to determine the location - of the name. */ -/* sizeof() = 60 (0x3c) bytes */ -} __attribute__ ((__packed__)) USN_RECORD; - -extern bool ntfs_stamp_usnjrnl(ntfs_volume *vol); - -#endif /* NTFS_RW */ - -#endif /* _LINUX_NTFS_USNJRNL_H */ diff --git a/fs/ntfs/volume.h b/fs/ntfs/volume.h deleted file mode 100644 index 930a9ae8a0534..0000000000000 --- a/fs/ntfs/volume.h +++ /dev/null @@ -1,164 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * volume.h - Defines for volume structures in NTFS Linux kernel driver. Part - * of the Linux-NTFS project. - * - * Copyright (c) 2001-2006 Anton Altaparmakov - * Copyright (c) 2002 Richard Russon - */ - -#ifndef _LINUX_NTFS_VOLUME_H -#define _LINUX_NTFS_VOLUME_H - -#include -#include - -#include "types.h" -#include "layout.h" - -/* - * The NTFS in memory super block structure. - */ -typedef struct { - /* - * FIXME: Reorder to have commonly used together element within the - * same cache line, aiming at a cache line size of 32 bytes. Aim for - * 64 bytes for less commonly used together elements. Put most commonly - * used elements to front of structure. Obviously do this only when the - * structure has stabilized... (AIA) - */ - /* Device specifics. */ - struct super_block *sb; /* Pointer back to the super_block. */ - LCN nr_blocks; /* Number of sb->s_blocksize bytes - sized blocks on the device. */ - /* Configuration provided by user at mount time. */ - unsigned long flags; /* Miscellaneous flags, see below. */ - kuid_t uid; /* uid that files will be mounted as. */ - kgid_t gid; /* gid that files will be mounted as. */ - umode_t fmask; /* The mask for file permissions. */ - umode_t dmask; /* The mask for directory - permissions. */ - u8 mft_zone_multiplier; /* Initial mft zone multiplier. */ - u8 on_errors; /* What to do on filesystem errors. */ - /* NTFS bootsector provided information. */ - u16 sector_size; /* in bytes */ - u8 sector_size_bits; /* log2(sector_size) */ - u32 cluster_size; /* in bytes */ - u32 cluster_size_mask; /* cluster_size - 1 */ - u8 cluster_size_bits; /* log2(cluster_size) */ - u32 mft_record_size; /* in bytes */ - u32 mft_record_size_mask; /* mft_record_size - 1 */ - u8 mft_record_size_bits; /* log2(mft_record_size) */ - u32 index_record_size; /* in bytes */ - u32 index_record_size_mask; /* index_record_size - 1 */ - u8 index_record_size_bits; /* log2(index_record_size) */ - LCN nr_clusters; /* Volume size in clusters == number of - bits in lcn bitmap. */ - LCN mft_lcn; /* Cluster location of mft data. */ - LCN mftmirr_lcn; /* Cluster location of copy of mft. */ - u64 serial_no; /* The volume serial number. */ - /* Mount specific NTFS information. */ - u32 upcase_len; /* Number of entries in upcase[]. */ - ntfschar *upcase; /* The upcase table. */ - - s32 attrdef_size; /* Size of the attribute definition - table in bytes. */ - ATTR_DEF *attrdef; /* Table of attribute definitions. 
- Obtained from FILE_AttrDef. */ - -#ifdef NTFS_RW - /* Variables used by the cluster and mft allocators. */ - s64 mft_data_pos; /* Mft record number at which to - allocate the next mft record. */ - LCN mft_zone_start; /* First cluster of the mft zone. */ - LCN mft_zone_end; /* First cluster beyond the mft zone. */ - LCN mft_zone_pos; /* Current position in the mft zone. */ - LCN data1_zone_pos; /* Current position in the first data - zone. */ - LCN data2_zone_pos; /* Current position in the second data - zone. */ -#endif /* NTFS_RW */ - - struct inode *mft_ino; /* The VFS inode of $MFT. */ - - struct inode *mftbmp_ino; /* Attribute inode for $MFT/$BITMAP. */ - struct rw_semaphore mftbmp_lock; /* Lock for serializing accesses to the - mft record bitmap ($MFT/$BITMAP). */ -#ifdef NTFS_RW - struct inode *mftmirr_ino; /* The VFS inode of $MFTMirr. */ - int mftmirr_size; /* Size of mft mirror in mft records. */ - - struct inode *logfile_ino; /* The VFS inode of $LogFile. */ -#endif /* NTFS_RW */ - - struct inode *lcnbmp_ino; /* The VFS inode of $Bitmap. */ - struct rw_semaphore lcnbmp_lock; /* Lock for serializing accesses to the - cluster bitmap ($Bitmap/$DATA). */ - - struct inode *vol_ino; /* The VFS inode of $Volume. */ - VOLUME_FLAGS vol_flags; /* Volume flags. */ - u8 major_ver; /* Ntfs major version of volume. */ - u8 minor_ver; /* Ntfs minor version of volume. */ - - struct inode *root_ino; /* The VFS inode of the root - directory. */ - struct inode *secure_ino; /* The VFS inode of $Secure (NTFS3.0+ - only, otherwise NULL). */ - struct inode *extend_ino; /* The VFS inode of $Extend (NTFS3.0+ - only, otherwise NULL). */ -#ifdef NTFS_RW - /* $Quota stuff is NTFS3.0+ specific. Unused/NULL otherwise. */ - struct inode *quota_ino; /* The VFS inode of $Quota. */ - struct inode *quota_q_ino; /* Attribute inode for $Quota/$Q. */ - /* $UsnJrnl stuff is NTFS3.0+ specific. Unused/NULL otherwise. */ - struct inode *usnjrnl_ino; /* The VFS inode of $UsnJrnl. */ - struct inode *usnjrnl_max_ino; /* Attribute inode for $UsnJrnl/$Max. */ - struct inode *usnjrnl_j_ino; /* Attribute inode for $UsnJrnl/$J. */ -#endif /* NTFS_RW */ - struct nls_table *nls_map; -} ntfs_volume; - -/* - * Defined bits for the flags field in the ntfs_volume structure. - */ -typedef enum { - NV_Errors, /* 1: Volume has errors, prevent remount rw. */ - NV_ShowSystemFiles, /* 1: Return system files in ntfs_readdir(). */ - NV_CaseSensitive, /* 1: Treat file names as case sensitive and - create filenames in the POSIX namespace. - Otherwise be case insensitive but still - create file names in POSIX namespace. */ - NV_LogFileEmpty, /* 1: $LogFile journal is empty. */ - NV_QuotaOutOfDate, /* 1: $Quota is out of date. */ - NV_UsnJrnlStamped, /* 1: $UsnJrnl has been stamped. */ - NV_SparseEnabled, /* 1: May create sparse files. */ -} ntfs_volume_flags; - -/* - * Macro tricks to expand the NVolFoo(), NVolSetFoo(), and NVolClearFoo() - * functions. - */ -#define DEFINE_NVOL_BIT_OPS(flag) \ -static inline int NVol##flag(ntfs_volume *vol) \ -{ \ - return test_bit(NV_##flag, &(vol)->flags); \ -} \ -static inline void NVolSet##flag(ntfs_volume *vol) \ -{ \ - set_bit(NV_##flag, &(vol)->flags); \ -} \ -static inline void NVolClear##flag(ntfs_volume *vol) \ -{ \ - clear_bit(NV_##flag, &(vol)->flags); \ -} - -/* Emit the ntfs volume bitops functions. 
*/ -DEFINE_NVOL_BIT_OPS(Errors) -DEFINE_NVOL_BIT_OPS(ShowSystemFiles) -DEFINE_NVOL_BIT_OPS(CaseSensitive) -DEFINE_NVOL_BIT_OPS(LogFileEmpty) -DEFINE_NVOL_BIT_OPS(QuotaOutOfDate) -DEFINE_NVOL_BIT_OPS(UsnJrnlStamped) -DEFINE_NVOL_BIT_OPS(SparseEnabled) - -#endif /* _LINUX_NTFS_VOLUME_H */ -- GitLab From c2427e70c1630d98966375fffc2b713ab9768a94 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Mon, 22 Jan 2024 10:08:07 -0800 Subject: [PATCH 0073/1519] x86/resctrl: Implement new mba_MBps throttling heuristic The mba_MBps feedback loop increases throttling when a group is using more bandwidth than the target set by the user in the schemata file, and decreases throttling when below target. To avoid possibly stepping throttling up and down on every poll a flag "delta_comp" is set whenever throttling is changed to indicate that the actual change in bandwidth should be recorded on the next poll in "delta_bw". Throttling is only reduced if the current bandwidth plus delta_bw is below the user target. This algorithm works well if the workload has steady bandwidth needs. But it can go badly wrong if the workload moves to a different phase just as the throttling level changed. E.g. if the workload becomes essentially idle right as throttling level is increased, the value calculated for delta_bw will be more or less the old bandwidth level. If the workload then resumes, Linux may never reduce throttling because current bandwidth plus delta_bw is above the target set by the user. Implement a simpler heuristic by assuming that in the worst case the currently measured bandwidth is being controlled by the current level of throttling. Compute how much it may increase if throttling is relaxed to the next higher level. If that is still below the user target, then it is ok to reduce the amount of throttling. 
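As a sketch with made-up numbers (the variable names mirror update_mba_bw(); the values are illustrative only): suppose the group is throttled to 30% (cur_msr_val = 30), the hardware granularity and minimum are both 10% (bw_gran = min_bw = 10), the measured bandwidth is cur_bw = 90 MBps, and the user target is user_bw = 130 MBps:

  /* Worst case: assume cur_bw is entirely limited by cur_msr_val. */
  projected_bw = cur_bw * (cur_msr_val + min_bw) / cur_msr_val;
  /* 90 * (30 + 10) / 30 = 120 MBps, still below the 130 MBps target, */
  /* so it is safe to relax throttling by one step. */
  if (cur_msr_val < MAX_MBA_BW && user_bw > projected_bw)
          new_msr_val = cur_msr_val + bw_gran;

Had the target been 110 MBps instead, the projected 120 MBps would exceed it and the throttling level would be left unchanged, avoiding the up/down thrashing that delta_bw was meant to prevent.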
Fixes: ba0f26d8529c ("x86/intel_rdt/mba_sc: Prepare for feedback loop") Reported-by: Xiaochen Shen Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Tested-by: Xiaochen Shen Link: https://lore.kernel.org/r/20240122180807.70518-1-tony.luck@intel.com --- arch/x86/kernel/cpu/resctrl/internal.h | 4 --- arch/x86/kernel/cpu/resctrl/monitor.c | 42 ++++++-------------------- 2 files changed, 10 insertions(+), 36 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index e3dc35a00a197..52e7e7deee106 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -295,14 +295,10 @@ struct rftype { * struct mbm_state - status for each MBM counter in each domain * @prev_bw_bytes: Previous bytes value read for bandwidth calculation * @prev_bw: The most recent bandwidth in MBps - * @delta_bw: Difference between the current and previous bandwidth - * @delta_comp: Indicates whether to compute the delta_bw */ struct mbm_state { u64 prev_bw_bytes; u32 prev_bw; - u32 delta_bw; - bool delta_comp; }; /** diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index acca577e2b066..3a6c069614eb8 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -440,9 +440,6 @@ static void mbm_bw_count(u32 rmid, struct rmid_read *rr) cur_bw = bytes / SZ_1M; - if (m->delta_comp) - m->delta_bw = abs(cur_bw - m->prev_bw); - m->delta_comp = false; m->prev_bw = cur_bw; } @@ -520,11 +517,11 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) { u32 closid, rmid, cur_msr_val, new_msr_val; struct mbm_state *pmbm_data, *cmbm_data; - u32 cur_bw, delta_bw, user_bw; struct rdt_resource *r_mba; struct rdt_domain *dom_mba; struct list_head *head; struct rdtgroup *entry; + u32 cur_bw, user_bw; if (!is_mbm_local_enabled()) return; @@ -543,7 +540,6 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) cur_bw = pmbm_data->prev_bw; user_bw = dom_mba->mbps_val[closid]; - delta_bw = pmbm_data->delta_bw; /* MBA resource doesn't support CDP */ cur_msr_val = resctrl_arch_get_config(r_mba, dom_mba, closid, CDP_NONE); @@ -555,49 +551,31 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) list_for_each_entry(entry, head, mon.crdtgrp_list) { cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid]; cur_bw += cmbm_data->prev_bw; - delta_bw += cmbm_data->delta_bw; } /* * Scale up/down the bandwidth linearly for the ctrl group. The * bandwidth step is the bandwidth granularity specified by the * hardware. - * - * The delta_bw is used when increasing the bandwidth so that we - * dont alternately increase and decrease the control values - * continuously. - * - * For ex: consider cur_bw = 90MBps, user_bw = 100MBps and if - * bandwidth step is 20MBps(> user_bw - cur_bw), we would keep - * switching between 90 and 110 continuously if we only check - * cur_bw < user_bw. + * Always increase throttling if current bandwidth is above the + * target set by user. + * But avoid thrashing up and down on every poll by checking + * whether a decrease in throttling is likely to push the group + * back over target. E.g. if currently throttling to 30% of bandwidth + * on a system with 10% granularity steps, check whether moving to + * 40% would go past the limit by multiplying current bandwidth by + * "(30 + 10) / 30". 
*/ if (cur_msr_val > r_mba->membw.min_bw && user_bw < cur_bw) { new_msr_val = cur_msr_val - r_mba->membw.bw_gran; } else if (cur_msr_val < MAX_MBA_BW && - (user_bw > (cur_bw + delta_bw))) { + (user_bw > (cur_bw * (cur_msr_val + r_mba->membw.min_bw) / cur_msr_val))) { new_msr_val = cur_msr_val + r_mba->membw.bw_gran; } else { return; } resctrl_arch_update_one(r_mba, dom_mba, closid, CDP_NONE, new_msr_val); - - /* - * Delta values are updated dynamically package wise for each - * rdtgrp every time the throttle MSR changes value. - * - * This is because (1)the increase in bandwidth is not perfectly - * linear and only "approximately" linear even when the hardware - * says it is linear.(2)Also since MBA is a core specific - * mechanism, the delta values vary based on number of cores used - * by the rdtgrp. - */ - pmbm_data->delta_comp = true; - list_for_each_entry(entry, head, mon.crdtgrp_list) { - cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid]; - cmbm_data->delta_comp = true; - } } static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, int rmid) -- GitLab From 06b8db3a7dde43cc7c412517c93c85d13a4557f8 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 23 Jan 2024 17:14:24 -0800 Subject: [PATCH 0074/1519] fs: remove NTFS classic from docum. index With the remove of the NTFS classic filesystem, also remove its documentation entry from the filesystems index to prevent a kernel-doc warning: Documentation/filesystems/index.rst:63: WARNING: toctree contains reference to nonexisting document 'filesystems/ntfs' Fixes: 9c67092ed339 ("fs: Remove NTFS classic") Signed-off-by: Randy Dunlap Link: https://lore.kernel.org/r/20240124011424.731-1-rdunlap@infradead.org Cc: Matthew Wilcox (Oracle) Cc: Alexander Viro Cc: Christian Brauner Cc: Jan Kara Cc: Cc: Namjae Jeon Cc: Dave Chinner Cc: Anton Altaparmakov Cc: Jonathan Corbet Cc: Signed-off-by: Christian Brauner --- Documentation/filesystems/index.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst index e18bc5ae3b35f..0ea1e44fa0282 100644 --- a/Documentation/filesystems/index.rst +++ b/Documentation/filesystems/index.rst @@ -98,7 +98,6 @@ Documentation for filesystem implementations. isofs nilfs2 nfs/index - ntfs ntfs3 ocfs2 ocfs2-online-filecheck -- GitLab From 3f3174996be6b4312c38f54d5969f5d5b75fec9e Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Mon, 22 Jan 2024 22:13:59 -0600 Subject: [PATCH 0075/1519] RAS: Introduce AMD Address Translation Library AMD Zen-based systems report memory errors through Machine Check banks representing Unified Memory Controllers (UMCs). The address value reported for DRAM ECC errors is a "normalized address" that is relative to the UMC. This normalized address must be converted to a system physical address to be usable by the OS. Support for this address translation was introduced to the MCA subsystem with Zen1 systems. The code was later moved to the AMD64 EDAC module, since this was the only user of the code at the time. However, there are uses for this translation outside of EDAC. The system physical address can be used in MCA for preemptive page offlining as done in some MCA notifier functions. Also, this translation is needed as the basis of similar functionality needed for some CXL configurations on AMD systems. Introduce a common address translation library that can be used for multiple subsystems including MCA, EDAC, and CXL. 
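As a rough sketch of how a consumer such as an MCA notifier might use the library (the struct layout and function name below are assumptions for illustration, not a definitive statement of the exported API):

  /* Hypothetical caller; 'm' is the struct mce being handled. */
  struct atl_err a_err = {
          .addr = m->addr,        /* UMC normalized address */
          .ipid = m->ipid,        /* identifies the reporting UMC */
          .cpu  = m->extcpu,
  };
  unsigned long sys_addr;

  sys_addr = amd_convert_umc_mca_addr_to_sys_addr(&a_err);
  if (!IS_ERR_VALUE(sys_addr))
          memory_failure(sys_addr >> PAGE_SHIFT, 0); /* offline the page */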
Include support for UMC normalized to system physical address translation for current CPU systems. The Data Fabric Indirect register access offsets and one of the register fields were changed. Default to the current offsets and register field definition, and fall back to the older values if running on a "legacy" system. Provide built-in code to facilitate the loading and unloading of the library module without affecting other modules or built-in code. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20240123041401.79812-2-yazen.ghannam@amd.com --- MAINTAINERS | 6 + drivers/ras/Kconfig | 1 + drivers/ras/Makefile | 2 + drivers/ras/amd/atl/Kconfig | 20 + drivers/ras/amd/atl/Makefile | 18 + drivers/ras/amd/atl/access.c | 106 +++++ drivers/ras/amd/atl/core.c | 225 ++++++++++ drivers/ras/amd/atl/dehash.c | 407 ++++++++++++++++++ drivers/ras/amd/atl/denormalize.c | 617 +++++++++++++++++++++++++++ drivers/ras/amd/atl/internal.h | 297 +++++++++++++ drivers/ras/amd/atl/map.c | 665 ++++++++++++++++++++++++++++++ drivers/ras/amd/atl/reg_fields.h | 603 +++++++++++++++++++++++++++ drivers/ras/amd/atl/system.c | 281 +++++++++++++ drivers/ras/amd/atl/umc.c | 41 ++ drivers/ras/ras.c | 31 ++ include/linux/ras.h | 16 + 16 files changed, 3336 insertions(+) create mode 100644 drivers/ras/amd/atl/Kconfig create mode 100644 drivers/ras/amd/atl/Makefile create mode 100644 drivers/ras/amd/atl/access.c create mode 100644 drivers/ras/amd/atl/core.c create mode 100644 drivers/ras/amd/atl/dehash.c create mode 100644 drivers/ras/amd/atl/denormalize.c create mode 100644 drivers/ras/amd/atl/internal.h create mode 100644 drivers/ras/amd/atl/map.c create mode 100644 drivers/ras/amd/atl/reg_fields.h create mode 100644 drivers/ras/amd/atl/system.c create mode 100644 drivers/ras/amd/atl/umc.c diff --git a/MAINTAINERS b/MAINTAINERS index 8d1052fa6a692..25537a37338e5 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -897,6 +897,12 @@ Q: https://patchwork.kernel.org/project/linux-rdma/list/ F: drivers/infiniband/hw/efa/ F: include/uapi/rdma/efa-abi.h +AMD ADDRESS TRANSLATION LIBRARY (ATL) +M: Yazen Ghannam +L: linux-edac@vger.kernel.org +S: Supported +F: drivers/ras/amd/atl/* + AMD AXI W1 DRIVER M: Kris Chaplin R: Thomas Delev diff --git a/drivers/ras/Kconfig b/drivers/ras/Kconfig index c2a236f2e8460..2e969f59c0cac 100644 --- a/drivers/ras/Kconfig +++ b/drivers/ras/Kconfig @@ -32,5 +32,6 @@ menuconfig RAS if RAS source "arch/x86/ras/Kconfig" +source "drivers/ras/amd/atl/Kconfig" endif diff --git a/drivers/ras/Makefile b/drivers/ras/Makefile index 6f0404f501071..3fac80f580052 100644 --- a/drivers/ras/Makefile +++ b/drivers/ras/Makefile @@ -2,3 +2,5 @@ obj-$(CONFIG_RAS) += ras.o obj-$(CONFIG_DEBUG_FS) += debugfs.o obj-$(CONFIG_RAS_CEC) += cec.o + +obj-y += amd/atl/ diff --git a/drivers/ras/amd/atl/Kconfig b/drivers/ras/amd/atl/Kconfig new file mode 100644 index 0000000000000..a43513a700f1d --- /dev/null +++ b/drivers/ras/amd/atl/Kconfig @@ -0,0 +1,20 @@ +# SPDX-License-Identifier: GPL-2.0-or-later +# +# AMD Address Translation Library Kconfig +# +# Copyright (c) 2023, Advanced Micro Devices, Inc. +# All Rights Reserved. +# +# Author: Yazen Ghannam + +config AMD_ATL + tristate "AMD Address Translation Library" + depends on AMD_NB && X86_64 && RAS + default n + help + This library includes support for implementation-specific + address translation procedures needed for various error + handling cases.
+ + Enable this option if using DRAM ECC on Zen-based systems + and OS-based error handling. diff --git a/drivers/ras/amd/atl/Makefile b/drivers/ras/amd/atl/Makefile new file mode 100644 index 0000000000000..4acd5f05bd9c2 --- /dev/null +++ b/drivers/ras/amd/atl/Makefile @@ -0,0 +1,18 @@ +# SPDX-License-Identifier: GPL-2.0-or-later +# +# AMD Address Translation Library Makefile +# +# Copyright (c) 2023, Advanced Micro Devices, Inc. +# All Rights Reserved. +# +# Author: Yazen Ghannam + +amd_atl-y := access.o +amd_atl-y += core.o +amd_atl-y += dehash.o +amd_atl-y += denormalize.o +amd_atl-y += map.o +amd_atl-y += system.o +amd_atl-y += umc.o + +obj-$(CONFIG_AMD_ATL) += amd_atl.o diff --git a/drivers/ras/amd/atl/access.c b/drivers/ras/amd/atl/access.c new file mode 100644 index 0000000000000..f6dd87bb2c35a --- /dev/null +++ b/drivers/ras/amd/atl/access.c @@ -0,0 +1,106 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * AMD Address Translation Library + * + * access.c : DF Indirect Access functions + * + * Copyright (c) 2023, Advanced Micro Devices, Inc. + * All Rights Reserved. + * + * Author: Yazen Ghannam + */ + +#include "internal.h" + +/* Protect the PCI config register pairs used for DF indirect access. */ +static DEFINE_MUTEX(df_indirect_mutex); + +/* + * Data Fabric Indirect Access uses FICAA/FICAD. + * + * Fabric Indirect Configuration Access Address (FICAA): constructed based + * on the device's Instance Id and the PCI function and register offset of + * the desired register. + * + * Fabric Indirect Configuration Access Data (FICAD): there are FICAD + * low and high registers but so far only the low register is needed. + * + * Use Instance Id 0xFF to indicate a broadcast read. + */ +#define DF_BROADCAST 0xFF + +#define DF_FICAA_INST_EN BIT(0) +#define DF_FICAA_REG_NUM GENMASK(10, 1) +#define DF_FICAA_FUNC_NUM GENMASK(13, 11) +#define DF_FICAA_INST_ID GENMASK(23, 16) + +#define DF_FICAA_REG_NUM_LEGACY GENMASK(10, 2) + +static int __df_indirect_read(u16 node, u8 func, u16 reg, u8 instance_id, u32 *lo) +{ + u32 ficaa_addr = 0x8C, ficad_addr = 0xB8; + struct pci_dev *F4; + int err = -ENODEV; + u32 ficaa = 0; + + if (node >= amd_nb_num()) + goto out; + + F4 = node_to_amd_nb(node)->link; + if (!F4) + goto out; + + /* Enable instance-specific access. */ + if (instance_id != DF_BROADCAST) { + ficaa |= FIELD_PREP(DF_FICAA_INST_EN, 1); + ficaa |= FIELD_PREP(DF_FICAA_INST_ID, instance_id); + } + + /* + * The two least-significant bits are masked when inputting the + * register offset to FICAA.
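+ * For example, reading D18F0x50: the offset becomes 0x50 >> 2 = 0x14
+ * in the REG_NUM field, and it is shifted back (reg << 2) only for
+ * the debug print below.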
+ */ + reg >>= 2; + + if (df_cfg.flags.legacy_ficaa) { + ficaa_addr = 0x5C; + ficad_addr = 0x98; + + ficaa |= FIELD_PREP(DF_FICAA_REG_NUM_LEGACY, reg); + } else { + ficaa |= FIELD_PREP(DF_FICAA_REG_NUM, reg); + } + + ficaa |= FIELD_PREP(DF_FICAA_FUNC_NUM, func); + + mutex_lock(&df_indirect_mutex); + + err = pci_write_config_dword(F4, ficaa_addr, ficaa); + if (err) { + pr_warn("Error writing DF Indirect FICAA, FICAA=0x%x\n", ficaa); + goto out_unlock; + } + + err = pci_read_config_dword(F4, ficad_addr, lo); + if (err) + pr_warn("Error reading DF Indirect FICAD LO, FICAA=0x%x.\n", ficaa); + + pr_debug("node=%u inst=0x%x func=0x%x reg=0x%x val=0x%x", + node, instance_id, func, reg << 2, *lo); + +out_unlock: + mutex_unlock(&df_indirect_mutex); + +out: + return err; +} + +int df_indirect_read_instance(u16 node, u8 func, u16 reg, u8 instance_id, u32 *lo) +{ + return __df_indirect_read(node, func, reg, instance_id, lo); +} + +int df_indirect_read_broadcast(u16 node, u8 func, u16 reg, u32 *lo) +{ + return __df_indirect_read(node, func, reg, DF_BROADCAST, lo); +} diff --git a/drivers/ras/amd/atl/core.c b/drivers/ras/amd/atl/core.c new file mode 100644 index 0000000000000..6dc4e06305f75 --- /dev/null +++ b/drivers/ras/amd/atl/core.c @@ -0,0 +1,225 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * AMD Address Translation Library + * + * core.c : Module init and base translation functions + * + * Copyright (c) 2023, Advanced Micro Devices, Inc. + * All Rights Reserved. + * + * Author: Yazen Ghannam + */ + +#include +#include + +#include "internal.h" + +struct df_config df_cfg __read_mostly; + +static int addr_over_limit(struct addr_ctx *ctx) +{ + u64 dram_limit_addr; + + if (df_cfg.rev >= DF4) + dram_limit_addr = FIELD_GET(DF4_DRAM_LIMIT_ADDR, ctx->map.limit); + else + dram_limit_addr = FIELD_GET(DF2_DRAM_LIMIT_ADDR, ctx->map.limit); + + dram_limit_addr <<= DF_DRAM_BASE_LIMIT_LSB; + dram_limit_addr |= GENMASK(DF_DRAM_BASE_LIMIT_LSB - 1, 0); + + /* Is calculated system address above DRAM limit address? 
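+ * For example, a DF4 limit field of 0x7F decodes to
+ * (0x7F << 28) | 0xFFFFFFF = 0x7FFFFFFFF, i.e. the last byte
+ * below 32GB.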
*/ + if (ctx->ret_addr > dram_limit_addr) { + atl_debug(ctx, "Calculated address (0x%016llx) > DRAM limit (0x%016llx)", + ctx->ret_addr, dram_limit_addr); + return -EINVAL; + } + + return 0; +} + +static bool legacy_hole_en(struct addr_ctx *ctx) +{ + u32 reg = ctx->map.base; + + if (df_cfg.rev >= DF4) + reg = ctx->map.ctl; + + return FIELD_GET(DF_LEGACY_MMIO_HOLE_EN, reg); +} + +static int add_legacy_hole(struct addr_ctx *ctx) +{ + u32 dram_hole_base; + u8 func = 0; + + if (!legacy_hole_en(ctx)) + return 0; + + if (df_cfg.rev >= DF4) + func = 7; + + if (df_indirect_read_broadcast(ctx->node_id, func, 0x104, &dram_hole_base)) + return -EINVAL; + + dram_hole_base &= DF_DRAM_HOLE_BASE_MASK; + + if (ctx->ret_addr >= dram_hole_base) + ctx->ret_addr += (BIT_ULL(32) - dram_hole_base); + + return 0; +} + +static u64 get_base_addr(struct addr_ctx *ctx) +{ + u64 base_addr; + + if (df_cfg.rev >= DF4) + base_addr = FIELD_GET(DF4_BASE_ADDR, ctx->map.base); + else + base_addr = FIELD_GET(DF2_BASE_ADDR, ctx->map.base); + + return base_addr << DF_DRAM_BASE_LIMIT_LSB; +} + +static int add_base_and_hole(struct addr_ctx *ctx) +{ + ctx->ret_addr += get_base_addr(ctx); + + if (add_legacy_hole(ctx)) + return -EINVAL; + + return 0; +} + +static bool late_hole_remove(struct addr_ctx *ctx) +{ + if (df_cfg.rev == DF3p5) + return true; + + if (df_cfg.rev == DF4) + return true; + + if (ctx->map.intlv_mode == DF3_6CHAN) + return true; + + return false; +} + +unsigned long norm_to_sys_addr(u8 socket_id, u8 die_id, u8 coh_st_inst_id, unsigned long addr) +{ + struct addr_ctx ctx; + + if (df_cfg.rev == UNKNOWN) + return -EINVAL; + + memset(&ctx, 0, sizeof(ctx)); + + /* Start from the normalized address */ + ctx.ret_addr = addr; + ctx.inst_id = coh_st_inst_id; + + ctx.inputs.norm_addr = addr; + ctx.inputs.socket_id = socket_id; + ctx.inputs.die_id = die_id; + ctx.inputs.coh_st_inst_id = coh_st_inst_id; + + if (determine_node_id(&ctx, socket_id, die_id)) + return -EINVAL; + + if (get_address_map(&ctx)) + return -EINVAL; + + if (denormalize_address(&ctx)) + return -EINVAL; + + if (!late_hole_remove(&ctx) && add_base_and_hole(&ctx)) + return -EINVAL; + + if (dehash_address(&ctx)) + return -EINVAL; + + if (late_hole_remove(&ctx) && add_base_and_hole(&ctx)) + return -EINVAL; + + if (addr_over_limit(&ctx)) + return -EINVAL; + + return ctx.ret_addr; +} + +static void check_for_legacy_df_access(void) +{ + /* + * All Zen-based systems before Family 19h use the legacy + * DF Indirect Access (FICAA/FICAD) offsets. + */ + if (boot_cpu_data.x86 < 0x19) { + df_cfg.flags.legacy_ficaa = true; + return; + } + + /* All systems after Family 19h use the current offsets. */ + if (boot_cpu_data.x86 > 0x19) + return; + + /* Some Family 19h systems use the legacy offsets. */ + switch (boot_cpu_data.x86_model) { + case 0x00 ... 0x0f: + case 0x20 ... 0x5f: + df_cfg.flags.legacy_ficaa = true; + } +} + +/* + * This library provides functionality for AMD-based systems with a Data Fabric. + * The set of systems with a Data Fabric is equivalent to the set of Zen-based systems + * and the set of systems with the Scalable MCA feature at this time. However, these + * are technically independent things. + * + * It's possible to match on the PCI IDs of the Data Fabric devices, but this will be + * an ever expanding list. Instead, match on the SMCA and Zen features to cover all + * relevant systems. 
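+ *
+ * For example, a Family 19h Model 0x11 system matches here and uses the
+ * current FICAA/FICAD offsets, while Model 0x01 was flagged as legacy in
+ * check_for_legacy_df_access() above.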
+ */ +static const struct x86_cpu_id amd_atl_cpuids[] = { + X86_MATCH_FEATURE(X86_FEATURE_SMCA, NULL), + X86_MATCH_FEATURE(X86_FEATURE_ZEN, NULL), + { } +}; +MODULE_DEVICE_TABLE(x86cpu, amd_atl_cpuids); + +static int __init amd_atl_init(void) +{ + if (!x86_match_cpu(amd_atl_cpuids)) + return -ENODEV; + + if (!amd_nb_num()) + return -ENODEV; + + check_for_legacy_df_access(); + + if (get_df_system_info()) + return -ENODEV; + + /* Increment this module's refcount so that it can't be easily unloaded. */ + __module_get(THIS_MODULE); + amd_atl_register_decoder(convert_umc_mca_addr_to_sys_addr); + + pr_info("AMD Address Translation Library initialized"); + return 0; +} + +/* + * Exit function is only needed for testing and debug. Module unload must be + * forced to override refcount check. + */ +static void __exit amd_atl_exit(void) +{ + amd_atl_unregister_decoder(); +} + +module_init(amd_atl_init); +module_exit(amd_atl_exit); + +MODULE_LICENSE("GPL"); diff --git a/drivers/ras/amd/atl/dehash.c b/drivers/ras/amd/atl/dehash.c new file mode 100644 index 0000000000000..6f414926e6fe1 --- /dev/null +++ b/drivers/ras/amd/atl/dehash.c @@ -0,0 +1,407 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * AMD Address Translation Library + * + * dehash.c : Functions to account for hashing bits + * + * Copyright (c) 2023, Advanced Micro Devices, Inc. + * All Rights Reserved. + * + * Author: Yazen Ghannam + */ + +#include "internal.h" + +/* + * Verify the interleave bits are correct in the different interleaving + * settings. + * + * If @num_intlv_dies and/or @num_intlv_sockets are 1, it means the + * respective interleaving is disabled. + */ +static inline bool map_bits_valid(struct addr_ctx *ctx, u8 bit1, u8 bit2, + u8 num_intlv_dies, u8 num_intlv_sockets) +{ + if (!(ctx->map.intlv_bit_pos == bit1 || ctx->map.intlv_bit_pos == bit2)) { + pr_debug("Invalid interleave bit: %u", ctx->map.intlv_bit_pos); + return false; + } + + if (ctx->map.num_intlv_dies > num_intlv_dies) { + pr_debug("Invalid number of interleave dies: %u", ctx->map.num_intlv_dies); + return false; + } + + if (ctx->map.num_intlv_sockets > num_intlv_sockets) { + pr_debug("Invalid number of interleave sockets: %u", ctx->map.num_intlv_sockets); + return false; + } + + return true; +} + +static int df2_dehash_addr(struct addr_ctx *ctx) +{ + u8 hashed_bit, intlv_bit, intlv_bit_pos; + + if (!map_bits_valid(ctx, 8, 9, 1, 1)) + return -EINVAL; + + intlv_bit_pos = ctx->map.intlv_bit_pos; + intlv_bit = !!(BIT_ULL(intlv_bit_pos) & ctx->ret_addr); + + hashed_bit = intlv_bit; + hashed_bit ^= FIELD_GET(BIT_ULL(12), ctx->ret_addr); + hashed_bit ^= FIELD_GET(BIT_ULL(18), ctx->ret_addr); + hashed_bit ^= FIELD_GET(BIT_ULL(21), ctx->ret_addr); + hashed_bit ^= FIELD_GET(BIT_ULL(30), ctx->ret_addr); + + if (hashed_bit != intlv_bit) + ctx->ret_addr ^= BIT_ULL(intlv_bit_pos); + + return 0; +} + +static int df3_dehash_addr(struct addr_ctx *ctx) +{ + bool hash_ctl_64k, hash_ctl_2M, hash_ctl_1G; + u8 hashed_bit, intlv_bit, intlv_bit_pos; + + if (!map_bits_valid(ctx, 8, 9, 1, 1)) + return -EINVAL; + + hash_ctl_64k = FIELD_GET(DF3_HASH_CTL_64K, ctx->map.ctl); + hash_ctl_2M = FIELD_GET(DF3_HASH_CTL_2M, ctx->map.ctl); + hash_ctl_1G = FIELD_GET(DF3_HASH_CTL_1G, ctx->map.ctl); + + intlv_bit_pos = ctx->map.intlv_bit_pos; + intlv_bit = !!(BIT_ULL(intlv_bit_pos) & ctx->ret_addr); + + hashed_bit = intlv_bit; + hashed_bit ^= FIELD_GET(BIT_ULL(14), ctx->ret_addr); + hashed_bit ^= FIELD_GET(BIT_ULL(18), ctx->ret_addr) & hash_ctl_64k; + hashed_bit ^= FIELD_GET(BIT_ULL(23),
ctx->ret_addr) & hash_ctl_2M; + hashed_bit ^= FIELD_GET(BIT_ULL(32), ctx->ret_addr) & hash_ctl_1G; + + if (hashed_bit != intlv_bit) + ctx->ret_addr ^= BIT_ULL(intlv_bit_pos); + + /* Calculation complete for 2 channels. Continue for 4 and 8 channels. */ + if (ctx->map.intlv_mode == DF3_COD4_2CHAN_HASH) + return 0; + + intlv_bit = FIELD_GET(BIT_ULL(12), ctx->ret_addr); + + hashed_bit = intlv_bit; + hashed_bit ^= FIELD_GET(BIT_ULL(16), ctx->ret_addr) & hash_ctl_64k; + hashed_bit ^= FIELD_GET(BIT_ULL(21), ctx->ret_addr) & hash_ctl_2M; + hashed_bit ^= FIELD_GET(BIT_ULL(30), ctx->ret_addr) & hash_ctl_1G; + + if (hashed_bit != intlv_bit) + ctx->ret_addr ^= BIT_ULL(12); + + /* Calculation complete for 4 channels. Continue for 8 channels. */ + if (ctx->map.intlv_mode == DF3_COD2_4CHAN_HASH) + return 0; + + intlv_bit = FIELD_GET(BIT_ULL(13), ctx->ret_addr); + + hashed_bit = intlv_bit; + hashed_bit ^= FIELD_GET(BIT_ULL(17), ctx->ret_addr) & hash_ctl_64k; + hashed_bit ^= FIELD_GET(BIT_ULL(22), ctx->ret_addr) & hash_ctl_2M; + hashed_bit ^= FIELD_GET(BIT_ULL(31), ctx->ret_addr) & hash_ctl_1G; + + if (hashed_bit != intlv_bit) + ctx->ret_addr ^= BIT_ULL(13); + + return 0; +} + +static int df3_6chan_dehash_addr(struct addr_ctx *ctx) +{ + u8 intlv_bit_pos = ctx->map.intlv_bit_pos; + u8 hashed_bit, intlv_bit, num_intlv_bits; + bool hash_ctl_2M, hash_ctl_1G; + + if (ctx->map.intlv_mode != DF3_6CHAN) { + atl_debug_on_bad_intlv_mode(ctx); + return -EINVAL; + } + + num_intlv_bits = ilog2(ctx->map.num_intlv_chan) + 1; + + hash_ctl_2M = FIELD_GET(DF3_HASH_CTL_2M, ctx->map.ctl); + hash_ctl_1G = FIELD_GET(DF3_HASH_CTL_1G, ctx->map.ctl); + + intlv_bit = !!(BIT_ULL(intlv_bit_pos) & ctx->ret_addr); + + hashed_bit = intlv_bit; + hashed_bit ^= !!(BIT_ULL(intlv_bit_pos + num_intlv_bits) & ctx->ret_addr); + hashed_bit ^= FIELD_GET(BIT_ULL(23), ctx->ret_addr) & hash_ctl_2M; + hashed_bit ^= FIELD_GET(BIT_ULL(32), ctx->ret_addr) & hash_ctl_1G; + + if (hashed_bit != intlv_bit) + ctx->ret_addr ^= BIT_ULL(intlv_bit_pos); + + intlv_bit_pos++; + intlv_bit = !!(BIT_ULL(intlv_bit_pos) & ctx->ret_addr); + + hashed_bit = intlv_bit; + hashed_bit ^= FIELD_GET(BIT_ULL(21), ctx->ret_addr) & hash_ctl_2M; + hashed_bit ^= FIELD_GET(BIT_ULL(30), ctx->ret_addr) & hash_ctl_1G; + + if (hashed_bit != intlv_bit) + ctx->ret_addr ^= BIT_ULL(intlv_bit_pos); + + intlv_bit_pos++; + intlv_bit = !!(BIT_ULL(intlv_bit_pos) & ctx->ret_addr); + + hashed_bit = intlv_bit; + hashed_bit ^= FIELD_GET(BIT_ULL(22), ctx->ret_addr) & hash_ctl_2M; + hashed_bit ^= FIELD_GET(BIT_ULL(31), ctx->ret_addr) & hash_ctl_1G; + + if (hashed_bit != intlv_bit) + ctx->ret_addr ^= BIT_ULL(intlv_bit_pos); + + return 0; +} + +static int df4_dehash_addr(struct addr_ctx *ctx) +{ + bool hash_ctl_64k, hash_ctl_2M, hash_ctl_1G; + u8 hashed_bit, intlv_bit; + + if (!map_bits_valid(ctx, 8, 8, 1, 2)) + return -EINVAL; + + hash_ctl_64k = FIELD_GET(DF4_HASH_CTL_64K, ctx->map.ctl); + hash_ctl_2M = FIELD_GET(DF4_HASH_CTL_2M, ctx->map.ctl); + hash_ctl_1G = FIELD_GET(DF4_HASH_CTL_1G, ctx->map.ctl); + + intlv_bit = FIELD_GET(BIT_ULL(8), ctx->ret_addr); + + hashed_bit = intlv_bit; + hashed_bit ^= FIELD_GET(BIT_ULL(16), ctx->ret_addr) & hash_ctl_64k; + hashed_bit ^= FIELD_GET(BIT_ULL(21), ctx->ret_addr) & hash_ctl_2M; + hashed_bit ^= FIELD_GET(BIT_ULL(30), ctx->ret_addr) & hash_ctl_1G; + + if (ctx->map.num_intlv_sockets == 1) + hashed_bit ^= FIELD_GET(BIT_ULL(14), ctx->ret_addr); + + if (hashed_bit != intlv_bit) + ctx->ret_addr ^= BIT_ULL(8); + + /* + * Hashing is possible with socket interleaving, so 
check the total number + * of channels in the system rather than DRAM map interleaving mode. + * + * Calculation complete for 2 channels. Continue for 4, 8, and 16 channels. + */ + if (ctx->map.total_intlv_chan <= 2) + return 0; + + intlv_bit = FIELD_GET(BIT_ULL(12), ctx->ret_addr); + + hashed_bit = intlv_bit; + hashed_bit ^= FIELD_GET(BIT_ULL(17), ctx->ret_addr) & hash_ctl_64k; + hashed_bit ^= FIELD_GET(BIT_ULL(22), ctx->ret_addr) & hash_ctl_2M; + hashed_bit ^= FIELD_GET(BIT_ULL(31), ctx->ret_addr) & hash_ctl_1G; + + if (hashed_bit != intlv_bit) + ctx->ret_addr ^= BIT_ULL(12); + + /* Calculation complete for 4 channels. Continue for 8 and 16 channels. */ + if (ctx->map.total_intlv_chan <= 4) + return 0; + + intlv_bit = FIELD_GET(BIT_ULL(13), ctx->ret_addr); + + hashed_bit = intlv_bit; + hashed_bit ^= FIELD_GET(BIT_ULL(18), ctx->ret_addr) & hash_ctl_64k; + hashed_bit ^= FIELD_GET(BIT_ULL(23), ctx->ret_addr) & hash_ctl_2M; + hashed_bit ^= FIELD_GET(BIT_ULL(32), ctx->ret_addr) & hash_ctl_1G; + + if (hashed_bit != intlv_bit) + ctx->ret_addr ^= BIT_ULL(13); + + /* Calculation complete for 8 channels. Continue for 16 channels. */ + if (ctx->map.total_intlv_chan <= 8) + return 0; + + intlv_bit = FIELD_GET(BIT_ULL(14), ctx->ret_addr); + + hashed_bit = intlv_bit; + hashed_bit ^= FIELD_GET(BIT_ULL(19), ctx->ret_addr) & hash_ctl_64k; + hashed_bit ^= FIELD_GET(BIT_ULL(24), ctx->ret_addr) & hash_ctl_2M; + hashed_bit ^= FIELD_GET(BIT_ULL(33), ctx->ret_addr) & hash_ctl_1G; + + if (hashed_bit != intlv_bit) + ctx->ret_addr ^= BIT_ULL(14); + + return 0; +} + +static int df4p5_dehash_addr(struct addr_ctx *ctx) +{ + bool hash_ctl_64k, hash_ctl_2M, hash_ctl_1G, hash_ctl_1T; + u8 hashed_bit, intlv_bit; + u64 rehash_vector; + + if (!map_bits_valid(ctx, 8, 8, 1, 2)) + return -EINVAL; + + hash_ctl_64k = FIELD_GET(DF4_HASH_CTL_64K, ctx->map.ctl); + hash_ctl_2M = FIELD_GET(DF4_HASH_CTL_2M, ctx->map.ctl); + hash_ctl_1G = FIELD_GET(DF4_HASH_CTL_1G, ctx->map.ctl); + hash_ctl_1T = FIELD_GET(DF4_HASH_CTL_1T, ctx->map.ctl); + + /* + * Generate a unique address to determine which bits + * need to be dehashed. + * + * Start with a contiguous bitmask for the total + * number of channels starting at bit 8. + * + * Then make a gap in the proper place based on + * interleave mode. 
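+ *
+ * For example, with 8 interleaved channels the vector starts as bits
+ * [10:8]; a 1K hash expands this to bits {8, 9, 12}, while a 2K hash
+ * yields bits {8, 12, 13}.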
+ */ + rehash_vector = ctx->map.total_intlv_chan - 1; + rehash_vector <<= 8; + + if (ctx->map.intlv_mode == DF4p5_NPS2_4CHAN_1K_HASH || + ctx->map.intlv_mode == DF4p5_NPS1_8CHAN_1K_HASH || + ctx->map.intlv_mode == DF4p5_NPS1_16CHAN_1K_HASH) + rehash_vector = expand_bits(10, 2, rehash_vector); + else + rehash_vector = expand_bits(9, 3, rehash_vector); + + if (rehash_vector & BIT_ULL(8)) { + intlv_bit = FIELD_GET(BIT_ULL(8), ctx->ret_addr); + + hashed_bit = intlv_bit; + hashed_bit ^= FIELD_GET(BIT_ULL(16), ctx->ret_addr) & hash_ctl_64k; + hashed_bit ^= FIELD_GET(BIT_ULL(21), ctx->ret_addr) & hash_ctl_2M; + hashed_bit ^= FIELD_GET(BIT_ULL(30), ctx->ret_addr) & hash_ctl_1G; + hashed_bit ^= FIELD_GET(BIT_ULL(40), ctx->ret_addr) & hash_ctl_1T; + + if (hashed_bit != intlv_bit) + ctx->ret_addr ^= BIT_ULL(8); + } + + if (rehash_vector & BIT_ULL(9)) { + intlv_bit = FIELD_GET(BIT_ULL(9), ctx->ret_addr); + + hashed_bit = intlv_bit; + hashed_bit ^= FIELD_GET(BIT_ULL(17), ctx->ret_addr) & hash_ctl_64k; + hashed_bit ^= FIELD_GET(BIT_ULL(22), ctx->ret_addr) & hash_ctl_2M; + hashed_bit ^= FIELD_GET(BIT_ULL(31), ctx->ret_addr) & hash_ctl_1G; + hashed_bit ^= FIELD_GET(BIT_ULL(41), ctx->ret_addr) & hash_ctl_1T; + + if (hashed_bit != intlv_bit) + ctx->ret_addr ^= BIT_ULL(9); + } + + if (rehash_vector & BIT_ULL(12)) { + intlv_bit = FIELD_GET(BIT_ULL(12), ctx->ret_addr); + + hashed_bit = intlv_bit; + hashed_bit ^= FIELD_GET(BIT_ULL(18), ctx->ret_addr) & hash_ctl_64k; + hashed_bit ^= FIELD_GET(BIT_ULL(23), ctx->ret_addr) & hash_ctl_2M; + hashed_bit ^= FIELD_GET(BIT_ULL(32), ctx->ret_addr) & hash_ctl_1G; + hashed_bit ^= FIELD_GET(BIT_ULL(42), ctx->ret_addr) & hash_ctl_1T; + + if (hashed_bit != intlv_bit) + ctx->ret_addr ^= BIT_ULL(12); + } + + if (rehash_vector & BIT_ULL(13)) { + intlv_bit = FIELD_GET(BIT_ULL(13), ctx->ret_addr); + + hashed_bit = intlv_bit; + hashed_bit ^= FIELD_GET(BIT_ULL(19), ctx->ret_addr) & hash_ctl_64k; + hashed_bit ^= FIELD_GET(BIT_ULL(24), ctx->ret_addr) & hash_ctl_2M; + hashed_bit ^= FIELD_GET(BIT_ULL(33), ctx->ret_addr) & hash_ctl_1G; + hashed_bit ^= FIELD_GET(BIT_ULL(43), ctx->ret_addr) & hash_ctl_1T; + + if (hashed_bit != intlv_bit) + ctx->ret_addr ^= BIT_ULL(13); + } + + if (rehash_vector & BIT_ULL(14)) { + intlv_bit = FIELD_GET(BIT_ULL(14), ctx->ret_addr); + + hashed_bit = intlv_bit; + hashed_bit ^= FIELD_GET(BIT_ULL(20), ctx->ret_addr) & hash_ctl_64k; + hashed_bit ^= FIELD_GET(BIT_ULL(25), ctx->ret_addr) & hash_ctl_2M; + hashed_bit ^= FIELD_GET(BIT_ULL(34), ctx->ret_addr) & hash_ctl_1G; + hashed_bit ^= FIELD_GET(BIT_ULL(44), ctx->ret_addr) & hash_ctl_1T; + + if (hashed_bit != intlv_bit) + ctx->ret_addr ^= BIT_ULL(14); + } + + return 0; +} + +int dehash_address(struct addr_ctx *ctx) +{ + switch (ctx->map.intlv_mode) { + /* No hashing cases. */ + case NONE: + case NOHASH_2CHAN: + case NOHASH_4CHAN: + case NOHASH_8CHAN: + case NOHASH_16CHAN: + case NOHASH_32CHAN: + /* Hashing bits handled earlier during CS ID calculation. */ + case DF4_NPS4_3CHAN_HASH: + case DF4_NPS2_5CHAN_HASH: + case DF4_NPS2_6CHAN_HASH: + case DF4_NPS1_10CHAN_HASH: + case DF4_NPS1_12CHAN_HASH: + case DF4p5_NPS2_6CHAN_1K_HASH: + case DF4p5_NPS2_6CHAN_2K_HASH: + case DF4p5_NPS1_10CHAN_1K_HASH: + case DF4p5_NPS1_10CHAN_2K_HASH: + case DF4p5_NPS1_12CHAN_1K_HASH: + case DF4p5_NPS1_12CHAN_2K_HASH: + case DF4p5_NPS0_24CHAN_1K_HASH: + case DF4p5_NPS0_24CHAN_2K_HASH: + /* No hash physical address bits, so nothing to do. 
*/ + case DF4p5_NPS4_3CHAN_1K_HASH: + case DF4p5_NPS4_3CHAN_2K_HASH: + case DF4p5_NPS2_5CHAN_1K_HASH: + case DF4p5_NPS2_5CHAN_2K_HASH: + return 0; + + case DF2_2CHAN_HASH: + return df2_dehash_addr(ctx); + + case DF3_COD4_2CHAN_HASH: + case DF3_COD2_4CHAN_HASH: + case DF3_COD1_8CHAN_HASH: + return df3_dehash_addr(ctx); + + case DF3_6CHAN: + return df3_6chan_dehash_addr(ctx); + + case DF4_NPS4_2CHAN_HASH: + case DF4_NPS2_4CHAN_HASH: + case DF4_NPS1_8CHAN_HASH: + return df4_dehash_addr(ctx); + + case DF4p5_NPS4_2CHAN_1K_HASH: + case DF4p5_NPS4_2CHAN_2K_HASH: + case DF4p5_NPS2_4CHAN_2K_HASH: + case DF4p5_NPS2_4CHAN_1K_HASH: + case DF4p5_NPS1_8CHAN_1K_HASH: + case DF4p5_NPS1_8CHAN_2K_HASH: + case DF4p5_NPS1_16CHAN_1K_HASH: + case DF4p5_NPS1_16CHAN_2K_HASH: + return df4p5_dehash_addr(ctx); + + default: + atl_debug_on_bad_intlv_mode(ctx); + return -EINVAL; + } +} diff --git a/drivers/ras/amd/atl/denormalize.c b/drivers/ras/amd/atl/denormalize.c new file mode 100644 index 0000000000000..01f1d0fb67995 --- /dev/null +++ b/drivers/ras/amd/atl/denormalize.c @@ -0,0 +1,617 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * AMD Address Translation Library + * + * denormalize.c : Functions to account for interleaving bits + * + * Copyright (c) 2023, Advanced Micro Devices, Inc. + * All Rights Reserved. + * + * Author: Yazen Ghannam + */ + +#include "internal.h" + +/* + * Returns the Destination Fabric ID. This is the first (lowest) + * COH_ST Fabric ID used within a DRAM Address map. + */ +static u16 get_dst_fabric_id(struct addr_ctx *ctx) +{ + switch (df_cfg.rev) { + case DF2: return FIELD_GET(DF2_DST_FABRIC_ID, ctx->map.limit); + case DF3: return FIELD_GET(DF3_DST_FABRIC_ID, ctx->map.limit); + case DF3p5: return FIELD_GET(DF3p5_DST_FABRIC_ID, ctx->map.limit); + case DF4: return FIELD_GET(DF4_DST_FABRIC_ID, ctx->map.ctl); + case DF4p5: return FIELD_GET(DF4p5_DST_FABRIC_ID, ctx->map.ctl); + default: + atl_debug_on_bad_df_rev(); + return 0; + } +} + +/* + * Make a contiguous gap in address for N bits starting at bit P. + * + * Example: + * address bits: [20:0] + * # of interleave bits (n): 3 + * starting interleave bit (p): 8 + * + * expanded address bits: [20+n : n+p][n+p-1 : p][p-1 : 0] + * [23 : 11][10 : 8][7 : 0] + */ +static u64 make_space_for_coh_st_id_at_intlv_bit(struct addr_ctx *ctx) +{ + return expand_bits(ctx->map.intlv_bit_pos, + ctx->map.total_intlv_bits, + ctx->ret_addr); +} + +/* + * Make two gaps in address for N bits. + * First gap is a single bit at bit P. + * Second gap is the remaining N-1 bits at bit 12. + * + * Example: + * address bits: [20:0] + * # of interleave bits (n): 3 + * starting interleave bit (p): 8 + * + * First gap + * expanded address bits: [20+1 : p+1][p][p-1 : 0] + * [21 : 9][8][7 : 0] + * + * Second gap uses result from first. + * r = n - 1; remaining interleave bits + * expanded address bits: [21+r : 12+r][12+r-1: 12][11 : 0] + * [23 : 14][13 : 12][11 : 0] + */ +static u64 make_space_for_coh_st_id_split_2_1(struct addr_ctx *ctx) +{ + /* Make a single space at the interleave bit. */ + u64 denorm_addr = expand_bits(ctx->map.intlv_bit_pos, 1, ctx->ret_addr); + + /* Done if there's only a single interleave bit. */ + if (ctx->map.total_intlv_bits <= 1) + return denorm_addr; + + /* Make spaces for the remaining interleave bits starting at bit 12. */ + return expand_bits(12, ctx->map.total_intlv_bits - 1, denorm_addr); +} + +/* + * Take the current calculated address and shift enough bits in the middle + * to make a gap where the interleave bits will be inserted. 
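+ *
+ * For example, NOHASH_4CHAN with intlv_bit_pos = 8 opens a single
+ * 2-bit gap at bit 8, while DF3_COD2_4CHAN_HASH opens one bit at
+ * bit 8 and one at bit 12.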
+ */ +static u64 make_space_for_coh_st_id(struct addr_ctx *ctx) +{ + switch (ctx->map.intlv_mode) { + case NOHASH_2CHAN: + case NOHASH_4CHAN: + case NOHASH_8CHAN: + case NOHASH_16CHAN: + case NOHASH_32CHAN: + case DF2_2CHAN_HASH: + return make_space_for_coh_st_id_at_intlv_bit(ctx); + + case DF3_COD4_2CHAN_HASH: + case DF3_COD2_4CHAN_HASH: + case DF3_COD1_8CHAN_HASH: + case DF4_NPS4_2CHAN_HASH: + case DF4_NPS2_4CHAN_HASH: + case DF4_NPS1_8CHAN_HASH: + case DF4p5_NPS4_2CHAN_1K_HASH: + case DF4p5_NPS4_2CHAN_2K_HASH: + case DF4p5_NPS2_4CHAN_2K_HASH: + case DF4p5_NPS1_8CHAN_2K_HASH: + case DF4p5_NPS1_16CHAN_2K_HASH: + return make_space_for_coh_st_id_split_2_1(ctx); + default: + atl_debug_on_bad_intlv_mode(ctx); + return ~0ULL; + } +} + +static u16 get_coh_st_id_df2(struct addr_ctx *ctx) +{ + u8 num_socket_intlv_bits = ilog2(ctx->map.num_intlv_sockets); + u8 num_die_intlv_bits = ilog2(ctx->map.num_intlv_dies); + u8 num_intlv_bits; + u16 coh_st_id, mask; + + coh_st_id = ctx->coh_st_fabric_id - get_dst_fabric_id(ctx); + + /* Channel interleave bits */ + num_intlv_bits = order_base_2(ctx->map.num_intlv_chan); + mask = GENMASK(num_intlv_bits - 1, 0); + coh_st_id &= mask; + + /* Die interleave bits */ + if (num_die_intlv_bits) { + u16 die_bits; + + mask = GENMASK(num_die_intlv_bits - 1, 0); + die_bits = ctx->coh_st_fabric_id & df_cfg.die_id_mask; + die_bits >>= df_cfg.die_id_shift; + + coh_st_id |= (die_bits & mask) << num_intlv_bits; + num_intlv_bits += num_die_intlv_bits; + } + + /* Socket interleave bits */ + if (num_socket_intlv_bits) { + u16 socket_bits; + + mask = GENMASK(num_socket_intlv_bits - 1, 0); + socket_bits = ctx->coh_st_fabric_id & df_cfg.socket_id_mask; + socket_bits >>= df_cfg.socket_id_shift; + + coh_st_id |= (socket_bits & mask) << num_intlv_bits; + } + + return coh_st_id; +} + +static u16 get_coh_st_id_df4(struct addr_ctx *ctx) +{ + /* + * Start with the original component mask and the number of interleave + * bits for the channels in this map. + */ + u8 num_intlv_bits = ilog2(ctx->map.num_intlv_chan); + u16 mask = df_cfg.component_id_mask; + + u16 socket_bits; + + /* Set the derived Coherent Station ID to the input Coherent Station Fabric ID. */ + u16 coh_st_id = ctx->coh_st_fabric_id & mask; + + /* + * Subtract the "base" Destination Fabric ID. + * This accounts for systems with disabled Coherent Stations. + */ + coh_st_id -= get_dst_fabric_id(ctx) & mask; + + /* + * Generate and use a new mask based on the number of bits + * needed for channel interleaving in this map. + */ + mask = GENMASK(num_intlv_bits - 1, 0); + coh_st_id &= mask; + + /* Done if socket interleaving is not enabled. */ + if (ctx->map.num_intlv_sockets <= 1) + return coh_st_id; + + /* + * Figure out how many bits are needed for the number of + * interleaved sockets. And shift the derived Coherent Station ID to account + * for these. + */ + num_intlv_bits = ilog2(ctx->map.num_intlv_sockets); + coh_st_id <<= num_intlv_bits; + + /* Generate a new mask for the socket interleaving bits. */ + mask = GENMASK(num_intlv_bits - 1, 0); + + /* Get the socket interleave bits from the original Coherent Station Fabric ID. */ + socket_bits = (ctx->coh_st_fabric_id & df_cfg.socket_id_mask) >> df_cfg.socket_id_shift; + + /* Apply the appropriate socket bits to the derived Coherent Station ID. */ + coh_st_id |= socket_bits & mask; + + return coh_st_id; +} + +/* + * Derive the correct Coherent Station ID that represents the interleave bits + * used within the system physical address. 
This accounts for the + * interleave mode, number of interleaved channels/dies/sockets, and + * other system/mode-specific bit swizzling. + * + * Returns: Coherent Station ID on success. + * All bits set on error. + */ +static u16 calculate_coh_st_id(struct addr_ctx *ctx) +{ + switch (ctx->map.intlv_mode) { + case NOHASH_2CHAN: + case NOHASH_4CHAN: + case NOHASH_8CHAN: + case NOHASH_16CHAN: + case NOHASH_32CHAN: + case DF3_COD4_2CHAN_HASH: + case DF3_COD2_4CHAN_HASH: + case DF3_COD1_8CHAN_HASH: + case DF2_2CHAN_HASH: + return get_coh_st_id_df2(ctx); + + case DF4_NPS4_2CHAN_HASH: + case DF4_NPS2_4CHAN_HASH: + case DF4_NPS1_8CHAN_HASH: + case DF4p5_NPS4_2CHAN_1K_HASH: + case DF4p5_NPS4_2CHAN_2K_HASH: + case DF4p5_NPS2_4CHAN_2K_HASH: + case DF4p5_NPS1_8CHAN_2K_HASH: + case DF4p5_NPS1_16CHAN_2K_HASH: + return get_coh_st_id_df4(ctx); + + /* COH_ST ID is simply the COH_ST Fabric ID adjusted by the Destination Fabric ID. */ + case DF4p5_NPS2_4CHAN_1K_HASH: + case DF4p5_NPS1_8CHAN_1K_HASH: + case DF4p5_NPS1_16CHAN_1K_HASH: + return ctx->coh_st_fabric_id - get_dst_fabric_id(ctx); + + default: + atl_debug_on_bad_intlv_mode(ctx); + return ~0; + } +} + +static u64 insert_coh_st_id_at_intlv_bit(struct addr_ctx *ctx, u64 denorm_addr, u16 coh_st_id) +{ + return denorm_addr | (coh_st_id << ctx->map.intlv_bit_pos); +} + +static u64 insert_coh_st_id_split_2_1(struct addr_ctx *ctx, u64 denorm_addr, u16 coh_st_id) +{ + /* Insert coh_st_id[0] at the interleave bit. */ + denorm_addr |= (coh_st_id & BIT(0)) << ctx->map.intlv_bit_pos; + + /* Insert coh_st_id[2:1] at bit 12. */ + denorm_addr |= (coh_st_id & GENMASK(2, 1)) << 11; + + return denorm_addr; +} + +static u64 insert_coh_st_id_split_2_2(struct addr_ctx *ctx, u64 denorm_addr, u16 coh_st_id) +{ + /* Insert coh_st_id[1:0] at bit 8. */ + denorm_addr |= (coh_st_id & GENMASK(1, 0)) << 8; + + /* + * Insert coh_st_id[n:2] at bit 12. 'n' could be 2 or 3. + * Grab both because bit 3 will be clear if unused. + */ + denorm_addr |= (coh_st_id & GENMASK(3, 2)) << 10; + + return denorm_addr; +} + +static u64 insert_coh_st_id(struct addr_ctx *ctx, u64 denorm_addr, u16 coh_st_id) +{ + switch (ctx->map.intlv_mode) { + case NOHASH_2CHAN: + case NOHASH_4CHAN: + case NOHASH_8CHAN: + case NOHASH_16CHAN: + case NOHASH_32CHAN: + case DF2_2CHAN_HASH: + return insert_coh_st_id_at_intlv_bit(ctx, denorm_addr, coh_st_id); + + case DF3_COD4_2CHAN_HASH: + case DF3_COD2_4CHAN_HASH: + case DF3_COD1_8CHAN_HASH: + case DF4_NPS4_2CHAN_HASH: + case DF4_NPS2_4CHAN_HASH: + case DF4_NPS1_8CHAN_HASH: + case DF4p5_NPS4_2CHAN_1K_HASH: + case DF4p5_NPS4_2CHAN_2K_HASH: + case DF4p5_NPS2_4CHAN_2K_HASH: + case DF4p5_NPS1_8CHAN_2K_HASH: + case DF4p5_NPS1_16CHAN_2K_HASH: + return insert_coh_st_id_split_2_1(ctx, denorm_addr, coh_st_id); + + case DF4p5_NPS2_4CHAN_1K_HASH: + case DF4p5_NPS1_8CHAN_1K_HASH: + case DF4p5_NPS1_16CHAN_1K_HASH: + return insert_coh_st_id_split_2_2(ctx, denorm_addr, coh_st_id); + + default: + atl_debug_on_bad_intlv_mode(ctx); + return ~0ULL; + } +} + +static u16 get_logical_coh_st_fabric_id(struct addr_ctx *ctx) +{ + u16 component_id, log_fabric_id; + + /* Start with the physical COH_ST Fabric ID. */ + u16 phys_fabric_id = ctx->coh_st_fabric_id; + + /* Skip logical ID lookup if remapping is disabled. */ + if (!FIELD_GET(DF4_REMAP_EN, ctx->map.ctl) && + ctx->map.intlv_mode != DF3_6CHAN) + return phys_fabric_id; + + /* Mask off the Node ID bits to get the "local" Component ID. 
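+ * E.g. assuming a 4-bit Component ID mask (illustrative; the width is
+ * set per DF revision), Fabric ID 0x15 yields Component ID 0x5.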
*/ + component_id = phys_fabric_id & df_cfg.component_id_mask; + + /* + * Search the list of logical Component IDs for the one that + * matches this physical Component ID. + */ + for (log_fabric_id = 0; log_fabric_id < MAX_COH_ST_CHANNELS; log_fabric_id++) { + if (ctx->map.remap_array[log_fabric_id] == component_id) + break; + } + + if (log_fabric_id == MAX_COH_ST_CHANNELS) + atl_debug(ctx, "COH_ST remap entry not found for 0x%x", + log_fabric_id); + + /* Get the Node ID bits from the physical and apply to the logical. */ + return (phys_fabric_id & df_cfg.node_id_mask) | log_fabric_id; +} + +static int denorm_addr_common(struct addr_ctx *ctx) +{ + u64 denorm_addr; + u16 coh_st_id; + + /* + * Convert the original physical COH_ST Fabric ID to a logical value. + * This is required for non-power-of-two and other interleaving modes. + */ + ctx->coh_st_fabric_id = get_logical_coh_st_fabric_id(ctx); + + denorm_addr = make_space_for_coh_st_id(ctx); + coh_st_id = calculate_coh_st_id(ctx); + ctx->ret_addr = insert_coh_st_id(ctx, denorm_addr, coh_st_id); + return 0; +} + +static int denorm_addr_df3_6chan(struct addr_ctx *ctx) +{ + u16 coh_st_id = ctx->coh_st_fabric_id & df_cfg.component_id_mask; + u8 total_intlv_bits = ctx->map.total_intlv_bits; + u8 low_bit, intlv_bit = ctx->map.intlv_bit_pos; + u64 msb_intlv_bits, temp_addr_a, temp_addr_b; + u8 np2_bits = ctx->map.np2_bits; + + if (ctx->map.intlv_mode != DF3_6CHAN) + return -EINVAL; + + /* + * 'np2_bits' holds the number of bits needed to cover the + * amount of memory (rounded up) in this map using 64K chunks. + * + * Example: + * Total memory in map: 6GB + * Rounded up to next power-of-2: 8GB + * Number of 64K chunks: 0x20000 + * np2_bits = log2(# of chunks): 17 + * + * Get the two most-significant interleave bits from the + * input address based on the following: + * + * [15 + np2_bits - total_intlv_bits : 14 + np2_bits - total_intlv_bits] + */ + low_bit = 14 + np2_bits - total_intlv_bits; + msb_intlv_bits = ctx->ret_addr >> low_bit; + msb_intlv_bits &= 0x3; + + /* + * If MSB are 11b, then logical COH_ST ID is 6 or 7. + * Need to adjust based on the mod3 result. + */ + if (msb_intlv_bits == 3) { + u8 addr_mod, phys_addr_msb, msb_coh_st_id; + + /* Get the remaining interleave bits from the input address. */ + temp_addr_b = GENMASK_ULL(low_bit - 1, intlv_bit) & ctx->ret_addr; + temp_addr_b >>= intlv_bit; + + /* Calculate the logical COH_ST offset based on mod3. */ + addr_mod = temp_addr_b % 3; + + /* Get COH_ST ID bits [2:1]. */ + msb_coh_st_id = (coh_st_id >> 1) & 0x3; + + /* Get the bit that starts the physical address bits. */ + phys_addr_msb = (intlv_bit + np2_bits + 1); + phys_addr_msb &= BIT(0); + phys_addr_msb++; + phys_addr_msb *= 3 - addr_mod + msb_coh_st_id; + phys_addr_msb %= 3; + + /* Move the physical address MSB to the correct place. 
*/ + temp_addr_b |= phys_addr_msb << (low_bit - total_intlv_bits - intlv_bit); + + /* Generate a new COH_ST ID as follows: coh_st_id = [1, 1, coh_st_id[0]] */ + coh_st_id &= BIT(0); + coh_st_id |= GENMASK(2, 1); + } else { + temp_addr_b = GENMASK_ULL(63, intlv_bit) & ctx->ret_addr; + temp_addr_b >>= intlv_bit; + } + + temp_addr_a = GENMASK_ULL(intlv_bit - 1, 0) & ctx->ret_addr; + temp_addr_b <<= intlv_bit + total_intlv_bits; + + ctx->ret_addr = temp_addr_a | temp_addr_b; + ctx->ret_addr |= coh_st_id << intlv_bit; + return 0; +} + +static int denorm_addr_df4_np2(struct addr_ctx *ctx) +{ + bool hash_ctl_64k, hash_ctl_2M, hash_ctl_1G; + u16 group, group_offset, log_coh_st_offset; + unsigned int mod_value, shift_value; + u16 mask = df_cfg.component_id_mask; + u64 temp_addr_a, temp_addr_b; + u8 hash_pa8, hashed_bit; + + switch (ctx->map.intlv_mode) { + case DF4_NPS4_3CHAN_HASH: + mod_value = 3; + shift_value = 13; + break; + case DF4_NPS2_6CHAN_HASH: + mod_value = 3; + shift_value = 12; + break; + case DF4_NPS1_12CHAN_HASH: + mod_value = 3; + shift_value = 11; + break; + case DF4_NPS2_5CHAN_HASH: + mod_value = 5; + shift_value = 13; + break; + case DF4_NPS1_10CHAN_HASH: + mod_value = 5; + shift_value = 12; + break; + default: + atl_debug_on_bad_intlv_mode(ctx); + return -EINVAL; + }; + + if (ctx->map.num_intlv_sockets == 1) { + hash_pa8 = BIT_ULL(shift_value) & ctx->ret_addr; + temp_addr_a = remove_bits(shift_value, shift_value, ctx->ret_addr); + } else { + hash_pa8 = (ctx->coh_st_fabric_id & df_cfg.socket_id_mask); + hash_pa8 >>= df_cfg.socket_id_shift; + temp_addr_a = ctx->ret_addr; + } + + /* Make a gap for the real bit [8]. */ + temp_addr_a = expand_bits(8, 1, temp_addr_a); + + /* Make an additional gap for bits [13:12], as appropriate.*/ + if (ctx->map.intlv_mode == DF4_NPS2_6CHAN_HASH || + ctx->map.intlv_mode == DF4_NPS1_10CHAN_HASH) { + temp_addr_a = expand_bits(13, 1, temp_addr_a); + } else if (ctx->map.intlv_mode == DF4_NPS1_12CHAN_HASH) { + temp_addr_a = expand_bits(12, 2, temp_addr_a); + } + + /* Keep bits [13:0]. */ + temp_addr_a &= GENMASK_ULL(13, 0); + + /* Get the appropriate high bits. */ + shift_value += 1 - ilog2(ctx->map.num_intlv_sockets); + temp_addr_b = GENMASK_ULL(63, shift_value) & ctx->ret_addr; + temp_addr_b >>= shift_value; + temp_addr_b *= mod_value; + + /* + * Coherent Stations are divided into groups. + * + * Multiples of 3 (mod3) are divided into quadrants. + * e.g. NP4_3CHAN -> [0, 1, 2] [6, 7, 8] + * [3, 4, 5] [9, 10, 11] + * + * Multiples of 5 (mod5) are divided into sides. + * e.g. NP2_5CHAN -> [0, 1, 2, 3, 4] [5, 6, 7, 8, 9] + */ + + /* + * Calculate the logical offset for the COH_ST within its DRAM Address map. + * e.g. if map includes [5, 6, 7, 8, 9] and target instance is '8', then + * log_coh_st_offset = 8 - 5 = 3 + */ + log_coh_st_offset = (ctx->coh_st_fabric_id & mask) - (get_dst_fabric_id(ctx) & mask); + + /* + * Figure out the group number. + * + * Following above example, + * log_coh_st_offset = 3 + * mod_value = 5 + * group = 3 / 5 = 0 + */ + group = log_coh_st_offset / mod_value; + + /* + * Figure out the offset within the group. + * + * Following above example, + * log_coh_st_offset = 3 + * mod_value = 5 + * group_offset = 3 % 5 = 3 + */ + group_offset = log_coh_st_offset % mod_value; + + /* Adjust group_offset if the hashed bit [8] is set. */ + if (hash_pa8) { + if (!group_offset) + group_offset = mod_value - 1; + else + group_offset--; + } + + /* Add in the group offset to the high bits. 
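+ * Following above example, group_offset = 3 is added after the mod5
+ * scaling of the high bits (assuming the hashed bit [8] is clear).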
*/ + temp_addr_b += group_offset; + + /* Shift the high bits to the proper starting position. */ + temp_addr_b <<= 14; + + /* Combine the high and low bits together. */ + ctx->ret_addr = temp_addr_a | temp_addr_b; + + /* Account for hashing here instead of in dehash_address(). */ + hash_ctl_64k = FIELD_GET(DF4_HASH_CTL_64K, ctx->map.ctl); + hash_ctl_2M = FIELD_GET(DF4_HASH_CTL_2M, ctx->map.ctl); + hash_ctl_1G = FIELD_GET(DF4_HASH_CTL_1G, ctx->map.ctl); + + hashed_bit = !!hash_pa8; + hashed_bit ^= FIELD_GET(BIT_ULL(14), ctx->ret_addr); + hashed_bit ^= FIELD_GET(BIT_ULL(16), ctx->ret_addr) & hash_ctl_64k; + hashed_bit ^= FIELD_GET(BIT_ULL(21), ctx->ret_addr) & hash_ctl_2M; + hashed_bit ^= FIELD_GET(BIT_ULL(30), ctx->ret_addr) & hash_ctl_1G; + + ctx->ret_addr |= hashed_bit << 8; + + /* Done for 3 and 5 channel. */ + if (ctx->map.intlv_mode == DF4_NPS4_3CHAN_HASH || + ctx->map.intlv_mode == DF4_NPS2_5CHAN_HASH) + return 0; + + /* Select the proper 'group' bit to use for Bit 13. */ + if (ctx->map.intlv_mode == DF4_NPS1_12CHAN_HASH) + hashed_bit = !!(group & BIT(1)); + else + hashed_bit = group & BIT(0); + + hashed_bit ^= FIELD_GET(BIT_ULL(18), ctx->ret_addr) & hash_ctl_64k; + hashed_bit ^= FIELD_GET(BIT_ULL(23), ctx->ret_addr) & hash_ctl_2M; + hashed_bit ^= FIELD_GET(BIT_ULL(32), ctx->ret_addr) & hash_ctl_1G; + + ctx->ret_addr |= hashed_bit << 13; + + /* Done for 6 and 10 channel. */ + if (ctx->map.intlv_mode != DF4_NPS1_12CHAN_HASH) + return 0; + + hashed_bit = group & BIT(0); + hashed_bit ^= FIELD_GET(BIT_ULL(17), ctx->ret_addr) & hash_ctl_64k; + hashed_bit ^= FIELD_GET(BIT_ULL(22), ctx->ret_addr) & hash_ctl_2M; + hashed_bit ^= FIELD_GET(BIT_ULL(31), ctx->ret_addr) & hash_ctl_1G; + + ctx->ret_addr |= hashed_bit << 12; + return 0; +} + +int denormalize_address(struct addr_ctx *ctx) +{ + switch (ctx->map.intlv_mode) { + case NONE: + return 0; + case DF4_NPS4_3CHAN_HASH: + case DF4_NPS2_6CHAN_HASH: + case DF4_NPS1_12CHAN_HASH: + case DF4_NPS2_5CHAN_HASH: + case DF4_NPS1_10CHAN_HASH: + return denorm_addr_df4_np2(ctx); + case DF3_6CHAN: + return denorm_addr_df3_6chan(ctx); + default: + return denorm_addr_common(ctx); + } +} diff --git a/drivers/ras/amd/atl/internal.h b/drivers/ras/amd/atl/internal.h new file mode 100644 index 0000000000000..13f1b6098c968 --- /dev/null +++ b/drivers/ras/amd/atl/internal.h @@ -0,0 +1,297 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * AMD Address Translation Library + * + * internal.h : Helper functions and common defines + * + * Copyright (c) 2023, Advanced Micro Devices, Inc. + * All Rights Reserved. + * + * Author: Yazen Ghannam + */ + +#ifndef __AMD_ATL_INTERNAL_H__ +#define __AMD_ATL_INTERNAL_H__ + +#include +#include +#include + +#include + +#include "reg_fields.h" + +/* Maximum possible number of Coherent Stations within a single Data Fabric. */ +#define MAX_COH_ST_CHANNELS 32 + +/* PCI ID for Zen4 Server DF Function 0. */ +#define DF_FUNC0_ID_ZEN4_SERVER 0x14AD1022 + +/* Shift needed for adjusting register values to true values. */ +#define DF_DRAM_BASE_LIMIT_LSB 28 + +enum df_revisions { + UNKNOWN, + DF2, + DF3, + DF3p5, + DF4, + DF4p5, +}; + +/* These are mapped 1:1 to the hardware values. Special cases are set at > 0x20. 
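+ * E.g. DF3_COD4_2CHAN_HASH keeps its hardware value 0x0C, while DF2's
+ * 2-channel hash (hardware value 8 on DF2) would collide with
+ * NOHASH_32CHAN and is remapped to 0x21 (see df2_get_intlv_mode()).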
*/ +enum intlv_modes { + NONE = 0x00, + NOHASH_2CHAN = 0x01, + NOHASH_4CHAN = 0x03, + NOHASH_8CHAN = 0x05, + DF3_6CHAN = 0x06, + NOHASH_16CHAN = 0x07, + NOHASH_32CHAN = 0x08, + DF3_COD4_2CHAN_HASH = 0x0C, + DF3_COD2_4CHAN_HASH = 0x0D, + DF3_COD1_8CHAN_HASH = 0x0E, + DF4_NPS4_2CHAN_HASH = 0x10, + DF4_NPS2_4CHAN_HASH = 0x11, + DF4_NPS1_8CHAN_HASH = 0x12, + DF4_NPS4_3CHAN_HASH = 0x13, + DF4_NPS2_6CHAN_HASH = 0x14, + DF4_NPS1_12CHAN_HASH = 0x15, + DF4_NPS2_5CHAN_HASH = 0x16, + DF4_NPS1_10CHAN_HASH = 0x17, + DF2_2CHAN_HASH = 0x21, + /* DF4.5 modes are all IntLvNumChan + 0x20 */ + DF4p5_NPS1_16CHAN_1K_HASH = 0x2C, + DF4p5_NPS0_24CHAN_1K_HASH = 0x2E, + DF4p5_NPS4_2CHAN_1K_HASH = 0x30, + DF4p5_NPS2_4CHAN_1K_HASH = 0x31, + DF4p5_NPS1_8CHAN_1K_HASH = 0x32, + DF4p5_NPS4_3CHAN_1K_HASH = 0x33, + DF4p5_NPS2_6CHAN_1K_HASH = 0x34, + DF4p5_NPS1_12CHAN_1K_HASH = 0x35, + DF4p5_NPS2_5CHAN_1K_HASH = 0x36, + DF4p5_NPS1_10CHAN_1K_HASH = 0x37, + DF4p5_NPS4_2CHAN_2K_HASH = 0x40, + DF4p5_NPS2_4CHAN_2K_HASH = 0x41, + DF4p5_NPS1_8CHAN_2K_HASH = 0x42, + DF4p5_NPS1_16CHAN_2K_HASH = 0x43, + DF4p5_NPS4_3CHAN_2K_HASH = 0x44, + DF4p5_NPS2_6CHAN_2K_HASH = 0x45, + DF4p5_NPS1_12CHAN_2K_HASH = 0x46, + DF4p5_NPS0_24CHAN_2K_HASH = 0x47, + DF4p5_NPS2_5CHAN_2K_HASH = 0x48, + DF4p5_NPS1_10CHAN_2K_HASH = 0x49, +}; + +struct df_flags { + __u8 legacy_ficaa : 1, + socket_id_shift_quirk : 1, + __reserved_0 : 6; +}; + +struct df_config { + enum df_revisions rev; + + /* + * These masks operate on the 16-bit Coherent Station IDs, + * e.g. Instance, Fabric, Destination, etc. + */ + u16 component_id_mask; + u16 die_id_mask; + u16 node_id_mask; + u16 socket_id_mask; + + /* + * Least-significant bit of Node ID portion of the + * system-wide Coherent Station Fabric ID. + */ + u8 node_id_shift; + + /* + * Least-significant bit of Die portion of the Node ID. + * Adjusted to include the Node ID shift in order to apply + * to the Coherent Station Fabric ID. + */ + u8 die_id_shift; + + /* + * Least-significant bit of Socket portion of the Node ID. + * Adjusted to include the Node ID shift in order to apply + * to the Coherent Station Fabric ID. + */ + u8 socket_id_shift; + + /* Number of DRAM Address maps visible in a Coherent Station. */ + u8 num_coh_st_maps; + + /* Global flags to handle special cases. */ + struct df_flags flags; +}; + +extern struct df_config df_cfg; + +struct dram_addr_map { + /* + * Each DRAM Address Map can operate independently + * in different interleaving modes. + */ + enum intlv_modes intlv_mode; + + /* System-wide number for this address map. */ + u8 num; + + /* Raw register values */ + u32 base; + u32 limit; + u32 ctl; + u32 intlv; + + /* + * Logical to Physical Coherent Station Remapping array + * + * Index: Logical Coherent Station Instance ID + * Value: Physical Coherent Station Instance ID + * + * phys_coh_st_inst_id = remap_array[log_coh_st_inst_id] + */ + u8 remap_array[MAX_COH_ST_CHANNELS]; + + /* + * Number of bits covering DRAM Address map 0 + * when interleaving is non-power-of-2. + * + * Used only for DF3_6CHAN. + */ + u8 np2_bits; + + /* Position of the 'interleave bit'. */ + u8 intlv_bit_pos; + /* Number of channels interleaved in this map. */ + u8 num_intlv_chan; + /* Number of dies interleaved in this map. */ + u8 num_intlv_dies; + /* Number of sockets interleaved in this map. */ + u8 num_intlv_sockets; + /* + * Total number of channels interleaved accounting + * for die and socket interleaving. + */ + u8 total_intlv_chan; + /* Total bits needed to cover 'total_intlv_chan'. 
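+ * E.g. 6 total interleaved channels need 3 bits (rounded up to the
+ * next power of two).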
*/ + u8 total_intlv_bits; +}; + +/* Original input values cached for debug printing. */ +struct addr_ctx_inputs { + u64 norm_addr; + u8 socket_id; + u8 die_id; + u8 coh_st_inst_id; +}; + +struct addr_ctx { + u64 ret_addr; + + struct addr_ctx_inputs inputs; + struct dram_addr_map map; + + /* AMD Node ID calculated from Socket and Die IDs. */ + u8 node_id; + + /* + * Coherent Station Instance ID + * Local ID used within a 'node'. + */ + u16 inst_id; + + /* + * Coherent Station Fabric ID + * System-wide ID that includes 'node' bits. + */ + u16 coh_st_fabric_id; +}; + +int df_indirect_read_instance(u16 node, u8 func, u16 reg, u8 instance_id, u32 *lo); +int df_indirect_read_broadcast(u16 node, u8 func, u16 reg, u32 *lo); + +int get_df_system_info(void); +int determine_node_id(struct addr_ctx *ctx, u8 socket_num, u8 die_num); + +int get_address_map(struct addr_ctx *ctx); + +int denormalize_address(struct addr_ctx *ctx); +int dehash_address(struct addr_ctx *ctx); + +unsigned long norm_to_sys_addr(u8 socket_id, u8 die_id, u8 coh_st_inst_id, unsigned long addr); +unsigned long convert_umc_mca_addr_to_sys_addr(struct atl_err *err); + +/* + * Make a gap in @data that is @num_bits long starting at @bit_num. + * e.g. data = 11111111'b + * bit_num = 3 + * num_bits = 2 + * result = 1111100111'b + */ +static inline u64 expand_bits(u8 bit_num, u8 num_bits, u64 data) +{ + u64 temp1, temp2; + + if (!num_bits) + return data; + + if (!bit_num) { + WARN_ON_ONCE(num_bits >= BITS_PER_LONG); + return data << num_bits; + } + + WARN_ON_ONCE(bit_num >= BITS_PER_LONG); + + temp1 = data & GENMASK_ULL(bit_num - 1, 0); + + temp2 = data & GENMASK_ULL(63, bit_num); + temp2 <<= num_bits; + + return temp1 | temp2; +} + +/* + * Remove bits in @data between @low_bit and @high_bit inclusive. + * e.g. data = XXXYYZZZ'b + * low_bit = 3 + * high_bit = 4 + * result = XXXZZZ'b + */ +static inline u64 remove_bits(u8 low_bit, u8 high_bit, u64 data) +{ + u64 temp1, temp2; + + WARN_ON_ONCE(high_bit >= BITS_PER_LONG); + WARN_ON_ONCE(low_bit >= BITS_PER_LONG); + WARN_ON_ONCE(low_bit > high_bit); + + /* Removing bits [high_bit:0] leaves data >> (high_bit + 1). */ + if (!low_bit) + return data >> (high_bit + 1); + + temp1 = GENMASK_ULL(low_bit - 1, 0) & data; + temp2 = GENMASK_ULL(63, high_bit + 1) & data; + temp2 >>= high_bit - low_bit + 1; + + return temp1 | temp2; +} + +#define atl_debug(ctx, fmt, arg...) \ + pr_debug("socket_id=%u die_id=%u coh_st_inst_id=%u norm_addr=0x%016llx: " fmt,\ + (ctx)->inputs.socket_id, (ctx)->inputs.die_id,\ + (ctx)->inputs.coh_st_inst_id, (ctx)->inputs.norm_addr, ##arg) + +static inline void atl_debug_on_bad_df_rev(void) +{ + pr_debug("Unrecognized DF rev: %u", df_cfg.rev); +} + +static inline void atl_debug_on_bad_intlv_mode(struct addr_ctx *ctx) +{ + atl_debug(ctx, "Unrecognized interleave mode: %u", ctx->map.intlv_mode); +} + +#endif /* __AMD_ATL_INTERNAL_H__ */ diff --git a/drivers/ras/amd/atl/map.c b/drivers/ras/amd/atl/map.c new file mode 100644 index 0000000000000..33f549b6255a4 --- /dev/null +++ b/drivers/ras/amd/atl/map.c @@ -0,0 +1,665 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * AMD Address Translation Library + * + * map.c : Functions to read and decode DRAM address maps + * + * Copyright (c) 2023, Advanced Micro Devices, Inc. + * All Rights Reserved.
+ * + * Author: Yazen Ghannam + */ + +#include "internal.h" + +static int df2_get_intlv_mode(struct addr_ctx *ctx) +{ + ctx->map.intlv_mode = FIELD_GET(DF2_INTLV_NUM_CHAN, ctx->map.base); + + if (ctx->map.intlv_mode == 8) + ctx->map.intlv_mode = DF2_2CHAN_HASH; + + if (ctx->map.intlv_mode != NONE && + ctx->map.intlv_mode != NOHASH_2CHAN && + ctx->map.intlv_mode != DF2_2CHAN_HASH) + return -EINVAL; + + return 0; +} + +static int df3_get_intlv_mode(struct addr_ctx *ctx) +{ + ctx->map.intlv_mode = FIELD_GET(DF3_INTLV_NUM_CHAN, ctx->map.base); + return 0; +} + +static int df3p5_get_intlv_mode(struct addr_ctx *ctx) +{ + ctx->map.intlv_mode = FIELD_GET(DF3p5_INTLV_NUM_CHAN, ctx->map.base); + + if (ctx->map.intlv_mode == DF3_6CHAN) + return -EINVAL; + + return 0; +} + +static int df4_get_intlv_mode(struct addr_ctx *ctx) +{ + ctx->map.intlv_mode = FIELD_GET(DF4_INTLV_NUM_CHAN, ctx->map.intlv); + + if (ctx->map.intlv_mode == DF3_COD4_2CHAN_HASH || + ctx->map.intlv_mode == DF3_COD2_4CHAN_HASH || + ctx->map.intlv_mode == DF3_COD1_8CHAN_HASH || + ctx->map.intlv_mode == DF3_6CHAN) + return -EINVAL; + + return 0; +} + +static int df4p5_get_intlv_mode(struct addr_ctx *ctx) +{ + ctx->map.intlv_mode = FIELD_GET(DF4p5_INTLV_NUM_CHAN, ctx->map.intlv); + + if (ctx->map.intlv_mode <= NOHASH_32CHAN) + return 0; + + /* + * Modes matching the ranges above are returned as-is. + * + * All other modes are "fixed up" by adding 20h to make a unique value. + */ + ctx->map.intlv_mode += 0x20; + + return 0; +} + +static int get_intlv_mode(struct addr_ctx *ctx) +{ + int ret; + + switch (df_cfg.rev) { + case DF2: + ret = df2_get_intlv_mode(ctx); + break; + case DF3: + ret = df3_get_intlv_mode(ctx); + break; + case DF3p5: + ret = df3p5_get_intlv_mode(ctx); + break; + case DF4: + ret = df4_get_intlv_mode(ctx); + break; + case DF4p5: + ret = df4p5_get_intlv_mode(ctx); + break; + default: + ret = -EINVAL; + } + + if (ret) + atl_debug_on_bad_df_rev(); + + return ret; +} + +static u64 get_hi_addr_offset(u32 reg_dram_offset) +{ + u8 shift = DF_DRAM_BASE_LIMIT_LSB; + u64 hi_addr_offset; + + switch (df_cfg.rev) { + case DF2: + hi_addr_offset = FIELD_GET(DF2_HI_ADDR_OFFSET, reg_dram_offset); + break; + case DF3: + case DF3p5: + hi_addr_offset = FIELD_GET(DF3_HI_ADDR_OFFSET, reg_dram_offset); + break; + case DF4: + case DF4p5: + hi_addr_offset = FIELD_GET(DF4_HI_ADDR_OFFSET, reg_dram_offset); + break; + default: + hi_addr_offset = 0; + atl_debug_on_bad_df_rev(); + } + + return hi_addr_offset << shift; +} + +/* + * Returns: 0 if offset is disabled. + * 1 if offset is enabled. + * -EINVAL on error. + */ +static int get_dram_offset(struct addr_ctx *ctx, u64 *norm_offset) +{ + u32 reg_dram_offset; + u8 map_num; + + /* Should not be called for map 0. */ + if (!ctx->map.num) { + atl_debug(ctx, "Trying to find DRAM offset for map 0"); + return -EINVAL; + } + + /* + * DramOffset registers don't exist for map 0, so the base register + * actually refers to map 1. + * Adjust the map_num for the register offsets. 
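+ * E.g. on DF4 and later the offset for map 2 is read from D18F7x144
+ * (0x140 + 4 * (2 - 1)).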
+ */ + map_num = ctx->map.num - 1; + + if (df_cfg.rev >= DF4) { + /* Read D18F7x140 (DramOffset) */ + if (df_indirect_read_instance(ctx->node_id, 7, 0x140 + (4 * map_num), + ctx->inst_id, ®_dram_offset)) + return -EINVAL; + + } else { + /* Read D18F0x1B4 (DramOffset) */ + if (df_indirect_read_instance(ctx->node_id, 0, 0x1B4 + (4 * map_num), + ctx->inst_id, ®_dram_offset)) + return -EINVAL; + } + + if (!FIELD_GET(DF_HI_ADDR_OFFSET_EN, reg_dram_offset)) + return 0; + + *norm_offset = get_hi_addr_offset(reg_dram_offset); + + return 1; +} + +static int df3_6ch_get_dram_addr_map(struct addr_ctx *ctx) +{ + u16 dst_fabric_id = FIELD_GET(DF3_DST_FABRIC_ID, ctx->map.limit); + u8 i, j, shift = 4, mask = 0xF; + u32 reg, offset = 0x60; + u16 dst_node_id; + + /* Get Socket 1 register. */ + if (dst_fabric_id & df_cfg.socket_id_mask) + offset = 0x68; + + /* Read D18F0x06{0,8} (DF::Skt0CsTargetRemap0)/(DF::Skt0CsTargetRemap1) */ + if (df_indirect_read_broadcast(ctx->node_id, 0, offset, ®)) + return -EINVAL; + + /* Save 8 remap entries. */ + for (i = 0, j = 0; i < 8; i++, j++) + ctx->map.remap_array[i] = (reg >> (j * shift)) & mask; + + dst_node_id = dst_fabric_id & df_cfg.node_id_mask; + dst_node_id >>= df_cfg.node_id_shift; + + /* Read D18F2x090 (DF::Np2ChannelConfig) */ + if (df_indirect_read_broadcast(dst_node_id, 2, 0x90, ®)) + return -EINVAL; + + ctx->map.np2_bits = FIELD_GET(DF_LOG2_ADDR_64K_SPACE0, reg); + return 0; +} + +static int df2_get_dram_addr_map(struct addr_ctx *ctx) +{ + /* Read D18F0x110 (DramBaseAddress). */ + if (df_indirect_read_instance(ctx->node_id, 0, 0x110 + (8 * ctx->map.num), + ctx->inst_id, &ctx->map.base)) + return -EINVAL; + + /* Read D18F0x114 (DramLimitAddress). */ + if (df_indirect_read_instance(ctx->node_id, 0, 0x114 + (8 * ctx->map.num), + ctx->inst_id, &ctx->map.limit)) + return -EINVAL; + + return 0; +} + +static int df3_get_dram_addr_map(struct addr_ctx *ctx) +{ + if (df2_get_dram_addr_map(ctx)) + return -EINVAL; + + /* Read D18F0x3F8 (DfGlobalCtl). */ + if (df_indirect_read_instance(ctx->node_id, 0, 0x3F8, + ctx->inst_id, &ctx->map.ctl)) + return -EINVAL; + + return 0; +} + +static int df4_get_dram_addr_map(struct addr_ctx *ctx) +{ + u8 remap_sel, i, j, shift = 4, mask = 0xF; + u32 remap_reg; + + /* Read D18F7xE00 (DramBaseAddress). */ + if (df_indirect_read_instance(ctx->node_id, 7, 0xE00 + (16 * ctx->map.num), + ctx->inst_id, &ctx->map.base)) + return -EINVAL; + + /* Read D18F7xE04 (DramLimitAddress). */ + if (df_indirect_read_instance(ctx->node_id, 7, 0xE04 + (16 * ctx->map.num), + ctx->inst_id, &ctx->map.limit)) + return -EINVAL; + + /* Read D18F7xE08 (DramAddressCtl). */ + if (df_indirect_read_instance(ctx->node_id, 7, 0xE08 + (16 * ctx->map.num), + ctx->inst_id, &ctx->map.ctl)) + return -EINVAL; + + /* Read D18F7xE0C (DramAddressIntlv). */ + if (df_indirect_read_instance(ctx->node_id, 7, 0xE0C + (16 * ctx->map.num), + ctx->inst_id, &ctx->map.intlv)) + return -EINVAL; + + /* Check if Remap Enable bit is valid. */ + if (!FIELD_GET(DF4_REMAP_EN, ctx->map.ctl)) + return 0; + + /* Fill with bogus values, because '0' is a valid value. */ + memset(&ctx->map.remap_array, 0xFF, sizeof(ctx->map.remap_array)); + + /* Get Remap registers. */ + remap_sel = FIELD_GET(DF4_REMAP_SEL, ctx->map.ctl); + + /* Read D18F7x180 (CsTargetRemap0A). */ + if (df_indirect_read_instance(ctx->node_id, 7, 0x180 + (8 * remap_sel), + ctx->inst_id, &remap_reg)) + return -EINVAL; + + /* Save first 8 remap entries. 
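+ * Each entry is a 4-bit nibble:
+ * remap_array[i] = (remap_reg >> (4 * i)) & 0xF.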
*/ + for (i = 0, j = 0; i < 8; i++, j++) + ctx->map.remap_array[i] = (remap_reg >> (j * shift)) & mask; + + /* Read D18F7x184 (CsTargetRemap0B). */ + if (df_indirect_read_instance(ctx->node_id, 7, 0x184 + (8 * remap_sel), + ctx->inst_id, &remap_reg)) + return -EINVAL; + + /* Save next 8 remap entries. */ + for (i = 8, j = 0; i < 16; i++, j++) + ctx->map.remap_array[i] = (remap_reg >> (j * shift)) & mask; + + return 0; +} + +static int df4p5_get_dram_addr_map(struct addr_ctx *ctx) +{ + u8 remap_sel, i, j, shift = 5, mask = 0x1F; + u32 remap_reg; + + /* Read D18F7x200 (DramBaseAddress). */ + if (df_indirect_read_instance(ctx->node_id, 7, 0x200 + (16 * ctx->map.num), + ctx->inst_id, &ctx->map.base)) + return -EINVAL; + + /* Read D18F7x204 (DramLimitAddress). */ + if (df_indirect_read_instance(ctx->node_id, 7, 0x204 + (16 * ctx->map.num), + ctx->inst_id, &ctx->map.limit)) + return -EINVAL; + + /* Read D18F7x208 (DramAddressCtl). */ + if (df_indirect_read_instance(ctx->node_id, 7, 0x208 + (16 * ctx->map.num), + ctx->inst_id, &ctx->map.ctl)) + return -EINVAL; + + /* Read D18F7x20C (DramAddressIntlv). */ + if (df_indirect_read_instance(ctx->node_id, 7, 0x20C + (16 * ctx->map.num), + ctx->inst_id, &ctx->map.intlv)) + return -EINVAL; + + /* Check if Remap Enable bit is valid. */ + if (!FIELD_GET(DF4_REMAP_EN, ctx->map.ctl)) + return 0; + + /* Fill with bogus values, because '0' is a valid value. */ + memset(&ctx->map.remap_array, 0xFF, sizeof(ctx->map.remap_array)); + + /* Get Remap registers. */ + remap_sel = FIELD_GET(DF4p5_REMAP_SEL, ctx->map.ctl); + + /* Read D18F7x180 (CsTargetRemap0A). */ + if (df_indirect_read_instance(ctx->node_id, 7, 0x180 + (24 * remap_sel), + ctx->inst_id, &remap_reg)) + return -EINVAL; + + /* Save first 6 remap entries. */ + for (i = 0, j = 0; i < 6; i++, j++) + ctx->map.remap_array[i] = (remap_reg >> (j * shift)) & mask; + + /* Read D18F7x184 (CsTargetRemap0B). */ + if (df_indirect_read_instance(ctx->node_id, 7, 0x184 + (24 * remap_sel), + ctx->inst_id, &remap_reg)) + return -EINVAL; + + /* Save next 6 remap entries. */ + for (i = 6, j = 0; i < 12; i++, j++) + ctx->map.remap_array[i] = (remap_reg >> (j * shift)) & mask; + + /* Read D18F7x188 (CsTargetRemap0C). */ + if (df_indirect_read_instance(ctx->node_id, 7, 0x188 + (24 * remap_sel), + ctx->inst_id, &remap_reg)) + return -EINVAL; + + /* Save next 6 remap entries. */ + for (i = 12, j = 0; i < 18; i++, j++) + ctx->map.remap_array[i] = (remap_reg >> (j * shift)) & mask; + + return 0; +} + +static int get_dram_addr_map(struct addr_ctx *ctx) +{ + switch (df_cfg.rev) { + case DF2: return df2_get_dram_addr_map(ctx); + case DF3: + case DF3p5: return df3_get_dram_addr_map(ctx); + case DF4: return df4_get_dram_addr_map(ctx); + case DF4p5: return df4p5_get_dram_addr_map(ctx); + default: + atl_debug_on_bad_df_rev(); + return -EINVAL; + } +} + +static int get_coh_st_fabric_id(struct addr_ctx *ctx) +{ + u32 reg; + + /* Read D18F0x50 (FabricBlockInstanceInformation3). 
*/ + if (df_indirect_read_instance(ctx->node_id, 0, 0x50, ctx->inst_id, ®)) + return -EINVAL; + + if (df_cfg.rev < DF4p5) + ctx->coh_st_fabric_id = FIELD_GET(DF2_COH_ST_FABRIC_ID, reg); + else + ctx->coh_st_fabric_id = FIELD_GET(DF4p5_COH_ST_FABRIC_ID, reg); + + return 0; +} + +static int find_normalized_offset(struct addr_ctx *ctx, u64 *norm_offset) +{ + u64 last_offset = 0; + int ret; + + for (ctx->map.num = 1; ctx->map.num < df_cfg.num_coh_st_maps; ctx->map.num++) { + ret = get_dram_offset(ctx, norm_offset); + if (ret < 0) + return ret; + + /* Continue search if this map's offset is not enabled. */ + if (!ret) + continue; + + /* Enabled offsets should never be 0. */ + if (*norm_offset == 0) { + atl_debug(ctx, "Enabled map %u offset is 0", ctx->map.num); + return -EINVAL; + } + + /* Offsets should always increase from one map to the next. */ + if (*norm_offset <= last_offset) { + atl_debug(ctx, "Map %u offset (0x%016llx) <= previous (0x%016llx)", + ctx->map.num, *norm_offset, last_offset); + return -EINVAL; + } + + /* Match if this map's offset is less than the current calculated address. */ + if (ctx->ret_addr >= *norm_offset) + break; + + last_offset = *norm_offset; + } + + /* + * Finished search without finding a match. + * Reset to map 0 and no offset. + */ + if (ctx->map.num >= df_cfg.num_coh_st_maps) { + ctx->map.num = 0; + *norm_offset = 0; + } + + return 0; +} + +static bool valid_map(struct addr_ctx *ctx) +{ + if (df_cfg.rev >= DF4) + return FIELD_GET(DF_ADDR_RANGE_VAL, ctx->map.ctl); + else + return FIELD_GET(DF_ADDR_RANGE_VAL, ctx->map.base); +} + +static int get_address_map_common(struct addr_ctx *ctx) +{ + u64 norm_offset = 0; + + if (get_coh_st_fabric_id(ctx)) + return -EINVAL; + + if (find_normalized_offset(ctx, &norm_offset)) + return -EINVAL; + + if (get_dram_addr_map(ctx)) + return -EINVAL; + + if (!valid_map(ctx)) + return -EINVAL; + + ctx->ret_addr -= norm_offset; + + return 0; +} + +static u8 get_num_intlv_chan(struct addr_ctx *ctx) +{ + switch (ctx->map.intlv_mode) { + case NONE: + return 1; + case NOHASH_2CHAN: + case DF2_2CHAN_HASH: + case DF3_COD4_2CHAN_HASH: + case DF4_NPS4_2CHAN_HASH: + case DF4p5_NPS4_2CHAN_1K_HASH: + case DF4p5_NPS4_2CHAN_2K_HASH: + return 2; + case DF4_NPS4_3CHAN_HASH: + case DF4p5_NPS4_3CHAN_1K_HASH: + case DF4p5_NPS4_3CHAN_2K_HASH: + return 3; + case NOHASH_4CHAN: + case DF3_COD2_4CHAN_HASH: + case DF4_NPS2_4CHAN_HASH: + case DF4p5_NPS2_4CHAN_1K_HASH: + case DF4p5_NPS2_4CHAN_2K_HASH: + return 4; + case DF4_NPS2_5CHAN_HASH: + case DF4p5_NPS2_5CHAN_1K_HASH: + case DF4p5_NPS2_5CHAN_2K_HASH: + return 5; + case DF3_6CHAN: + case DF4_NPS2_6CHAN_HASH: + case DF4p5_NPS2_6CHAN_1K_HASH: + case DF4p5_NPS2_6CHAN_2K_HASH: + return 6; + case NOHASH_8CHAN: + case DF3_COD1_8CHAN_HASH: + case DF4_NPS1_8CHAN_HASH: + case DF4p5_NPS1_8CHAN_1K_HASH: + case DF4p5_NPS1_8CHAN_2K_HASH: + return 8; + case DF4_NPS1_10CHAN_HASH: + case DF4p5_NPS1_10CHAN_1K_HASH: + case DF4p5_NPS1_10CHAN_2K_HASH: + return 10; + case DF4_NPS1_12CHAN_HASH: + case DF4p5_NPS1_12CHAN_1K_HASH: + case DF4p5_NPS1_12CHAN_2K_HASH: + return 12; + case NOHASH_16CHAN: + case DF4p5_NPS1_16CHAN_1K_HASH: + case DF4p5_NPS1_16CHAN_2K_HASH: + return 16; + case DF4p5_NPS0_24CHAN_1K_HASH: + case DF4p5_NPS0_24CHAN_2K_HASH: + return 24; + case NOHASH_32CHAN: + return 32; + default: + atl_debug_on_bad_intlv_mode(ctx); + return 0; + } +} + +static void calculate_intlv_bits(struct addr_ctx *ctx) +{ + ctx->map.num_intlv_chan = get_num_intlv_chan(ctx); + + ctx->map.total_intlv_chan = ctx->map.num_intlv_chan; + 
ctx->map.total_intlv_chan *= ctx->map.num_intlv_dies; + ctx->map.total_intlv_chan *= ctx->map.num_intlv_sockets; + + /* + * Get the number of bits needed to cover this many channels. + * order_base_2() rounds up automatically. + */ + ctx->map.total_intlv_bits = order_base_2(ctx->map.total_intlv_chan); +} + +static u8 get_intlv_bit_pos(struct addr_ctx *ctx) +{ + u8 addr_sel = 0; + + switch (df_cfg.rev) { + case DF2: + addr_sel = FIELD_GET(DF2_INTLV_ADDR_SEL, ctx->map.base); + break; + case DF3: + case DF3p5: + addr_sel = FIELD_GET(DF3_INTLV_ADDR_SEL, ctx->map.base); + break; + case DF4: + case DF4p5: + addr_sel = FIELD_GET(DF4_INTLV_ADDR_SEL, ctx->map.intlv); + break; + default: + atl_debug_on_bad_df_rev(); + break; + } + + /* Add '8' to get the 'interleave bit position'. */ + return addr_sel + 8; +} + +static u8 get_num_intlv_dies(struct addr_ctx *ctx) +{ + u8 dies = 0; + + switch (df_cfg.rev) { + case DF2: + dies = FIELD_GET(DF2_INTLV_NUM_DIES, ctx->map.limit); + break; + case DF3: + dies = FIELD_GET(DF3_INTLV_NUM_DIES, ctx->map.base); + break; + case DF3p5: + dies = FIELD_GET(DF3p5_INTLV_NUM_DIES, ctx->map.base); + break; + case DF4: + case DF4p5: + dies = FIELD_GET(DF4_INTLV_NUM_DIES, ctx->map.intlv); + break; + default: + atl_debug_on_bad_df_rev(); + break; + } + + /* Register value is log2, e.g. 0 -> 1 die, 1 -> 2 dies, etc. */ + return 1 << dies; +} + +static u8 get_num_intlv_sockets(struct addr_ctx *ctx) +{ + u8 sockets = 0; + + switch (df_cfg.rev) { + case DF2: + sockets = FIELD_GET(DF2_INTLV_NUM_SOCKETS, ctx->map.limit); + break; + case DF3: + case DF3p5: + sockets = FIELD_GET(DF2_INTLV_NUM_SOCKETS, ctx->map.base); + break; + case DF4: + case DF4p5: + sockets = FIELD_GET(DF4_INTLV_NUM_SOCKETS, ctx->map.intlv); + break; + default: + atl_debug_on_bad_df_rev(); + break; + } + + /* Register value is log2, e.g. 0 -> 1 sockets, 1 -> 2 sockets, etc. 
*/ + return 1 << sockets; +} + +static int get_global_map_data(struct addr_ctx *ctx) +{ + if (get_intlv_mode(ctx)) + return -EINVAL; + + if (ctx->map.intlv_mode == DF3_6CHAN && + df3_6ch_get_dram_addr_map(ctx)) + return -EINVAL; + + ctx->map.intlv_bit_pos = get_intlv_bit_pos(ctx); + ctx->map.num_intlv_dies = get_num_intlv_dies(ctx); + ctx->map.num_intlv_sockets = get_num_intlv_sockets(ctx); + calculate_intlv_bits(ctx); + + return 0; +} + +static void dump_address_map(struct dram_addr_map *map) +{ + u8 i; + + pr_debug("intlv_mode=0x%x", map->intlv_mode); + pr_debug("num=0x%x", map->num); + pr_debug("base=0x%x", map->base); + pr_debug("limit=0x%x", map->limit); + pr_debug("ctl=0x%x", map->ctl); + pr_debug("intlv=0x%x", map->intlv); + + for (i = 0; i < MAX_COH_ST_CHANNELS; i++) + pr_debug("remap_array[%u]=0x%x", i, map->remap_array[i]); + + pr_debug("intlv_bit_pos=%u", map->intlv_bit_pos); + pr_debug("num_intlv_chan=%u", map->num_intlv_chan); + pr_debug("num_intlv_dies=%u", map->num_intlv_dies); + pr_debug("num_intlv_sockets=%u", map->num_intlv_sockets); + pr_debug("total_intlv_chan=%u", map->total_intlv_chan); + pr_debug("total_intlv_bits=%u", map->total_intlv_bits); +} + +int get_address_map(struct addr_ctx *ctx) +{ + int ret; + + ret = get_address_map_common(ctx); + if (ret) + return ret; + + ret = get_global_map_data(ctx); + if (ret) + return ret; + + dump_address_map(&ctx->map); + + return ret; +} diff --git a/drivers/ras/amd/atl/reg_fields.h b/drivers/ras/amd/atl/reg_fields.h new file mode 100644 index 0000000000000..6aaa5093f42ce --- /dev/null +++ b/drivers/ras/amd/atl/reg_fields.h @@ -0,0 +1,603 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * AMD Address Translation Library + * + * reg_fields.h : Register field definitions + * + * Copyright (c) 2023, Advanced Micro Devices, Inc. + * All Rights Reserved. + * + * Author: Yazen Ghannam + */ + +/* + * Notes on naming: + * 1) Use "DF_" prefix for fields that are the same for all revisions. + * 2) Use "DFx_" prefix for fields that differ between revisions. + * a) "x" is the first major revision where the new field appears. + * b) E.g., if DF2 and DF3 have the same field, then call it DF2. + * c) E.g., if DF3p5 and DF4 have the same field, then call it DF4. 
+ */ + +/* + * Coherent Station Fabric ID + * + * Access type: Instance + * + * Register + * Rev Fieldname Bits + * + * D18F0x50 [Fabric Block Instance Information 3] + * DF2 BlockFabricId [19:8] + * DF3 BlockFabricId [19:8] + * DF3p5 BlockFabricId [19:8] + * DF4 BlockFabricId [19:8] + * DF4p5 BlockFabricId [15:8] + */ +#define DF2_COH_ST_FABRIC_ID GENMASK(19, 8) +#define DF4p5_COH_ST_FABRIC_ID GENMASK(15, 8) + +/* + * Component ID Mask + * + * Access type: Broadcast + * + * Register + * Rev Fieldname Bits + * + * DF2 N/A + * + * D18F1x208 [System Fabric ID Mask 0] + * DF3 ComponentIdMask [9:0] + * + * D18F1x150 [System Fabric ID Mask 0] + * DF3p5 ComponentIdMask [15:0] + * + * D18F4x1B0 [System Fabric ID Mask 0] + * DF4 ComponentIdMask [15:0] + * DF4p5 ComponentIdMask [15:0] + */ +#define DF3_COMPONENT_ID_MASK GENMASK(9, 0) +#define DF4_COMPONENT_ID_MASK GENMASK(15, 0) + +/* + * Destination Fabric ID + * + * Access type: Instance + * + * Register + * Rev Fieldname Bits + * + * D18F0x114 [DRAM Limit Address] + * DF2 DstFabricID [7:0] + * DF3 DstFabricID [9:0] + * DF3 DstFabricID [11:0] + * + * D18F7xE08 [DRAM Address Control] + * DF4 DstFabricID [27:16] + * + * D18F7x208 [DRAM Address Control] + * DF4p5 DstFabricID [23:16] + */ +#define DF2_DST_FABRIC_ID GENMASK(7, 0) +#define DF3_DST_FABRIC_ID GENMASK(9, 0) +#define DF3p5_DST_FABRIC_ID GENMASK(11, 0) +#define DF4_DST_FABRIC_ID GENMASK(27, 16) +#define DF4p5_DST_FABRIC_ID GENMASK(23, 16) + +/* + * Die ID Mask + * + * Access type: Broadcast + * + * Register + * Rev Fieldname Bits + * + * D18F1x208 [System Fabric ID Mask] + * DF2 DieIdMask [15:8] + * + * D18F1x20C [System Fabric ID Mask 1] + * DF3 DieIdMask [18:16] + * + * D18F1x158 [System Fabric ID Mask 2] + * DF3p5 DieIdMask [15:0] + * + * D18F4x1B8 [System Fabric ID Mask 2] + * DF4 DieIdMask [15:0] + * DF4p5 DieIdMask [15:0] + */ +#define DF2_DIE_ID_MASK GENMASK(15, 8) +#define DF3_DIE_ID_MASK GENMASK(18, 16) +#define DF4_DIE_ID_MASK GENMASK(15, 0) + +/* + * Die ID Shift + * + * Access type: Broadcast + * + * Register + * Rev Fieldname Bits + * + * D18F1x208 [System Fabric ID Mask] + * DF2 DieIdShift [27:24] + * + * DF3 N/A + * DF3p5 N/A + * DF4 N/A + * DF4p5 N/A + */ +#define DF2_DIE_ID_SHIFT GENMASK(27, 24) + +/* + * DRAM Address Range Valid + * + * Access type: Instance + * + * Register + * Rev Fieldname Bits + * + * D18F0x110 [DRAM Base Address] + * DF2 AddrRngVal [0] + * DF3 AddrRngVal [0] + * DF3p5 AddrRngVal [0] + * + * D18F7xE08 [DRAM Address Control] + * DF4 AddrRngVal [0] + * + * D18F7x208 [DRAM Address Control] + * DF4p5 AddrRngVal [0] + */ +#define DF_ADDR_RANGE_VAL BIT(0) + +/* + * DRAM Base Address + * + * Access type: Instance + * + * Register + * Rev Fieldname Bits + * + * D18F0x110 [DRAM Base Address] + * DF2 DramBaseAddr [31:12] + * DF3 DramBaseAddr [31:12] + * DF3p5 DramBaseAddr [31:12] + * + * D18F7xE00 [DRAM Base Address] + * DF4 DramBaseAddr [27:0] + * + * D18F7x200 [DRAM Base Address] + * DF4p5 DramBaseAddr [27:0] + */ +#define DF2_BASE_ADDR GENMASK(31, 12) +#define DF4_BASE_ADDR GENMASK(27, 0) + +/* + * DRAM Hole Base + * + * Access type: Broadcast + * + * Register + * Rev Fieldname Bits + * + * D18F0x104 [DRAM Hole Control] + * DF2 DramHoleBase [31:24] + * DF3 DramHoleBase [31:24] + * DF3p5 DramHoleBase [31:24] + * + * D18F7x104 [DRAM Hole Control] + * DF4 DramHoleBase [31:24] + * DF4p5 DramHoleBase [31:24] + */ +#define DF_DRAM_HOLE_BASE_MASK GENMASK(31, 24) + +/* + * DRAM Limit Address + * + * Access type: Instance + * + * Register + * Rev Fieldname Bits + * + 
* D18F0x114 [DRAM Limit Address] + * DF2 DramLimitAddr [31:12] + * DF3 DramLimitAddr [31:12] + * DF3p5 DramLimitAddr [31:12] + * + * D18F7xE04 [DRAM Limit Address] + * DF4 DramLimitAddr [27:0] + * + * D18F7x204 [DRAM Limit Address] + * DF4p5 DramLimitAddr [27:0] + */ +#define DF2_DRAM_LIMIT_ADDR GENMASK(31, 12) +#define DF4_DRAM_LIMIT_ADDR GENMASK(27, 0) + +/* + * Hash Interleave Controls + * + * Access type: Instance + * + * Register + * Rev Fieldname Bits + * + * DF2 N/A + * + * D18F0x3F8 [DF Global Control] + * DF3 GlbHashIntlvCtl64K [20] + * GlbHashIntlvCtl2M [21] + * GlbHashIntlvCtl1G [22] + * + * DF3p5 GlbHashIntlvCtl64K [20] + * GlbHashIntlvCtl2M [21] + * GlbHashIntlvCtl1G [22] + * + * D18F7xE08 [DRAM Address Control] + * DF4 HashIntlvCtl64K [8] + * HashIntlvCtl2M [9] + * HashIntlvCtl1G [10] + * + * D18F7x208 [DRAM Address Control] + * DF4p5 HashIntlvCtl4K [7] + * HashIntlvCtl64K [8] + * HashIntlvCtl2M [9] + * HashIntlvCtl1G [10] + * HashIntlvCtl1T [15] + */ +#define DF3_HASH_CTL_64K BIT(20) +#define DF3_HASH_CTL_2M BIT(21) +#define DF3_HASH_CTL_1G BIT(22) +#define DF4_HASH_CTL_4K BIT(7) +#define DF4_HASH_CTL_64K BIT(8) +#define DF4_HASH_CTL_2M BIT(9) +#define DF4_HASH_CTL_1G BIT(10) +#define DF4_HASH_CTL_1T BIT(15) + +/* + * High Address Offset + * + * Access type: Instance + * + * Register + * Rev Fieldname Bits + * + * D18F0x1B4 [DRAM Offset] + * DF2 HiAddrOffset [31:20] + * DF3 HiAddrOffset [31:12] + * DF3p5 HiAddrOffset [31:12] + * + * D18F7x140 [DRAM Offset] + * DF4 HiAddrOffset [24:1] + * DF4p5 HiAddrOffset [24:1] + */ +#define DF2_HI_ADDR_OFFSET GENMASK(31, 20) +#define DF3_HI_ADDR_OFFSET GENMASK(31, 12) +#define DF4_HI_ADDR_OFFSET GENMASK(24, 1) + +/* + * High Address Offset Enable + * + * Access type: Instance + * + * Register + * Rev Fieldname Bits + * + * D18F0x1B4 [DRAM Offset] + * DF2 HiAddrOffsetEn [0] + * DF3 HiAddrOffsetEn [0] + * DF3p5 HiAddrOffsetEn [0] + * + * D18F7x140 [DRAM Offset] + * DF4 HiAddrOffsetEn [0] + * DF4p5 HiAddrOffsetEn [0] + */ +#define DF_HI_ADDR_OFFSET_EN BIT(0) + +/* + * Interleave Address Select + * + * Access type: Instance + * + * Register + * Rev Fieldname Bits + * + * D18F0x110 [DRAM Base Address] + * DF2 IntLvAddrSel [10:8] + * DF3 IntLvAddrSel [11:9] + * DF3p5 IntLvAddrSel [11:9] + * + * D18F7xE0C [DRAM Address Interleave] + * DF4 IntLvAddrSel [2:0] + * + * D18F7x20C [DRAM Address Interleave] + * DF4p5 IntLvAddrSel [2:0] + */ +#define DF2_INTLV_ADDR_SEL GENMASK(10, 8) +#define DF3_INTLV_ADDR_SEL GENMASK(11, 9) +#define DF4_INTLV_ADDR_SEL GENMASK(2, 0) + +/* + * Interleave Number of Channels + * + * Access type: Instance + * + * Register + * Rev Fieldname Bits + * + * D18F0x110 [DRAM Base Address] + * DF2 IntLvNumChan [7:4] + * DF3 IntLvNumChan [5:2] + * DF3p5 IntLvNumChan [6:2] + * + * D18F7xE0C [DRAM Address Interleave] + * DF4 IntLvNumChan [8:4] + * + * D18F7x20C [DRAM Address Interleave] + * DF4p5 IntLvNumChan [9:4] + */ +#define DF2_INTLV_NUM_CHAN GENMASK(7, 4) +#define DF3_INTLV_NUM_CHAN GENMASK(5, 2) +#define DF3p5_INTLV_NUM_CHAN GENMASK(6, 2) +#define DF4_INTLV_NUM_CHAN GENMASK(8, 4) +#define DF4p5_INTLV_NUM_CHAN GENMASK(9, 4) + +/* + * Interleave Number of Dies + * + * Access type: Instance + * + * Register + * Rev Fieldname Bits + * + * D18F0x114 [DRAM Limit Address] + * DF2 IntLvNumDies [11:10] + * + * D18F0x110 [DRAM Base Address] + * DF3 IntLvNumDies [7:6] + * DF3p5 IntLvNumDies [7] + * + * D18F7xE0C [DRAM Address Interleave] + * DF4 IntLvNumDies [13:12] + * + * D18F7x20C [DRAM Address Interleave] + * DF4p5 IntLvNumDies 
[13:12] + */ +#define DF2_INTLV_NUM_DIES GENMASK(11, 10) +#define DF3_INTLV_NUM_DIES GENMASK(7, 6) +#define DF3p5_INTLV_NUM_DIES BIT(7) +#define DF4_INTLV_NUM_DIES GENMASK(13, 12) + +/* + * Interleave Number of Sockets + * + * Access type: Instance + * + * Register + * Rev Fieldname Bits + * + * D18F0x114 [DRAM Limit Address] + * DF2 IntLvNumSockets [8] + * + * D18F0x110 [DRAM Base Address] + * DF3 IntLvNumSockets [8] + * DF3p5 IntLvNumSockets [8] + * + * D18F7xE0C [DRAM Address Interleave] + * DF4 IntLvNumSockets [18] + * + * D18F7x20C [DRAM Address Interleave] + * DF4p5 IntLvNumSockets [18] + */ +#define DF2_INTLV_NUM_SOCKETS BIT(8) +#define DF4_INTLV_NUM_SOCKETS BIT(18) + +/* + * Legacy MMIO Hole Enable + * + * Access type: Instance + * + * Register + * Rev Fieldname Bits + * + * D18F0x110 [DRAM Base Address] + * DF2 LgcyMmioHoleEn [1] + * DF3 LgcyMmioHoleEn [1] + * DF3p5 LgcyMmioHoleEn [1] + * + * D18F7xE08 [DRAM Address Control] + * DF4 LgcyMmioHoleEn [1] + * + * D18F7x208 [DRAM Address Control] + * DF4p5 LgcyMmioHoleEn [1] + */ +#define DF_LEGACY_MMIO_HOLE_EN BIT(1) + +/* + * Log2 Address 64K Space 0 + * + * Access type: Instance + * + * Register + * Rev Fieldname Bits + * + * DF2 N/A + * + * D18F2x90 [Non-power-of-2 channel Configuration Register for COH_ST DRAM Address Maps] + * DF3 Log2Addr64KSpace0 [5:0] + * + * DF3p5 N/A + * DF4 N/A + * DF4p5 N/A + */ +#define DF_LOG2_ADDR_64K_SPACE0 GENMASK(5, 0) + +/* + * Major Revision + * + * Access type: Broadcast + * + * Register + * Rev Fieldname Bits + * + * DF2 N/A + * DF3 N/A + * DF3p5 N/A + * + * D18F0x040 [Fabric Block Instance Count] + * DF4 MajorRevision [27:24] + * DF4p5 MajorRevision [27:24] + */ +#define DF_MAJOR_REVISION GENMASK(27, 24) + +/* + * Minor Revision + * + * Access type: Broadcast + * + * Register + * Rev Fieldname Bits + * + * DF2 N/A + * DF3 N/A + * DF3p5 N/A + * + * D18F0x040 [Fabric Block Instance Count] + * DF4 MinorRevision [23:16] + * DF4p5 MinorRevision [23:16] + */ +#define DF_MINOR_REVISION GENMASK(23, 16) + +/* + * Node ID Mask + * + * Access type: Broadcast + * + * Register + * Rev Fieldname Bits + * + * DF2 N/A + * + * D18F1x208 [System Fabric ID Mask 0] + * DF3 NodeIdMask [25:16] + * + * D18F1x150 [System Fabric ID Mask 0] + * DF3p5 NodeIdMask [31:16] + * + * D18F4x1B0 [System Fabric ID Mask 0] + * DF4 NodeIdMask [31:16] + * DF4p5 NodeIdMask [31:16] + */ +#define DF3_NODE_ID_MASK GENMASK(25, 16) +#define DF4_NODE_ID_MASK GENMASK(31, 16) + +/* + * Node ID Shift + * + * Access type: Broadcast + * + * Register + * Rev Fieldname Bits + * + * DF2 N/A + * + * D18F1x20C [System Fabric ID Mask 1] + * DF3 NodeIdShift [3:0] + * + * D18F1x154 [System Fabric ID Mask 1] + * DF3p5 NodeIdShift [3:0] + * + * D18F4x1B4 [System Fabric ID Mask 1] + * DF4 NodeIdShift [3:0] + * DF4p5 NodeIdShift [3:0] + */ +#define DF3_NODE_ID_SHIFT GENMASK(3, 0) + +/* + * Remap Enable + * + * Access type: Instance + * + * Register + * Rev Fieldname Bits + * + * DF2 N/A + * DF3 N/A + * DF3p5 N/A + * + * D18F7xE08 [DRAM Address Control] + * DF4 RemapEn [4] + * + * D18F7x208 [DRAM Address Control] + * DF4p5 RemapEn [4] + */ +#define DF4_REMAP_EN BIT(4) + +/* + * Remap Select + * + * Access type: Instance + * + * Register + * Rev Fieldname Bits + * + * DF2 N/A + * DF3 N/A + * DF3p5 N/A + * + * D18F7xE08 [DRAM Address Control] + * DF4 RemapSel [7:5] + * + * D18F7x208 [DRAM Address Control] + * DF4p5 RemapSel [6:5] + */ +#define DF4_REMAP_SEL GENMASK(7, 5) +#define DF4p5_REMAP_SEL GENMASK(6, 5) + +/* + * Socket ID Mask + * + * Access type: 
Broadcast + * + * Register + * Rev Fieldname Bits + * + * D18F1x208 [System Fabric ID Mask] + * DF2 SocketIdMask [23:16] + * + * D18F1x20C [System Fabric ID Mask 1] + * DF3 SocketIdMask [26:24] + * + * D18F1x158 [System Fabric ID Mask 2] + * DF3p5 SocketIdMask [31:16] + * + * D18F4x1B8 [System Fabric ID Mask 2] + * DF4 SocketIdMask [31:16] + * DF4p5 SocketIdMask [31:16] + */ +#define DF2_SOCKET_ID_MASK GENMASK(23, 16) +#define DF3_SOCKET_ID_MASK GENMASK(26, 24) +#define DF4_SOCKET_ID_MASK GENMASK(31, 16) + +/* + * Socket ID Shift + * + * Access type: Broadcast + * + * Register + * Rev Fieldname Bits + * + * D18F1x208 [System Fabric ID Mask] + * DF2 SocketIdShift [31:28] + * + * D18F1x20C [System Fabric ID Mask 1] + * DF3 SocketIdShift [9:8] + * + * D18F1x158 [System Fabric ID Mask 2] + * DF3p5 SocketIdShift [11:8] + * + * D18F4x1B4 [System Fabric ID Mask 1] + * DF4 SocketIdShift [11:8] + * DF4p5 SocketIdShift [11:8] + */ +#define DF2_SOCKET_ID_SHIFT GENMASK(31, 28) +#define DF3_SOCKET_ID_SHIFT GENMASK(9, 8) +#define DF4_SOCKET_ID_SHIFT GENMASK(11, 8) diff --git a/drivers/ras/amd/atl/system.c b/drivers/ras/amd/atl/system.c new file mode 100644 index 0000000000000..af61f2f1d6dea --- /dev/null +++ b/drivers/ras/amd/atl/system.c @@ -0,0 +1,281 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * AMD Address Translation Library + * + * system.c : Functions to read and save system-wide data + * + * Copyright (c) 2023, Advanced Micro Devices, Inc. + * All Rights Reserved. + * + * Author: Yazen Ghannam + */ + +#include "internal.h" + +int determine_node_id(struct addr_ctx *ctx, u8 socket_id, u8 die_id) +{ + u16 socket_id_bits, die_id_bits; + + if (socket_id > 0 && df_cfg.socket_id_mask == 0) { + atl_debug(ctx, "Invalid socket inputs: socket_id=%u socket_id_mask=0x%x", + socket_id, df_cfg.socket_id_mask); + return -EINVAL; + } + + /* Do each step independently to avoid shift out-of-bounds issues. */ + socket_id_bits = socket_id; + socket_id_bits <<= df_cfg.socket_id_shift; + socket_id_bits &= df_cfg.socket_id_mask; + + if (die_id > 0 && df_cfg.die_id_mask == 0) { + atl_debug(ctx, "Invalid die inputs: die_id=%u die_id_mask=0x%x", + die_id, df_cfg.die_id_mask); + return -EINVAL; + } + + /* Do each step independently to avoid shift out-of-bounds issues. 
*/ + die_id_bits = die_id; + die_id_bits <<= df_cfg.die_id_shift; + die_id_bits &= df_cfg.die_id_mask; + + ctx->node_id = (socket_id_bits | die_id_bits) >> df_cfg.node_id_shift; + return 0; +} + +static void df2_get_masks_shifts(u32 mask0) +{ + df_cfg.socket_id_shift = FIELD_GET(DF2_SOCKET_ID_SHIFT, mask0); + df_cfg.socket_id_mask = FIELD_GET(DF2_SOCKET_ID_MASK, mask0); + df_cfg.die_id_shift = FIELD_GET(DF2_DIE_ID_SHIFT, mask0); + df_cfg.die_id_mask = FIELD_GET(DF2_DIE_ID_MASK, mask0); + df_cfg.node_id_shift = df_cfg.die_id_shift; + df_cfg.node_id_mask = df_cfg.socket_id_mask | df_cfg.die_id_mask; + df_cfg.component_id_mask = ~df_cfg.node_id_mask; +} + +static void df3_get_masks_shifts(u32 mask0, u32 mask1) +{ + df_cfg.component_id_mask = FIELD_GET(DF3_COMPONENT_ID_MASK, mask0); + df_cfg.node_id_mask = FIELD_GET(DF3_NODE_ID_MASK, mask0); + + df_cfg.node_id_shift = FIELD_GET(DF3_NODE_ID_SHIFT, mask1); + df_cfg.socket_id_shift = FIELD_GET(DF3_SOCKET_ID_SHIFT, mask1); + df_cfg.socket_id_mask = FIELD_GET(DF3_SOCKET_ID_MASK, mask1); + df_cfg.die_id_mask = FIELD_GET(DF3_DIE_ID_MASK, mask1); +} + +static void df3p5_get_masks_shifts(u32 mask0, u32 mask1, u32 mask2) +{ + df_cfg.component_id_mask = FIELD_GET(DF4_COMPONENT_ID_MASK, mask0); + df_cfg.node_id_mask = FIELD_GET(DF4_NODE_ID_MASK, mask0); + + df_cfg.node_id_shift = FIELD_GET(DF3_NODE_ID_SHIFT, mask1); + df_cfg.socket_id_shift = FIELD_GET(DF4_SOCKET_ID_SHIFT, mask1); + + df_cfg.socket_id_mask = FIELD_GET(DF4_SOCKET_ID_MASK, mask2); + df_cfg.die_id_mask = FIELD_GET(DF4_DIE_ID_MASK, mask2); +} + +static void df4_get_masks_shifts(u32 mask0, u32 mask1, u32 mask2) +{ + df3p5_get_masks_shifts(mask0, mask1, mask2); + + if (!(df_cfg.flags.socket_id_shift_quirk && df_cfg.socket_id_shift == 1)) + return; + + df_cfg.socket_id_shift = 0; + df_cfg.socket_id_mask = 1; + df_cfg.die_id_shift = 0; + df_cfg.die_id_mask = 0; + df_cfg.node_id_shift = 8; + df_cfg.node_id_mask = 0x100; +} + +static int df4_get_fabric_id_mask_registers(void) +{ + u32 mask0, mask1, mask2; + + /* Read D18F4x1B0 (SystemFabricIdMask0) */ + if (df_indirect_read_broadcast(0, 4, 0x1B0, &mask0)) + return -EINVAL; + + /* Read D18F4x1B4 (SystemFabricIdMask1) */ + if (df_indirect_read_broadcast(0, 4, 0x1B4, &mask1)) + return -EINVAL; + + /* Read D18F4x1B8 (SystemFabricIdMask2) */ + if (df_indirect_read_broadcast(0, 4, 0x1B8, &mask2)) + return -EINVAL; + + df4_get_masks_shifts(mask0, mask1, mask2); + return 0; +} + +static int df4_determine_df_rev(u32 reg) +{ + df_cfg.rev = FIELD_GET(DF_MINOR_REVISION, reg) < 5 ? DF4 : DF4p5; + + /* Check for special cases or quirks based on Device/Vendor IDs.*/ + + /* Read D18F0x000 (DeviceVendorId0) */ + if (df_indirect_read_broadcast(0, 0, 0, ®)) + return -EINVAL; + + if (reg == DF_FUNC0_ID_ZEN4_SERVER) + df_cfg.flags.socket_id_shift_quirk = 1; + + return df4_get_fabric_id_mask_registers(); +} + +static int determine_df_rev_legacy(void) +{ + u32 fabric_id_mask0, fabric_id_mask1, fabric_id_mask2; + + /* + * Check for DF3.5. + * + * Component ID Mask must be non-zero. Register D18F1x150 is + * reserved pre-DF3.5, so value will be Read-as-Zero. + */ + + /* Read D18F1x150 (SystemFabricIdMask0). 
*/ + if (df_indirect_read_broadcast(0, 1, 0x150, &fabric_id_mask0)) + return -EINVAL; + + if (FIELD_GET(DF4_COMPONENT_ID_MASK, fabric_id_mask0)) { + df_cfg.rev = DF3p5; + + /* Read D18F1x154 (SystemFabricIdMask1) */ + if (df_indirect_read_broadcast(0, 1, 0x154, &fabric_id_mask1)) + return -EINVAL; + + /* Read D18F1x158 (SystemFabricIdMask2) */ + if (df_indirect_read_broadcast(0, 1, 0x158, &fabric_id_mask2)) + return -EINVAL; + + df3p5_get_masks_shifts(fabric_id_mask0, fabric_id_mask1, fabric_id_mask2); + return 0; + } + + /* + * Check for DF3. + * + * Component ID Mask must be non-zero. Field is Read-as-Zero on DF2. + */ + + /* Read D18F1x208 (SystemFabricIdMask). */ + if (df_indirect_read_broadcast(0, 1, 0x208, &fabric_id_mask0)) + return -EINVAL; + + if (FIELD_GET(DF3_COMPONENT_ID_MASK, fabric_id_mask0)) { + df_cfg.rev = DF3; + + /* Read D18F1x20C (SystemFabricIdMask1) */ + if (df_indirect_read_broadcast(0, 1, 0x20C, &fabric_id_mask1)) + return -EINVAL; + + df3_get_masks_shifts(fabric_id_mask0, fabric_id_mask1); + return 0; + } + + /* Default to DF2. */ + df_cfg.rev = DF2; + df2_get_masks_shifts(fabric_id_mask0); + return 0; +} + +static int determine_df_rev(void) +{ + u32 reg; + u8 rev; + + if (df_cfg.rev != UNKNOWN) + return 0; + + /* Read D18F0x40 (FabricBlockInstanceCount). */ + if (df_indirect_read_broadcast(0, 0, 0x40, ®)) + return -EINVAL; + + /* + * Revision fields added for DF4 and later. + * + * Major revision of '0' is found pre-DF4. Field is Read-as-Zero. + */ + rev = FIELD_GET(DF_MAJOR_REVISION, reg); + if (!rev) + return determine_df_rev_legacy(); + + /* + * Fail out for major revisions other than '4'. + * + * Explicit support should be added for newer systems to avoid issues. + */ + if (rev == 4) + return df4_determine_df_rev(reg); + + return -EINVAL; +} + +static void get_num_maps(void) +{ + switch (df_cfg.rev) { + case DF2: + case DF3: + case DF3p5: + df_cfg.num_coh_st_maps = 2; + break; + case DF4: + case DF4p5: + df_cfg.num_coh_st_maps = 4; + break; + default: + atl_debug_on_bad_df_rev(); + } +} + +static void apply_node_id_shift(void) +{ + if (df_cfg.rev == DF2) + return; + + df_cfg.die_id_shift = df_cfg.node_id_shift; + df_cfg.die_id_mask <<= df_cfg.node_id_shift; + df_cfg.socket_id_mask <<= df_cfg.node_id_shift; + df_cfg.socket_id_shift += df_cfg.node_id_shift; +} + +static void dump_df_cfg(void) +{ + pr_debug("rev=0x%x", df_cfg.rev); + + pr_debug("component_id_mask=0x%x", df_cfg.component_id_mask); + pr_debug("die_id_mask=0x%x", df_cfg.die_id_mask); + pr_debug("node_id_mask=0x%x", df_cfg.node_id_mask); + pr_debug("socket_id_mask=0x%x", df_cfg.socket_id_mask); + + pr_debug("die_id_shift=0x%x", df_cfg.die_id_shift); + pr_debug("node_id_shift=0x%x", df_cfg.node_id_shift); + pr_debug("socket_id_shift=0x%x", df_cfg.socket_id_shift); + + pr_debug("num_coh_st_maps=%u", df_cfg.num_coh_st_maps); + + pr_debug("flags.legacy_ficaa=%u", df_cfg.flags.legacy_ficaa); + pr_debug("flags.socket_id_shift_quirk=%u", df_cfg.flags.socket_id_shift_quirk); +} + +int get_df_system_info(void) +{ + if (determine_df_rev()) { + pr_warn("amd_atl: Failed to determine DF Revision"); + df_cfg.rev = UNKNOWN; + return -EINVAL; + } + + apply_node_id_shift(); + + get_num_maps(); + + dump_df_cfg(); + + return 0; +} diff --git a/drivers/ras/amd/atl/umc.c b/drivers/ras/amd/atl/umc.c new file mode 100644 index 0000000000000..9d51e4954687f --- /dev/null +++ b/drivers/ras/amd/atl/umc.c @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * AMD Address Translation Library + * + * umc.c : 
Unified Memory Controller (UMC) topology helpers + * + * Copyright (c) 2023, Advanced Micro Devices, Inc. + * All Rights Reserved. + * + * Author: Yazen Ghannam + */ + +#include "internal.h" + +static u8 get_die_id(struct atl_err *err) +{ + /* + * For CPUs, this is the AMD Node ID modulo the number + * of AMD Nodes per socket. + */ + return topology_die_id(err->cpu) % amd_get_nodes_per_socket(); +} + +#define UMC_CHANNEL_NUM GENMASK(31, 20) +static u8 get_coh_st_inst_id(struct atl_err *err) +{ + return FIELD_GET(UMC_CHANNEL_NUM, err->ipid); +} + +unsigned long convert_umc_mca_addr_to_sys_addr(struct atl_err *err) +{ + u8 socket_id = topology_physical_package_id(err->cpu); + u8 coh_st_inst_id = get_coh_st_inst_id(err); + unsigned long addr = err->addr; + u8 die_id = get_die_id(err); + + pr_debug("socket_id=0x%x die_id=0x%x coh_st_inst_id=0x%x addr=0x%016lx", + socket_id, die_id, coh_st_inst_id, addr); + + return norm_to_sys_addr(socket_id, die_id, coh_st_inst_id, addr); +} diff --git a/drivers/ras/ras.c b/drivers/ras/ras.c index 95540ea8dd9db..a6e4792a1b2e9 100644 --- a/drivers/ras/ras.c +++ b/drivers/ras/ras.c @@ -10,6 +10,37 @@ #include #include +#if IS_ENABLED(CONFIG_AMD_ATL) +/* + * Once set, this function pointer should never be unset. + * + * The library module will set this pointer if it successfully loads. The module + * should not be unloaded except for testing and debug purposes. + */ +static unsigned long (*amd_atl_umc_na_to_spa)(struct atl_err *err); + +void amd_atl_register_decoder(unsigned long (*f)(struct atl_err *)) +{ + amd_atl_umc_na_to_spa = f; +} +EXPORT_SYMBOL_GPL(amd_atl_register_decoder); + +void amd_atl_unregister_decoder(void) +{ + amd_atl_umc_na_to_spa = NULL; +} +EXPORT_SYMBOL_GPL(amd_atl_unregister_decoder); + +unsigned long amd_convert_umc_mca_addr_to_sys_addr(struct atl_err *err) +{ + if (!amd_atl_umc_na_to_spa) + return -EINVAL; + + return amd_atl_umc_na_to_spa(err); +} +EXPORT_SYMBOL_GPL(amd_convert_umc_mca_addr_to_sys_addr); +#endif /* CONFIG_AMD_ATL */ + #define CREATE_TRACE_POINTS #define TRACE_INCLUDE_PATH ../../include/ras #include diff --git a/include/linux/ras.h b/include/linux/ras.h index 1f4048bf2674d..09c632832bf1b 100644 --- a/include/linux/ras.h +++ b/include/linux/ras.h @@ -25,6 +25,7 @@ void log_non_standard_event(const guid_t *sec_type, const guid_t *fru_id, const char *fru_text, const u8 sev, const u8 *err, const u32 len); void log_arm_hw_error(struct cper_sec_proc_arm *err); + #else static inline void log_non_standard_event(const guid_t *sec_type, @@ -35,4 +36,19 @@ static inline void log_arm_hw_error(struct cper_sec_proc_arm *err) { return; } #endif +struct atl_err { + u64 addr; + u64 ipid; + u32 cpu; +}; + +#if IS_ENABLED(CONFIG_AMD_ATL) +void amd_atl_register_decoder(unsigned long (*f)(struct atl_err *)); +void amd_atl_unregister_decoder(void); +unsigned long amd_convert_umc_mca_addr_to_sys_addr(struct atl_err *err); +#else +static inline unsigned long +amd_convert_umc_mca_addr_to_sys_addr(struct atl_err *err) { return -EINVAL; } +#endif /* CONFIG_AMD_ATL */ + #endif /* __RAS_H__ */ -- GitLab From 6c9058f49084569d1d816e87185e0a4f9ab1a321 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Mon, 22 Jan 2024 22:14:00 -0600 Subject: [PATCH 0076/1519] EDAC/amd64: Use new AMD Address Translation Library Remove old address translation code and use the new AMD Address Translation Library. Use "imply" in Kconfig so that the "AMD_ATL" config option takes the value of "EDAC_AMD64" as its default. 
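For reference, the caller side of this conversion looks roughly like the
sketch below. The struct atl_err fields and the
amd_convert_umc_mca_addr_to_sys_addr() interface come from the library
patches earlier in this series; the wrapper function itself is only
illustrative and not part of the patch:

	#include <linux/ras.h>
	#include <asm/mce.h>

	/* Minimal sketch of a decoder consuming the library (illustrative only). */
	static unsigned long example_umc_translate(struct mce *m)
	{
		struct atl_err a_err;

		a_err.addr = m->addr;	/* normalized (device) address */
		a_err.ipid = m->ipid;	/* carries the UMC channel number */
		a_err.cpu  = m->extcpu;	/* used to derive socket and die IDs */

		/* Returns -EINVAL (as an unsigned long) if no translator is registered. */
		return amd_convert_umc_mca_addr_to_sys_addr(&a_err);
	}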
Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20240123041401.79812-3-yazen.ghannam@amd.com --- drivers/edac/Kconfig | 1 + drivers/edac/amd64_edac.c | 286 ++------------------------------------ 2 files changed, 10 insertions(+), 277 deletions(-) diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig index 5a7f3fabee22d..16c8de5050e59 100644 --- a/drivers/edac/Kconfig +++ b/drivers/edac/Kconfig @@ -78,6 +78,7 @@ config EDAC_GHES config EDAC_AMD64 tristate "AMD64 (Opteron, Athlon64)" depends on AMD_NB && EDAC_DECODE_MCE + imply AMD_ATL help Support for error detection and correction of DRAM ECC errors on the AMD64 families (>= K8) of memory controllers. diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 537b9987a431c..ca9a8641652da 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only +#include #include "amd64_edac.h" #include @@ -1051,281 +1052,6 @@ static int fixup_node_id(int node_id, struct mce *m) return nid - gpu_node_map.base_node_id + 1; } -/* Protect the PCI config register pairs used for DF indirect access. */ -static DEFINE_MUTEX(df_indirect_mutex); - -/* - * Data Fabric Indirect Access uses FICAA/FICAD. - * - * Fabric Indirect Configuration Access Address (FICAA): Constructed based - * on the device's Instance Id and the PCI function and register offset of - * the desired register. - * - * Fabric Indirect Configuration Access Data (FICAD): There are FICAD LO - * and FICAD HI registers but so far we only need the LO register. - * - * Use Instance Id 0xFF to indicate a broadcast read. - */ -#define DF_BROADCAST 0xFF -static int __df_indirect_read(u16 node, u8 func, u16 reg, u8 instance_id, u32 *lo) -{ - struct pci_dev *F4; - u32 ficaa; - int err = -ENODEV; - - if (node >= amd_nb_num()) - goto out; - - F4 = node_to_amd_nb(node)->link; - if (!F4) - goto out; - - ficaa = (instance_id == DF_BROADCAST) ? 0 : 1; - ficaa |= reg & 0x3FC; - ficaa |= (func & 0x7) << 11; - ficaa |= instance_id << 16; - - mutex_lock(&df_indirect_mutex); - - err = pci_write_config_dword(F4, 0x5C, ficaa); - if (err) { - pr_warn("Error writing DF Indirect FICAA, FICAA=0x%x\n", ficaa); - goto out_unlock; - } - - err = pci_read_config_dword(F4, 0x98, lo); - if (err) - pr_warn("Error reading DF Indirect FICAD LO, FICAA=0x%x.\n", ficaa); - -out_unlock: - mutex_unlock(&df_indirect_mutex); - -out: - return err; -} - -static int df_indirect_read_instance(u16 node, u8 func, u16 reg, u8 instance_id, u32 *lo) -{ - return __df_indirect_read(node, func, reg, instance_id, lo); -} - -static int df_indirect_read_broadcast(u16 node, u8 func, u16 reg, u32 *lo) -{ - return __df_indirect_read(node, func, reg, DF_BROADCAST, lo); -} - -struct addr_ctx { - u64 ret_addr; - u32 tmp; - u16 nid; - u8 inst_id; -}; - -static int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr) -{ - u64 dram_base_addr, dram_limit_addr, dram_hole_base; - - u8 die_id_shift, die_id_mask, socket_id_shift, socket_id_mask; - u8 intlv_num_dies, intlv_num_chan, intlv_num_sockets; - u8 intlv_addr_sel, intlv_addr_bit; - u8 num_intlv_bits, hashed_bit; - u8 lgcy_mmio_hole_en, base = 0; - u8 cs_mask, cs_id = 0; - bool hash_enabled = false; - - struct addr_ctx ctx; - - memset(&ctx, 0, sizeof(ctx)); - - /* Start from the normalized address */ - ctx.ret_addr = norm_addr; - - ctx.nid = nid; - ctx.inst_id = umc; - - /* Read D18F0x1B4 (DramOffset), check if base 1 is used. 
*/ - if (df_indirect_read_instance(nid, 0, 0x1B4, umc, &ctx.tmp)) - goto out_err; - - /* Remove HiAddrOffset from normalized address, if enabled: */ - if (ctx.tmp & BIT(0)) { - u64 hi_addr_offset = (ctx.tmp & GENMASK_ULL(31, 20)) << 8; - - if (norm_addr >= hi_addr_offset) { - ctx.ret_addr -= hi_addr_offset; - base = 1; - } - } - - /* Read D18F0x110 (DramBaseAddress). */ - if (df_indirect_read_instance(nid, 0, 0x110 + (8 * base), umc, &ctx.tmp)) - goto out_err; - - /* Check if address range is valid. */ - if (!(ctx.tmp & BIT(0))) { - pr_err("%s: Invalid DramBaseAddress range: 0x%x.\n", - __func__, ctx.tmp); - goto out_err; - } - - lgcy_mmio_hole_en = ctx.tmp & BIT(1); - intlv_num_chan = (ctx.tmp >> 4) & 0xF; - intlv_addr_sel = (ctx.tmp >> 8) & 0x7; - dram_base_addr = (ctx.tmp & GENMASK_ULL(31, 12)) << 16; - - /* {0, 1, 2, 3} map to address bits {8, 9, 10, 11} respectively */ - if (intlv_addr_sel > 3) { - pr_err("%s: Invalid interleave address select %d.\n", - __func__, intlv_addr_sel); - goto out_err; - } - - /* Read D18F0x114 (DramLimitAddress). */ - if (df_indirect_read_instance(nid, 0, 0x114 + (8 * base), umc, &ctx.tmp)) - goto out_err; - - intlv_num_sockets = (ctx.tmp >> 8) & 0x1; - intlv_num_dies = (ctx.tmp >> 10) & 0x3; - dram_limit_addr = ((ctx.tmp & GENMASK_ULL(31, 12)) << 16) | GENMASK_ULL(27, 0); - - intlv_addr_bit = intlv_addr_sel + 8; - - /* Re-use intlv_num_chan by setting it equal to log2(#channels) */ - switch (intlv_num_chan) { - case 0: intlv_num_chan = 0; break; - case 1: intlv_num_chan = 1; break; - case 3: intlv_num_chan = 2; break; - case 5: intlv_num_chan = 3; break; - case 7: intlv_num_chan = 4; break; - - case 8: intlv_num_chan = 1; - hash_enabled = true; - break; - default: - pr_err("%s: Invalid number of interleaved channels %d.\n", - __func__, intlv_num_chan); - goto out_err; - } - - num_intlv_bits = intlv_num_chan; - - if (intlv_num_dies > 2) { - pr_err("%s: Invalid number of interleaved nodes/dies %d.\n", - __func__, intlv_num_dies); - goto out_err; - } - - num_intlv_bits += intlv_num_dies; - - /* Add a bit if sockets are interleaved. */ - num_intlv_bits += intlv_num_sockets; - - /* Assert num_intlv_bits <= 4 */ - if (num_intlv_bits > 4) { - pr_err("%s: Invalid interleave bits %d.\n", - __func__, num_intlv_bits); - goto out_err; - } - - if (num_intlv_bits > 0) { - u64 temp_addr_x, temp_addr_i, temp_addr_y; - u8 die_id_bit, sock_id_bit, cs_fabric_id; - - /* - * Read FabricBlockInstanceInformation3_CS[BlockFabricID]. - * This is the fabric id for this coherent slave. Use - * umc/channel# as instance id of the coherent slave - * for FICAA. - */ - if (df_indirect_read_instance(nid, 0, 0x50, umc, &ctx.tmp)) - goto out_err; - - cs_fabric_id = (ctx.tmp >> 8) & 0xFF; - die_id_bit = 0; - - /* If interleaved over more than 1 channel: */ - if (intlv_num_chan) { - die_id_bit = intlv_num_chan; - cs_mask = (1 << die_id_bit) - 1; - cs_id = cs_fabric_id & cs_mask; - } - - sock_id_bit = die_id_bit; - - /* Read D18F1x208 (SystemFabricIdMask). */ - if (intlv_num_dies || intlv_num_sockets) - if (df_indirect_read_broadcast(nid, 1, 0x208, &ctx.tmp)) - goto out_err; - - /* If interleaved over more than 1 die. */ - if (intlv_num_dies) { - sock_id_bit = die_id_bit + intlv_num_dies; - die_id_shift = (ctx.tmp >> 24) & 0xF; - die_id_mask = (ctx.tmp >> 8) & 0xFF; - - cs_id |= ((cs_fabric_id & die_id_mask) >> die_id_shift) << die_id_bit; - } - - /* If interleaved over more than 1 socket. 
 */
-	if (intlv_num_sockets) {
-		socket_id_shift	= (ctx.tmp >> 28) & 0xF;
-		socket_id_mask	= (ctx.tmp >> 16) & 0xFF;
-
-		cs_id |= ((cs_fabric_id & socket_id_mask) >> socket_id_shift) << sock_id_bit;
-	}
-
-	/*
-	 * The pre-interleaved address consists of XXXXXXIIIYYYYY
-	 * where III is the ID for this CS, and XXXXXXYYYYY are the
-	 * address bits from the post-interleaved address.
-	 * "num_intlv_bits" has been calculated to tell us how many "I"
-	 * bits there are. "intlv_addr_bit" tells us how many "Y" bits
-	 * there are (where "I" starts).
-	 */
-	temp_addr_y = ctx.ret_addr & GENMASK_ULL(intlv_addr_bit - 1, 0);
-	temp_addr_i = (cs_id << intlv_addr_bit);
-	temp_addr_x = (ctx.ret_addr & GENMASK_ULL(63, intlv_addr_bit)) << num_intlv_bits;
-	ctx.ret_addr = temp_addr_x | temp_addr_i | temp_addr_y;
-	}
-
-	/* Add dram base address */
-	ctx.ret_addr += dram_base_addr;
-
-	/* If legacy MMIO hole enabled */
-	if (lgcy_mmio_hole_en) {
-		if (df_indirect_read_broadcast(nid, 0, 0x104, &ctx.tmp))
-			goto out_err;
-
-		dram_hole_base = ctx.tmp & GENMASK(31, 24);
-		if (ctx.ret_addr >= dram_hole_base)
-			ctx.ret_addr += (BIT_ULL(32) - dram_hole_base);
-	}
-
-	if (hash_enabled) {
-		/* Save some parentheses and grab ls-bit at the end. */
-		hashed_bit =	(ctx.ret_addr >> 12) ^
-				(ctx.ret_addr >> 18) ^
-				(ctx.ret_addr >> 21) ^
-				(ctx.ret_addr >> 30) ^
-				cs_id;
-
-		hashed_bit &= BIT(0);
-
-		if (hashed_bit != ((ctx.ret_addr >> intlv_addr_bit) & BIT(0)))
-			ctx.ret_addr ^= BIT(intlv_addr_bit);
-	}
-
-	/* Is calculated system address is above DRAM limit address? */
-	if (ctx.ret_addr > dram_limit_addr)
-		goto out_err;
-
-	*sys_addr = ctx.ret_addr;
-	return 0;
-
-out_err:
-	return -EINVAL;
-}
-
 static int get_channel_from_ecc_syndrome(struct mem_ctl_info *, u16);

 /*
@@ -3073,9 +2799,10 @@ static void decode_umc_error(int node_id, struct mce *m)
 {
 	u8 ecc_type = (m->status >> 45) & 0x3;
 	struct mem_ctl_info *mci;
+	unsigned long sys_addr;
 	struct amd64_pvt *pvt;
+	struct atl_err a_err;
 	struct err_info err;
-	u64 sys_addr;

 	node_id = fixup_node_id(node_id, m);

@@ -3106,7 +2833,12 @@ static void decode_umc_error(int node_id, struct mce *m)

 	pvt->ops->get_err_info(m, &err);

-	if (umc_normaddr_to_sysaddr(m->addr, pvt->mc_node_id, err.channel, &sys_addr)) {
+	a_err.addr = m->addr;
+	a_err.ipid = m->ipid;
+	a_err.cpu = m->extcpu;
+
+	sys_addr = amd_convert_umc_mca_addr_to_sys_addr(&a_err);
+	if (IS_ERR_VALUE(sys_addr)) {
 		err.err_code = ERR_NORM_ADDR;
 		goto log_error;
 	}
--
GitLab

From 1289c431641f8beacc47db506210154dcea2492a Mon Sep 17 00:00:00 2001
From: Yazen Ghannam
Date: Mon, 22 Jan 2024 22:14:01 -0600
Subject: [PATCH 0077/1519] Documentation: RAS: Add index and address
 translation section

There are a lot of RAS topics to document, and there are a lot of
details for each topic.

Prep for this by adding an index for the RAS directory. This will
provide a top-level document and table of contents. It also provides
the option to build the RAS directory individually using
"make SPHINXDIRS=".

Start a section on address translation. This will be expanded with
details for future translation methods and how they're used in the
kernel.

Move the error decoding topic to its own section. Links to other error
decoding kernel docs will be added.
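For example, with the new index in place the RAS documentation can be
built on its own with something like the following (the htmldocs target
is the usual Sphinx entry point; the exact invocation may vary by tree):

	make SPHINXDIRS="RAS" htmldocs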
Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20240123041401.79812-4-yazen.ghannam@amd.com --- Documentation/RAS/address-translation.rst | 24 +++++++++++++++++++ .../RAS/{ras.rst => error-decoding.rst} | 11 +++------ Documentation/RAS/index.rst | 14 +++++++++++ Documentation/index.rst | 2 +- MAINTAINERS | 1 + 5 files changed, 43 insertions(+), 9 deletions(-) create mode 100644 Documentation/RAS/address-translation.rst rename Documentation/RAS/{ras.rst => error-decoding.rst} (73%) create mode 100644 Documentation/RAS/index.rst diff --git a/Documentation/RAS/address-translation.rst b/Documentation/RAS/address-translation.rst new file mode 100644 index 0000000000000..f0ca17b43cd3d --- /dev/null +++ b/Documentation/RAS/address-translation.rst @@ -0,0 +1,24 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Address translation +=================== + +x86 AMD +------- + +Zen-based AMD systems include a Data Fabric that manages the layout of +physical memory. Devices attached to the Fabric, like memory controllers, +I/O, etc., may not have a complete view of the system physical memory map. +These devices may provide a "normalized", i.e. device physical, address +when reporting memory errors. Normalized addresses must be translated to +a system physical address for the kernel to action on the memory. + +AMD Address Translation Library (CONFIG_AMD_ATL) provides translation for +this case. + +Glossary of acronyms used in address translation for Zen-based systems + +* CCM = Cache Coherent Moderator +* COD = Cluster-on-Die +* COH_ST = Coherent Station +* DF = Data Fabric diff --git a/Documentation/RAS/ras.rst b/Documentation/RAS/error-decoding.rst similarity index 73% rename from Documentation/RAS/ras.rst rename to Documentation/RAS/error-decoding.rst index 2556b397cd271..26a72f3fe5de8 100644 --- a/Documentation/RAS/ras.rst +++ b/Documentation/RAS/error-decoding.rst @@ -1,15 +1,10 @@ .. SPDX-License-Identifier: GPL-2.0 -Reliability, Availability and Serviceability features -===================================================== - -This documents different aspects of the RAS functionality present in the -kernel. - Error decoding ---------------- +============== -* x86 +x86 +--- Error decoding on AMD systems should be done using the rasdaemon tool: https://github.com/mchehab/rasdaemon/ diff --git a/Documentation/RAS/index.rst b/Documentation/RAS/index.rst new file mode 100644 index 0000000000000..2794c1816e906 --- /dev/null +++ b/Documentation/RAS/index.rst @@ -0,0 +1,14 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=========================================================== +Reliability, Availability and Serviceability (RAS) features +=========================================================== + +This documents different aspects of the RAS functionality present in the +kernel. + +.. toctree:: + :maxdepth: 2 + + error-decoding + address-translation diff --git a/Documentation/index.rst b/Documentation/index.rst index 36e61783437c1..07f2aa07f0fa0 100644 --- a/Documentation/index.rst +++ b/Documentation/index.rst @@ -113,7 +113,7 @@ to ReStructured Text format, or are simply too old. 
   :maxdepth: 1

   staging/index
-   RAS/ras
+   RAS/index


 Translations
diff --git a/MAINTAINERS b/MAINTAINERS
index 25537a37338e5..5b945fd5a3b91 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -18359,6 +18359,7 @@ M: Tony Luck
 M: Borislav Petkov
 L: linux-edac@vger.kernel.org
 S: Maintained
+F: Documentation/RAS/
 F: Documentation/admin-guide/ras.rst
 F: drivers/ras/
 F: include/linux/ras.h
--
GitLab

From bb998361999e79bc87dae1ebe0f5bf317f632585 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan
Date: Mon, 8 Jan 2024 11:39:50 +0000
Subject: [PATCH 0078/1519] x86/entry: Avoid redundant CR3 write on paranoid
 returns

The CR3 restore happens in:

1. #NMI return.
2. paranoid_exit() (i.e. #MCE, #VC, #DB and #DF return)

Contrary to the implication in commit 21e94459110252 ("x86/mm:
Optimize RESTORE_CR3"), the kernel never modifies CR3 in any of these
exceptions, except for switching from user to kernel pagetables under
PTI. That means that most of the time when returning from an exception
that interrupted the kernel, no CR3 restore is necessary.

Writing CR3 is expensive on some machines. Most of the time this is
because the interrupt might have come during kernel entry before the
user-to-kernel CR3 switch, or during the exit after the kernel-to-user
switch. In the former case skipping the restore would be correct, but
definitely not for the latter.

So check the saved CR3 value and restore it only if it is a user CR3.

Give the macro a new name to clarify its usage, and remove a comment
that was describing the original behaviour along with the no longer
needed jump label.

Signed-off-by: Lai Jiangshan
Signed-off-by: Brendan Jackman
Signed-off-by: Thomas Gleixner
Link: https://lore.kernel.org/r/20240108113950.360438-1-jackmanb@google.com
[Rewrote commit message; responded to review comments]
Change-Id: I6e56978c4753fb943a7897ff101f519514fa0827
---
 arch/x86/entry/calling.h  | 26 ++++++++++----------------
 arch/x86/entry/entry_64.S |  7 +++----
 2 files changed, 13 insertions(+), 20 deletions(-)

diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 9f1d94790a549..92dca4a4a15c5 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -239,17 +239,19 @@ For 32-bit we have the following conventions - kernel is built with
 .Ldone_\@:
 .endm

-.macro RESTORE_CR3 scratch_reg:req save_reg:req
+/* Restore CR3 from a kernel context. May restore a user CR3 value. */
+.macro PARANOID_RESTORE_CR3 scratch_reg:req save_reg:req
 	ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI

-	ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID
-
 	/*
-	 * KERNEL pages can always resume with NOFLUSH as we do
-	 * explicit flushes.
+	 * If CR3 contained the kernel page tables at the paranoid exception
+	 * entry, then there is nothing to restore as CR3 is not modified while
+	 * handling the exception.
 	 */
 	bt	$PTI_USER_PGTABLE_BIT, \save_reg
-	jnc	.Lnoflush_\@
+	jnc	.Lend_\@
+
+	ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID

 	/*
 	 * Check if there's a pending flush for the user ASID we're
@@ -257,20 +259,12 @@ For 32-bit we have the following conventions - kernel is built with
 	 */
 	movq	\save_reg, \scratch_reg
 	andq	$(0x7FF), \scratch_reg
-	bt	\scratch_reg, THIS_CPU_user_pcid_flush_mask
-	jnc	.Lnoflush_\@
-
 	btr	\scratch_reg, THIS_CPU_user_pcid_flush_mask
-	jmp	.Lwrcr3_\@
+	jc	.Lwrcr3_\@

-.Lnoflush_\@:
 	SET_NOFLUSH_BIT \save_reg

.Lwrcr3_\@:
-	/*
-	 * The CR3 write could be avoided when not changing its value,
-	 * but would require a CR3 read *and* a scratch register.
- */ movq \save_reg, %cr3 .Lend_\@: .endm @@ -285,7 +279,7 @@ For 32-bit we have the following conventions - kernel is built with .endm .macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req .endm -.macro RESTORE_CR3 scratch_reg:req save_reg:req +.macro PARANOID_RESTORE_CR3 scratch_reg:req save_reg:req .endm #endif diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index c40f89ab1b4c7..aedd169c46c9e 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -968,14 +968,14 @@ SYM_CODE_START_LOCAL(paranoid_exit) IBRS_EXIT save_reg=%r15 /* - * The order of operations is important. RESTORE_CR3 requires + * The order of operations is important. PARANOID_RESTORE_CR3 requires * kernel GSBASE. * * NB to anyone to try to optimize this code: this code does * not execute at all for exceptions from user mode. Those * exceptions go through error_return instead. */ - RESTORE_CR3 scratch_reg=%rax save_reg=%r14 + PARANOID_RESTORE_CR3 scratch_reg=%rax save_reg=%r14 /* Handle the three GSBASE cases */ ALTERNATIVE "jmp .Lparanoid_exit_checkgs", "", X86_FEATURE_FSGSBASE @@ -1404,8 +1404,7 @@ end_repeat_nmi: /* Always restore stashed SPEC_CTRL value (see paranoid_entry) */ IBRS_EXIT save_reg=%r15 - /* Always restore stashed CR3 value (see paranoid_entry) */ - RESTORE_CR3 scratch_reg=%r15 save_reg=%r14 + PARANOID_RESTORE_CR3 scratch_reg=%r15 save_reg=%r14 /* * The above invocation of paranoid_entry stored the GSBASE -- GitLab From a0c446dc4d9365a24d81f2ee024bdde46e40365f Mon Sep 17 00:00:00 2001 From: Dawei Li Date: Mon, 22 Jan 2024 16:57:12 +0800 Subject: [PATCH 0079/1519] irqchip/gic-v3: Use readl_relaxed_poll_timeout_atomic() Replace the open coded register polling loop with readl_relaxed_poll_timeout_atomic() which provides the same functionality. Signed-off-by: Dawei Li Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240122085716.2999875-2-dawei.li@shingroup.cn --- drivers/irqchip/irq-gic-v3.c | 32 +++++++++++++------------------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c index 98b0329b7154a..65cbf378eec43 100644 --- a/drivers/irqchip/irq-gic-v3.c +++ b/drivers/irqchip/irq-gic-v3.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -251,17 +252,13 @@ static inline void __iomem *gic_dist_base(struct irq_data *d) static void gic_do_wait_for_rwp(void __iomem *base, u32 bit) { - u32 count = 1000000; /* 1s! */ + u32 val; + int ret; - while (readl_relaxed(base + GICD_CTLR) & bit) { - count--; - if (!count) { - pr_err_ratelimited("RWP timeout, gone fishing\n"); - return; - } - cpu_relax(); - udelay(1); - } + ret = readl_relaxed_poll_timeout_atomic(base + GICD_CTLR, val, !(val & bit), + 1, USEC_PER_SEC); + if (ret == -ETIMEDOUT) + pr_err_ratelimited("RWP timeout, gone fishing\n"); } /* Wait for completion of a distributor change */ @@ -279,8 +276,8 @@ static void gic_redist_wait_for_rwp(void) static void gic_enable_redist(bool enable) { void __iomem *rbase; - u32 count = 1000000; /* 1s! 
 */
 	u32 val;
+	int ret;

 	if (gic_data.flags & FLAGS_WORKAROUND_GICR_WAKER_MSM8996)
 		return;
@@ -301,16 +298,13 @@ static void gic_enable_redist(bool enable)
 			return;	/* No PM support in this redistributor */
 	}

-	while (--count) {
-		val = readl_relaxed(rbase + GICR_WAKER);
-		if (enable ^ (bool)(val & GICR_WAKER_ChildrenAsleep))
-			break;
-		cpu_relax();
-		udelay(1);
-	}
-	if (!count)
+	ret = readl_relaxed_poll_timeout_atomic(rbase + GICR_WAKER, val,
+						enable ^ (bool)(val & GICR_WAKER_ChildrenAsleep),
+						1, USEC_PER_SEC);
+	if (ret == -ETIMEDOUT) {
 		pr_err_ratelimited("redistributor failed to %s...\n",
 				   enable ? "wakeup" : "sleep");
+	}
 }

 /*
--
GitLab

From d22083a5f09b2066728a91f3abb71284451247b1 Mon Sep 17 00:00:00 2001
From: Dawei Li
Date: Mon, 22 Jan 2024 16:57:13 +0800
Subject: [PATCH 0080/1519] irqchip/gic(v3): Replace gic_irq() with
 irqd_to_hwirq()

GIC & GIC-v3 share the same gic_irq() implementation, which serves the
exact same purpose as irqd_to_hwirq().

irqd_to_hwirq() is a generic and top-level API of the interrupt
subsystem; it is independent of any chip implementation.

Replace gic_irq() with irqd_to_hwirq() and convert struct
irq_data::hwirq to irq_hw_number_t explicitly.

Suggested-by: Marc Zyngier
Signed-off-by: Dawei Li
Signed-off-by: Thomas Gleixner
Link: https://lore.kernel.org/r/20240122085716.2999875-3-dawei.li@shingroup.cn
---
 drivers/irqchip/irq-gic-v3.c | 19 +++++++------------
 drivers/irqchip/irq-gic.c    | 27 ++++++++++++---------------
 include/linux/irq.h          |  2 +-
 3 files changed, 20 insertions(+), 28 deletions(-)

diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c
index 65cbf378eec43..20a75f0353cde 100644
--- a/drivers/irqchip/irq-gic-v3.c
+++ b/drivers/irqchip/irq-gic-v3.c
@@ -181,11 +181,6 @@ static enum gic_intid_range get_intid_range(struct irq_data *d)
 	return __get_intid_range(d->hwirq);
 }

-static inline unsigned int gic_irq(struct irq_data *d)
-{
-	return d->hwirq;
-}
-
 static inline bool gic_irq_in_rdist(struct irq_data *d)
 {
 	switch (get_intid_range(d)) {
@@ -542,7 +537,7 @@ static int gic_irq_nmi_setup(struct irq_data *d)
 	 * A secondary irq_chip should be in charge of LPI request,
 	 * it should not be possible to get there
 	 */
-	if (WARN_ON(gic_irq(d) >= 8192))
+	if (WARN_ON(irqd_to_hwirq(d) >= 8192))
 		return -EINVAL;

 	/* desc lock should already be held */
@@ -582,7 +577,7 @@ static void gic_irq_nmi_teardown(struct irq_data *d)
 	 * A secondary irq_chip should be in charge of LPI request,
 	 * it should not be possible to get there
 	 */
-	if (WARN_ON(gic_irq(d) >= 8192))
+	if (WARN_ON(irqd_to_hwirq(d) >= 8192))
 		return;

 	/* desc lock should already be held */
@@ -620,7 +615,7 @@ static bool gic_arm64_erratum_2941627_needed(struct irq_data *d)

 static void gic_eoi_irq(struct irq_data *d)
 {
-	write_gicreg(gic_irq(d), ICC_EOIR1_EL1);
+	write_gicreg(irqd_to_hwirq(d), ICC_EOIR1_EL1);
 	isb();

 	if (gic_arm64_erratum_2941627_needed(d)) {
@@ -640,19 +635,19 @@ static void gic_eoimode1_eoi_irq(struct irq_data *d)
 	 * No need to deactivate an LPI, or an interrupt that
 	 * is getting forwarded to a vcpu.
*/ - if (gic_irq(d) >= 8192 || irqd_is_forwarded_to_vcpu(d)) + if (irqd_to_hwirq(d) >= 8192 || irqd_is_forwarded_to_vcpu(d)) return; if (!gic_arm64_erratum_2941627_needed(d)) - gic_write_dir(gic_irq(d)); + gic_write_dir(irqd_to_hwirq(d)); else gic_poke_irq(d, GICD_ICACTIVER); } static int gic_set_type(struct irq_data *d, unsigned int type) { + irq_hw_number_t irq = irqd_to_hwirq(d); enum gic_intid_range range; - unsigned int irq = gic_irq(d); void __iomem *base; u32 offset, index; int ret; @@ -678,7 +673,7 @@ static int gic_set_type(struct irq_data *d, unsigned int type) ret = gic_configure_irq(index, type, base + offset, NULL); if (ret && (range == PPI_RANGE || range == EPPI_RANGE)) { /* Misconfigured PPIs are usually not fatal */ - pr_warn("GIC: PPI INTID%d is secure or misconfigured\n", irq); + pr_warn("GIC: PPI INTID%ld is secure or misconfigured\n", irq); ret = 0; } diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c index 412196a7dad58..98aa383e39db1 100644 --- a/drivers/irqchip/irq-gic.c +++ b/drivers/irqchip/irq-gic.c @@ -162,11 +162,6 @@ static inline void __iomem *gic_cpu_base(struct irq_data *d) return gic_data_cpu_base(gic_data); } -static inline unsigned int gic_irq(struct irq_data *d) -{ - return d->hwirq; -} - static inline bool cascading_gic_irq(struct irq_data *d) { void *data = irq_data_get_irq_handler_data(d); @@ -183,14 +178,16 @@ static inline bool cascading_gic_irq(struct irq_data *d) */ static void gic_poke_irq(struct irq_data *d, u32 offset) { - u32 mask = 1 << (gic_irq(d) % 32); - writel_relaxed(mask, gic_dist_base(d) + offset + (gic_irq(d) / 32) * 4); + u32 mask = 1 << (irqd_to_hwirq(d) % 32); + + writel_relaxed(mask, gic_dist_base(d) + offset + (irqd_to_hwirq(d) / 32) * 4); } static int gic_peek_irq(struct irq_data *d, u32 offset) { - u32 mask = 1 << (gic_irq(d) % 32); - return !!(readl_relaxed(gic_dist_base(d) + offset + (gic_irq(d) / 32) * 4) & mask); + u32 mask = 1 << (irqd_to_hwirq(d) % 32); + + return !!(readl_relaxed(gic_dist_base(d) + offset + (irqd_to_hwirq(d) / 32) * 4) & mask); } static void gic_mask_irq(struct irq_data *d) @@ -220,7 +217,7 @@ static void gic_unmask_irq(struct irq_data *d) static void gic_eoi_irq(struct irq_data *d) { - u32 hwirq = gic_irq(d); + irq_hw_number_t hwirq = irqd_to_hwirq(d); if (hwirq < 16) hwirq = this_cpu_read(sgi_intid); @@ -230,7 +227,7 @@ static void gic_eoi_irq(struct irq_data *d) static void gic_eoimode1_eoi_irq(struct irq_data *d) { - u32 hwirq = gic_irq(d); + irq_hw_number_t hwirq = irqd_to_hwirq(d); /* Do not deactivate an IRQ forwarded to a vcpu. */ if (irqd_is_forwarded_to_vcpu(d)) @@ -293,8 +290,8 @@ static int gic_irq_get_irqchip_state(struct irq_data *d, static int gic_set_type(struct irq_data *d, unsigned int type) { + irq_hw_number_t gicirq = irqd_to_hwirq(d); void __iomem *base = gic_dist_base(d); - unsigned int gicirq = gic_irq(d); int ret; /* Interrupt configuration for SGIs can't be changed */ @@ -309,7 +306,7 @@ static int gic_set_type(struct irq_data *d, unsigned int type) ret = gic_configure_irq(gicirq, type, base + GIC_DIST_CONFIG, NULL); if (ret && gicirq < 32) { /* Misconfigured PPIs are usually not fatal */ - pr_warn("GIC: PPI%d is secure or misconfigured\n", gicirq - 16); + pr_warn("GIC: PPI%ld is secure or misconfigured\n", gicirq - 16); ret = 0; } @@ -319,7 +316,7 @@ static int gic_set_type(struct irq_data *d, unsigned int type) static int gic_irq_set_vcpu_affinity(struct irq_data *d, void *vcpu) { /* Only interrupts on the primary GIC can be forwarded to a vcpu. 
*/ - if (cascading_gic_irq(d) || gic_irq(d) < 16) + if (cascading_gic_irq(d) || irqd_to_hwirq(d) < 16) return -EINVAL; if (vcpu) @@ -796,7 +793,7 @@ static void rmw_writeb(u8 bval, void __iomem *addr) static int gic_set_affinity(struct irq_data *d, const struct cpumask *mask_val, bool force) { - void __iomem *reg = gic_dist_base(d) + GIC_DIST_TARGET + gic_irq(d); + void __iomem *reg = gic_dist_base(d) + GIC_DIST_TARGET + irqd_to_hwirq(d); struct gic_chip_data *gic = irq_data_get_irq_chip_data(d); unsigned int cpu; diff --git a/include/linux/irq.h b/include/linux/irq.h index 90081afa10ce5..97baa937ab5b9 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -179,7 +179,7 @@ struct irq_common_data { struct irq_data { u32 mask; unsigned int irq; - unsigned long hwirq; + irq_hw_number_t hwirq; struct irq_common_data *common; struct irq_chip *chip; struct irq_domain *domain; -- GitLab From 9676635685fe348003a29948d9726e5d9e4b4a6e Mon Sep 17 00:00:00 2001 From: Dawei Li Date: Mon, 22 Jan 2024 16:57:14 +0800 Subject: [PATCH 0081/1519] genirq: Remove unneeded forward declaration The prototype of irq_flow_handler_t is independent of irq_data, so remove the unneeded forward declaration. Signed-off-by: Dawei Li Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240122085716.2999875-4-dawei.li@shingroup.cn --- include/linux/irqhandler.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/irqhandler.h b/include/linux/irqhandler.h index c30f454a9518f..72dd1eb3a0e7f 100644 --- a/include/linux/irqhandler.h +++ b/include/linux/irqhandler.h @@ -8,7 +8,7 @@ */ struct irq_desc; -struct irq_data; + typedef void (*irq_flow_handler_t)(struct irq_desc *desc); #endif -- GitLab From 22653244a9fed06f2f864b44808a85bf5c4e3ef2 Mon Sep 17 00:00:00 2001 From: Dawei Li Date: Mon, 22 Jan 2024 16:57:16 +0800 Subject: [PATCH 0082/1519] genirq: Deduplicate interrupt descriptor initialization alloc_desc() and early_irq_init() contain duplicated code to initialize interrupt descriptors. Replace that with a helper function.
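As a rough, self-contained illustration of the factor-out pattern this patch applies (userspace C with hypothetical names, not the kernel code itself): one helper owns the common initialization and its error unwinding, and both the dynamic and the static setup paths call it.

#include <stdlib.h>

struct desc {
	int   irq;
	void *stats;
};

/* Shared initializer: the only place that knows the init steps. */
static int init_desc(struct desc *d, int irq)
{
	d->stats = calloc(1, 64);
	if (!d->stats)
		return -1;	/* -ENOMEM, in kernel terms */
	d->irq = irq;
	return 0;
}

/* Dynamic path, analogous to alloc_desc(). */
static struct desc *alloc_one(int irq)
{
	struct desc *d = calloc(1, sizeof(*d));

	if (d && init_desc(d, irq)) {	/* single cleanup path */
		free(d);
		return NULL;
	}
	return d;
}

/* Static path, analogous to early_irq_init(). */
static struct desc early_descs[16];

static int early_init(void)
{
	for (int i = 0; i < 16; i++)
		if (init_desc(&early_descs[i], i))
			return -1;	/* caller unwinds entries 0..i-1 */
	return 0;
}

The payoff is the same as in the patch below: the initialization steps and their failure handling live in exactly one function, so the two call sites cannot drift apart.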
Suggested-by: Marc Zyngier Signed-off-by: Dawei Li Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240122085716.2999875-6-dawei.li@shingroup.cn --- kernel/irq/irqdesc.c | 112 ++++++++++++++++++++++++------------------- 1 file changed, 64 insertions(+), 48 deletions(-) diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 371eb1711d346..4c6b32318ce35 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -92,11 +92,23 @@ static void desc_smp_init(struct irq_desc *desc, int node, #endif } +static void free_masks(struct irq_desc *desc) +{ +#ifdef CONFIG_GENERIC_PENDING_IRQ + free_cpumask_var(desc->pending_mask); +#endif + free_cpumask_var(desc->irq_common_data.affinity); +#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK + free_cpumask_var(desc->irq_common_data.effective_affinity); +#endif +} + #else static inline int alloc_masks(struct irq_desc *desc, int node) { return 0; } static inline void desc_smp_init(struct irq_desc *desc, int node, const struct cpumask *affinity) { } +static inline void free_masks(struct irq_desc *desc) { } #endif static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node, @@ -165,6 +177,39 @@ static void delete_irq_desc(unsigned int irq) mas_erase(&mas); } +#ifdef CONFIG_SPARSE_IRQ +static const struct kobj_type irq_kobj_type; +#endif + +static int init_desc(struct irq_desc *desc, int irq, int node, + unsigned int flags, + const struct cpumask *affinity, + struct module *owner) +{ + desc->kstat_irqs = alloc_percpu(unsigned int); + if (!desc->kstat_irqs) + return -ENOMEM; + + if (alloc_masks(desc, node)) { + free_percpu(desc->kstat_irqs); + return -ENOMEM; + } + + raw_spin_lock_init(&desc->lock); + lockdep_set_class(&desc->lock, &irq_desc_lock_class); + mutex_init(&desc->request_mutex); + init_waitqueue_head(&desc->wait_for_threads); + desc_set_defaults(irq, desc, node, affinity, owner); + irqd_set(&desc->irq_data, flags); + irq_resend_init(desc); +#ifdef CONFIG_SPARSE_IRQ + kobject_init(&desc->kobj, &irq_kobj_type); + init_rcu_head(&desc->rcu); +#endif + + return 0; +} + #ifdef CONFIG_SPARSE_IRQ static void irq_kobj_release(struct kobject *kobj); @@ -384,21 +429,6 @@ struct irq_desc *irq_to_desc(unsigned int irq) EXPORT_SYMBOL_GPL(irq_to_desc); #endif -#ifdef CONFIG_SMP -static void free_masks(struct irq_desc *desc) -{ -#ifdef CONFIG_GENERIC_PENDING_IRQ - free_cpumask_var(desc->pending_mask); -#endif - free_cpumask_var(desc->irq_common_data.affinity); -#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK - free_cpumask_var(desc->irq_common_data.effective_affinity); -#endif -} -#else -static inline void free_masks(struct irq_desc *desc) { } -#endif - void irq_lock_sparse(void) { mutex_lock(&sparse_irq_lock); @@ -414,36 +444,19 @@ static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags, struct module *owner) { struct irq_desc *desc; + int ret; desc = kzalloc_node(sizeof(*desc), GFP_KERNEL, node); if (!desc) return NULL; - /* allocate based on nr_cpu_ids */ - desc->kstat_irqs = alloc_percpu(unsigned int); - if (!desc->kstat_irqs) - goto err_desc; - - if (alloc_masks(desc, node)) - goto err_kstat; - raw_spin_lock_init(&desc->lock); - lockdep_set_class(&desc->lock, &irq_desc_lock_class); - mutex_init(&desc->request_mutex); - init_rcu_head(&desc->rcu); - init_waitqueue_head(&desc->wait_for_threads); - - desc_set_defaults(irq, desc, node, affinity, owner); - irqd_set(&desc->irq_data, flags); - kobject_init(&desc->kobj, &irq_kobj_type); - irq_resend_init(desc); + ret = init_desc(desc, irq, node, flags, affinity, 
owner); + if (unlikely(ret)) { + kfree(desc); + return NULL; + } return desc; - -err_kstat: - free_percpu(desc->kstat_irqs); -err_desc: - kfree(desc); - return NULL; } static void irq_kobj_release(struct kobject *kobj) @@ -583,26 +596,29 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { int __init early_irq_init(void) { int count, i, node = first_online_node; - struct irq_desc *desc; + int ret; init_irq_default_affinity(); printk(KERN_INFO "NR_IRQS: %d\n", NR_IRQS); - desc = irq_desc; count = ARRAY_SIZE(irq_desc); for (i = 0; i < count; i++) { - desc[i].kstat_irqs = alloc_percpu(unsigned int); - alloc_masks(&desc[i], node); - raw_spin_lock_init(&desc[i].lock); - lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); - mutex_init(&desc[i].request_mutex); - init_waitqueue_head(&desc[i].wait_for_threads); - desc_set_defaults(i, &desc[i], node, NULL, NULL); - irq_resend_init(&desc[i]); + ret = init_desc(irq_desc + i, i, node, 0, NULL, NULL); + if (unlikely(ret)) + goto __free_desc_res; } + return arch_early_irq_init(); + +__free_desc_res: + while (--i >= 0) { + free_masks(irq_desc + i); + free_percpu(irq_desc[i].kstat_irqs); + } + + return ret; } struct irq_desc *irq_to_desc(unsigned int irq) -- GitLab From fc747eebef734563cf68a512f57937c8f231834a Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Wed, 24 Jan 2024 11:52:56 -0600 Subject: [PATCH 0083/1519] x86/resctrl: Remove redundant variable in mbm_config_write_domain() The kernel test robot reported the following warning after commit 54e35eb8611c ("x86/resctrl: Read supported bandwidth sources from CPUID"), even though the issue is present in the original commit 92bd5a139033 ("x86/resctrl: Add interface to write mbm_total_bytes_config") which added this function. The reported warning is: $ make C=1 CHECK=scripts/coccicheck arch/x86/kernel/cpu/resctrl/rdtgroup.o ... arch/x86/kernel/cpu/resctrl/rdtgroup.c:1621:5-8: Unneeded variable: "ret". Return "0" on line 1655 Remove the local variable 'ret'. [ bp: Massage commit message, make mbm_config_write_domain() void. ] Fixes: 92bd5a139033 ("x86/resctrl: Add interface to write mbm_total_bytes_config") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202401241810.jbd8Ipa1-lkp@intel.com/ Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Acked-by: Reinette Chatre Link: https://lore.kernel.org/r/202401241810.jbd8Ipa1-lkp@intel.com --- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 2b69e560b05f1..aa24343f1d237 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -1614,11 +1614,10 @@ static void mon_event_config_write(void *info) wrmsr(MSR_IA32_EVT_CFG_BASE + index, mon_info->mon_config, 0); } -static int mbm_config_write_domain(struct rdt_resource *r, - struct rdt_domain *d, u32 evtid, u32 val) +static void mbm_config_write_domain(struct rdt_resource *r, + struct rdt_domain *d, u32 evtid, u32 val) { struct mon_config_info mon_info = {0}; - int ret = 0; /* * Read the current config value first.
If both are the same then @@ -1627,7 +1626,7 @@ static int mbm_config_write_domain(struct rdt_resource *r, mon_info.evtid = evtid; mondata_config_read(d, &mon_info); if (mon_info.mon_config == val) - goto out; + return; mon_info.mon_config = val; @@ -1650,9 +1649,6 @@ static int mbm_config_write_domain(struct rdt_resource *r, * mbm_local and mbm_total counts for all the RMIDs. */ resctrl_arch_reset_rmid_all(r, d); - -out: - return ret; } static int mon_config_write(struct rdt_resource *r, char *tok, u32 evtid) @@ -1661,7 +1657,6 @@ static int mon_config_write(struct rdt_resource *r, char *tok, u32 evtid) char *dom_str = NULL, *id_str; unsigned long dom_id, val; struct rdt_domain *d; - int ret = 0; next: if (!tok || tok[0] == '\0') @@ -1690,9 +1685,7 @@ static int mon_config_write(struct rdt_resource *r, char *tok, u32 evtid) list_for_each_entry(d, &r->domains, list) { if (d->id == dom_id) { - ret = mbm_config_write_domain(r, d, evtid, val); - if (ret) - return -EINVAL; + mbm_config_write_domain(r, d, evtid, val); goto next; } } -- GitLab From 8246601a7d391ce8207408149d65732f28af81a1 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Wed, 20 Dec 2023 01:50:43 +0800 Subject: [PATCH 0084/1519] riscv: tlb: fix __p*d_free_tlb() If a non-leaf PTE, i.e. a pmd, pud or p4d entry, is modified, a sfence.vma is required for safety: an implementation may cache non-leaf translations in the TLB. No such hardware has been observed so far, but it is possible in theory. Signed-off-by: Jisheng Zhang Fixes: c5e9b2c2ae82 ("riscv: Improve tlb_flush()") Link: https://lore.kernel.org/r/20231219175046.2496-2-jszhang@kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/pgalloc.h | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/arch/riscv/include/asm/pgalloc.h b/arch/riscv/include/asm/pgalloc.h index d169a4f41a2e7..c80bb9990d32e 100644 --- a/arch/riscv/include/asm/pgalloc.h +++ b/arch/riscv/include/asm/pgalloc.h @@ -95,7 +95,13 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pud) __pud_free(mm, pud); } -#define __pud_free_tlb(tlb, pud, addr) pud_free((tlb)->mm, pud) +#define __pud_free_tlb(tlb, pud, addr) \ +do { \ + if (pgtable_l4_enabled) { \ + pagetable_pud_dtor(virt_to_ptdesc(pud)); \ + tlb_remove_page_ptdesc((tlb), virt_to_ptdesc(pud)); \ + } \ +} while (0) #define p4d_alloc_one p4d_alloc_one static inline p4d_t *p4d_alloc_one(struct mm_struct *mm, unsigned long addr) @@ -124,7 +130,11 @@ static inline void p4d_free(struct mm_struct *mm, p4d_t *p4d) __p4d_free(mm, p4d); } -#define __p4d_free_tlb(tlb, p4d, addr) p4d_free((tlb)->mm, p4d) +#define __p4d_free_tlb(tlb, p4d, addr) \ +do { \ + if (pgtable_l5_enabled) \ + tlb_remove_page_ptdesc((tlb), virt_to_ptdesc(p4d)); \ +} while (0) #endif /* __PAGETABLE_PMD_FOLDED */ static inline void sync_kernel_mappings(pgd_t *pgd) @@ -149,7 +159,11 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm) #ifndef __PAGETABLE_PMD_FOLDED -#define __pmd_free_tlb(tlb, pmd, addr) pmd_free((tlb)->mm, pmd) +#define __pmd_free_tlb(tlb, pmd, addr) \ +do { \ + pagetable_pmd_dtor(virt_to_ptdesc(pmd)); \ + tlb_remove_page_ptdesc((tlb), virt_to_ptdesc(pmd)); \ +} while (0) #endif /* __PAGETABLE_PMD_FOLDED */ -- GitLab From 983a73da1f996faee9997149eb05b12fa7bd8cbf Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 24 Jan 2024 00:13:54 -0800 Subject: [PATCH 0085/1519] xfrm: Pass UDP encapsulation in TX packet offload In addition to the commit cited in the Fixes line, allow UDP encapsulation in the TX path too.
Fixes: 89edf40220be ("xfrm: Support UDP encapsulation in packet offload mode") CC: Steffen Klassert Reported-by: Mike Yu Signed-off-by: Leon Romanovsky Signed-off-by: Saeed Mahameed Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 3784534c91855..653e51ae39648 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -407,7 +407,7 @@ bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x) struct xfrm_dst *xdst = (struct xfrm_dst *)dst; struct net_device *dev = x->xso.dev; - if (!x->type_offload || x->encap) + if (!x->type_offload) return false; if (x->xso.type == XFRM_DEV_OFFLOAD_PACKET || -- GitLab From 2263639f96f24a121ec9f037981b81daf5a8d60a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 23 Jan 2024 15:24:46 -0700 Subject: [PATCH 0086/1519] iov_iter: streamline iovec/bvec alignment iteration Rewrite the alignment checking iterators for iovec and bvec to be easier to read, and also significantly more compact in terms of generated code. This saves 270 bytes of text on x86-64 for me (with clang-18) and 224 bytes on arm64 (with gcc-13). In profiles, also saves a bit of time as well for the same workload: 0.81% -0.18% [kernel.vmlinux] [k] iov_iter_aligned_bvec 0.48% -0.09% [kernel.vmlinux] [k] iov_iter_is_aligned which is a nice side benefit as well. Signed-off-by: Jens Axboe Link: https://lore.kernel.org/r/544b31f7-6d4b-42f5-a544-1420501f081f@kernel.dk Reviewed-by: Keith Busch Signed-off-by: Christian Brauner v2: do the other half of the iterators too, as suggested by Keith. This further saves some text. --- lib/iov_iter.c | 55 +++++++++++++++++++++++++------------------------- 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/lib/iov_iter.c b/lib/iov_iter.c index e0aa6b440ca5f..15f5040709c36 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -714,12 +714,11 @@ EXPORT_SYMBOL(iov_iter_discard); static bool iov_iter_aligned_iovec(const struct iov_iter *i, unsigned addr_mask, unsigned len_mask) { + const struct iovec *iov = iter_iov(i); size_t size = i->count; size_t skip = i->iov_offset; - unsigned k; - for (k = 0; k < i->nr_segs; k++, skip = 0) { - const struct iovec *iov = iter_iov(i) + k; + do { size_t len = iov->iov_len - skip; if (len > size) @@ -729,34 +728,36 @@ static bool iov_iter_aligned_iovec(const struct iov_iter *i, unsigned addr_mask, if ((unsigned long)(iov->iov_base + skip) & addr_mask) return false; + iov++; size -= len; - if (!size) - break; - } + skip = 0; + } while (size); + return true; } static bool iov_iter_aligned_bvec(const struct iov_iter *i, unsigned addr_mask, unsigned len_mask) { - size_t size = i->count; + const struct bio_vec *bvec = i->bvec; unsigned skip = i->iov_offset; - unsigned k; + size_t size = i->count; - for (k = 0; k < i->nr_segs; k++, skip = 0) { - size_t len = i->bvec[k].bv_len - skip; + do { + size_t len = bvec->bv_len; if (len > size) len = size; if (len & len_mask) return false; - if ((unsigned long)(i->bvec[k].bv_offset + skip) & addr_mask) + if ((unsigned long)(bvec->bv_offset + skip) & addr_mask) return false; + bvec++; size -= len; - if (!size) - break; - } + skip = 0; + } while (size); + return true; } @@ -800,13 +801,12 @@ EXPORT_SYMBOL_GPL(iov_iter_is_aligned); static unsigned long iov_iter_alignment_iovec(const struct iov_iter *i) { + const struct iovec *iov = iter_iov(i); unsigned long res = 0; size_t size = i->count; size_t skip = i->iov_offset; - unsigned k; - for (k 
= 0; k < i->nr_segs; k++, skip = 0) { - const struct iovec *iov = iter_iov(i) + k; + do { size_t len = iov->iov_len - skip; if (len) { res |= (unsigned long)iov->iov_base + skip; @@ -814,30 +814,31 @@ static unsigned long iov_iter_alignment_iovec(const struct iov_iter *i) len = size; res |= len; size -= len; - if (!size) - break; } - } + iov++; + skip = 0; + } while (size); return res; } static unsigned long iov_iter_alignment_bvec(const struct iov_iter *i) { + const struct bio_vec *bvec = i->bvec; unsigned res = 0; size_t size = i->count; unsigned skip = i->iov_offset; - unsigned k; - for (k = 0; k < i->nr_segs; k++, skip = 0) { - size_t len = i->bvec[k].bv_len - skip; - res |= (unsigned long)i->bvec[k].bv_offset + skip; + do { + size_t len = bvec->bv_len - skip; + res |= (unsigned long)bvec->bv_offset + skip; if (len > size) len = size; res |= len; + bvec++; size -= len; - if (!size) - break; - } + skip = 0; + } while (size); + return res; } -- GitLab From a6b48c83d28e21ddcd6a080128bb73f9e3d130ac Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 25 Jan 2024 06:21:56 -1000 Subject: [PATCH 0087/1519] tools/workqueue/wq_dump.py: Clean up code and drop duplicate information - Factor out wq_type_str() - Improve formatting so that it adapts to actual field widths. - Drop duplicate information from "Workqueue -> rescuer" section. If anything, we should add more rescuer-specific info - e.g. the number of work items rescued. Signed-off-by: Tejun Heo Cc: Juri Lelli --- tools/workqueue/wq_dump.py | 69 +++++++++++++++++++------------------- 1 file changed, 35 insertions(+), 34 deletions(-) diff --git a/tools/workqueue/wq_dump.py b/tools/workqueue/wq_dump.py index 6da621989e210..333b2fc00b829 100644 --- a/tools/workqueue/wq_dump.py +++ b/tools/workqueue/wq_dump.py @@ -75,6 +75,20 @@ def cpumask_str(cpumask): output += f'{v:08x}' return output.strip() +wq_type_len = 9 + +def wq_type_str(wq): + if wq.flags & WQ_UNBOUND: + if wq.flags & WQ_ORDERED: + return f'{"ordered":{wq_type_len}}' + else: + if wq.unbound_attrs.affn_strict: + return f'{"unbound,S":{wq_type_len}}' + else: + return f'{"unbound":{wq_type_len}}' + else: + return f'{"percpu":{wq_type_len}}' + worker_pool_idr = prog['worker_pool_idr'] workqueues = prog['workqueues'] wq_unbound_cpumask = prog['wq_unbound_cpumask'] @@ -92,6 +106,10 @@ WQ_AFFN_CACHE = prog['WQ_AFFN_CACHE'] WQ_AFFN_NUMA = prog['WQ_AFFN_NUMA'] WQ_AFFN_SYSTEM = prog['WQ_AFFN_SYSTEM'] +WQ_NAME_LEN = prog['WQ_NAME_LEN'].value_() + +cpumask_str_len = len(cpumask_str(wq_unbound_cpumask)) + print('Affinity Scopes') print('===============') @@ -148,24 +166,13 @@ print('') print('Workqueue CPU -> pool') print('=====================') -print('[ workqueue \ type CPU', end='') +print(f'[{"workqueue":^{WQ_NAME_LEN-2}}\\ {"type CPU":{wq_type_len}}', end='') for cpu in for_each_possible_cpu(prog): print(f' {cpu:{max_pool_id_len}}', end='') print(' dfl]') for wq in list_for_each_entry('struct workqueue_struct', workqueues.address_of_(), 'list'): - print(f'{wq.name.string_().decode()[-24:]:24}', end='') - if wq.flags & WQ_UNBOUND: - if wq.flags & WQ_ORDERED: - print(' ordered ', end='') - else: - print(' unbound', end='') - if wq.unbound_attrs.affn_strict: - print(',S ', end='') - else: - print(' ', end='') - else: - print(' percpu ', end='') + print(f'{wq.name.string_().decode():{WQ_NAME_LEN}} {wq_type_str(wq):10}', end='') for cpu in for_each_possible_cpu(prog): pool_id = per_cpu_ptr(wq.cpu_pwq, cpu)[0].pool.id.value_() @@ -178,29 +185,23 @@ for wq in list_for_each_entry('struct 
workqueue_struct', workqueues.address_of_( print('') print('Workqueue -> rescuer') -print('=====================') -print(f'wq_unbound_cpumask={cpumask_str(wq_unbound_cpumask)}') -print('') -print('[ workqueue \ type unbound_cpumask rescuer pid cpumask]') +print('====================') + +ucpus_len = max(cpumask_str_len, len("unbound_cpus")) +rcpus_len = max(cpumask_str_len, len("rescuer_cpus")) + +print(f'[{"workqueue":^{WQ_NAME_LEN-2}}\\ {"unbound_cpus":{ucpus_len}} pid {"rescuer_cpus":{rcpus_len}} ]') for wq in list_for_each_entry('struct workqueue_struct', workqueues.address_of_(), 'list'): - print(f'{wq.name.string_().decode()[-24:]:24}', end='') - if wq.flags & WQ_UNBOUND: - if wq.flags & WQ_ORDERED: - print(' ordered ', end='') - else: - print(' unbound', end='') - if wq.unbound_attrs.affn_strict: - print(',S ', end='') - else: - print(' ', end='') - print(f' {cpumask_str(wq.unbound_attrs.cpumask):24}', end='') + if not (wq.flags & WQ_MEM_RECLAIM): + continue + + print(f'{wq.name.string_().decode():{WQ_NAME_LEN}}', end='') + if wq.unbound_attrs.value_() != 0: + print(f' {cpumask_str(wq.unbound_attrs.cpumask):{ucpus_len}}', end='') else: - print(' percpu ', end='') - print(' ', end='') + print(f' {"":{ucpus_len}}', end='') - if wq.flags & WQ_MEM_RECLAIM: - print(f' {wq.rescuer.task.comm.string_().decode()[-24:]:24}', end='') - print(f' {wq.rescuer.task.pid.value_():5}', end='') - print(f' {cpumask_str(wq.rescuer.task.cpus_ptr)}', end='') + print(f' {wq.rescuer.task.pid.value_():6}', end='') + print(f' {cpumask_str(wq.rescuer.task.cpus_ptr):{rcpus_len}}', end='') print('') -- GitLab From d8f899d13d72d285db43dbb9df1acaed22d8c4e7 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Wed, 24 Jan 2024 22:28:55 +0800 Subject: [PATCH 0088/1519] fs: make the i_size_read/write helpers be smp_load_acquire/store_release() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In [Link] Linus mentions that acquire/release makes it clear which _particular_ memory accesses are the ordered ones, and it's unlikely to make any performance difference, so it's much better to pair up the release->acquire ordering than have a "wmb->rmb" ordering. ========================================================= update pagecache folio_mark_uptodate(folio) smp_wmb() set_bit PG_uptodate === ↑↑↑ STLR ↑↑↑ === smp_store_release(&inode->i_size, i_size) folio_test_uptodate(folio) test_bit PG_uptodate smp_rmb() === ↓↓↓ LDAR ↓↓↓ === smp_load_acquire(&inode->i_size) copy_page_to_iter() ========================================================= Calling smp_store_release() in i_size_write() ensures that the data in the page and the PG_uptodate bit are updated before the isize is updated, and calling smp_load_acquire() in i_size_read ensures that it will not read a newer isize than the data in the page. Therefore, this avoids buffered read-write inconsistencies caused by Load-Load reordering. 
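To make the pairing concrete, here is a minimal userspace sketch using C11 atomics in place of the kernel helpers (all names are hypothetical): the release store that publishes the size cannot become visible before the data write, and the acquire load cannot return a size newer than the data it covers.

#include <stdatomic.h>

static char page[4096];		/* stands in for the page cache   */
static _Atomic long isize;	/* stands in for inode->i_size    */

/* Writer, i_size_write() semantics: data first, then release store. */
void extend_file(long newsize, char byte)
{
	page[newsize - 1] = byte;
	atomic_store_explicit(&isize, newsize, memory_order_release);
}

/* Reader, i_size_read() semantics: acquire load, then data access. */
long read_tail(char *out)
{
	long size = atomic_load_explicit(&isize, memory_order_acquire);

	if (size > 0)
		*out = page[size - 1];	/* guaranteed to see the writer's byte */
	return size;
}

On arm64 this compiles to exactly the STLR/LDAR pair shown in the diagram above, which is why pairing release with acquire is preferred over the looser wmb/rmb arrangement.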
Link: https://lore.kernel.org/r/CAHk-=wifOnmeJq+sn+2s-P46zw0SFEbw9BSCGgp2c5fYPtRPGw@mail.gmail.com/ Suggested-by: Linus Torvalds Signed-off-by: Baokun Li Link: https://lore.kernel.org/r/20240124142857.4146716-2-libaokun1@huawei.com Signed-off-by: Christian Brauner --- include/linux/fs.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index 4f6669147b9e8..ebce4763b4bb9 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -907,7 +907,8 @@ static inline loff_t i_size_read(const struct inode *inode) preempt_enable(); return i_size; #else - return inode->i_size; + /* Pairs with smp_store_release() in i_size_write() */ + return smp_load_acquire(&inode->i_size); #endif } @@ -929,7 +930,12 @@ static inline void i_size_write(struct inode *inode, loff_t i_size) inode->i_size = i_size; preempt_enable(); #else - inode->i_size = i_size; + /* + * Pairs with smp_load_acquire() in i_size_read() to ensure + * changes related to inode size (such as page contents) are + * visible before we see the changed inode size. + */ + smp_store_release(&inode->i_size, i_size); #endif } -- GitLab From 4b944f8ef99641d5af287c7d9df91d20ef5d3e88 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Wed, 24 Jan 2024 22:28:56 +0800 Subject: [PATCH 0089/1519] Revert "mm/filemap: avoid buffered read/write race to read inconsistent data" This reverts commit e2c27b803bb6 ("mm/filemap: avoid buffered read/write race to read inconsistent data"). After making the i_size_read/write helpers be smp_load_acquire/store_release(), it is already guaranteed that changes to page contents are visible before we see increased inode size, so the extra smp_rmb() in filemap_read() can be removed. Signed-off-by: Baokun Li Link: https://lore.kernel.org/r/20240124142857.4146716-3-libaokun1@huawei.com Signed-off-by: Christian Brauner --- mm/filemap.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 750e779c23db7..a72dd2eafd5ac 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2608,15 +2608,6 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, goto put_folios; end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count); - /* - * Pairs with a barrier in - * block_write_end()->mark_buffer_dirty() or other page - * dirtying routines like iomap_write_end() to ensure - * changes to page contents are visible before we see - * increased inode size. - */ - smp_rmb(); - /* * Once we start copying data, we don't want to be touching any * cachelines that might be contended: -- GitLab From ad72872eb3ae634d1e4296a384baa81e85cc6acc Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Wed, 24 Jan 2024 22:28:57 +0800 Subject: [PATCH 0090/1519] asm-generic: remove extra type checking in acquire/release for non-SMP case If CONFIG_SMP is not enabled, smp_load_acquire/smp_store_release is implemented as READ_ONCE/WRITE_ONCE plus barrier() and type checking. READ_ONCE/WRITE_ONCE already check the pointer type, and the additional, more stringent compiletime_assert_atomic_type() check is simply not relevant in the non-SMP case, so remove it to avoid compilation errors.
Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202401230837.TXro0PHi-lkp@intel.com/ Suggested-by: Linus Torvalds Signed-off-by: Baokun Li Link: https://lore.kernel.org/r/20240124142857.4146716-4-libaokun1@huawei.com Signed-off-by: Christian Brauner --- include/asm-generic/barrier.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/asm-generic/barrier.h b/include/asm-generic/barrier.h index 961f4d88f9ef7..0c0695763bea3 100644 --- a/include/asm-generic/barrier.h +++ b/include/asm-generic/barrier.h @@ -193,7 +193,6 @@ do { \ #ifndef smp_store_release #define smp_store_release(p, v) \ do { \ - compiletime_assert_atomic_type(*p); \ barrier(); \ WRITE_ONCE(*p, v); \ } while (0) @@ -203,7 +202,6 @@ do { \ #define smp_load_acquire(p) \ ({ \ __unqual_scalar_typeof(*p) ___p1 = READ_ONCE(*p); \ - compiletime_assert_atomic_type(*p); \ barrier(); \ (typeof(*p))___p1; \ }) -- GitLab From e2fbc857d3c677ce96d9260577491e0d8f21b6f7 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 8 Dec 2023 17:52:11 -0800 Subject: [PATCH 0091/1519] x86/nmi: Rate limit unknown NMI messages On some AMD machines, unknown NMI messages were printed on the console continuously when using perf command with IBS. It was reported that it can slow down the kernel. Ratelimit the unknown NMI messages. Signed-off-by: Namhyung Kim Signed-off-by: Borislav Petkov (AMD) Acked-by: Ravi Bangoria Acked-by: Guilherme Amadio Acked-by: Thomas Gleixner Link: https://lore.kernel.org/r/20231209015211.357983-1-namhyung@kernel.org --- arch/x86/kernel/nmi.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 17e955ab69fed..d238679011866 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -303,13 +303,13 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs) __this_cpu_add(nmi_stats.unknown, 1); - pr_emerg("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", - reason, smp_processor_id()); + pr_emerg_ratelimited("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", + reason, smp_processor_id()); if (unknown_nmi_panic || panic_on_unrecovered_nmi) nmi_panic(regs, "NMI: Not continuing"); - pr_emerg("Dazed and confused, but trying to continue\n"); + pr_emerg_ratelimited("Dazed and confused, but trying to continue\n"); } NOKPROBE_SYMBOL(unknown_nmi_error); -- GitLab From b37bf5ef177a1aae937451f2e272943a9333dd5c Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Wed, 24 Jan 2024 21:51:50 +0100 Subject: [PATCH 0092/1519] Documentation/maintainer-tip: Add Closes tag Document where Closes: lands in the tag ordering. Signed-off-by: Borislav Petkov (AMD) Acked-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240124205442.GAZbF5EmOB8LpKqlSc@fat_crate.local --- Documentation/process/maintainer-tip.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Documentation/process/maintainer-tip.rst b/Documentation/process/maintainer-tip.rst index 08dd0f804410b..799359231b7f2 100644 --- a/Documentation/process/maintainer-tip.rst +++ b/Documentation/process/maintainer-tip.rst @@ -304,13 +304,15 @@ following tag ordering scheme: - Reported-by: ``Reporter `` + - Closes: ``URL or Message-ID of the bug report this is fixing`` + - Originally-by: ``Original author `` - Suggested-by: ``Suggester `` - Co-developed-by: ``Co-author `` - Signed-off: ``Co-author `` + Signed-off-by: ``Co-author `` Note, that Co-developed-by and Signed-off-by of the co-author(s) must come in pairs. 
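Putting the documented ordering together, a trailer block that follows it would look roughly like this (all names, addresses and URLs are hypothetical):

Reported-by: Rita Reporter <rita@example.com>
Closes: https://lore.kernel.org/r/bug-report-id@example.com
Suggested-by: Sam Suggester <sam@example.com>
Co-developed-by: Chris Coauthor <chris@example.com>
Signed-off-by: Chris Coauthor <chris@example.com>
Signed-off-by: Alex Author <alex@example.com>

Note how Closes immediately follows the Reported-by it belongs to, and how the co-author's Co-developed-by/Signed-off-by pair stays together ahead of the submitting author's Signed-off-by.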
-- GitLab From a4cb5ece145828cae35503857debf3d49c9d1c5f Mon Sep 17 00:00:00 2001 From: Xin Li Date: Tue, 5 Dec 2023 02:49:50 -0800 Subject: [PATCH 0093/1519] x86/cpufeatures,opcode,msr: Add the WRMSRNS instruction support WRMSRNS is an instruction that behaves exactly like WRMSR, with the only difference being that it is not a serializing instruction by default. Under certain conditions, WRMSRNS may replace WRMSR to improve performance. Add its CPU feature bit, opcode to the x86 opcode map, and an always inline API __wrmsrns() to embed WRMSRNS into the code. Signed-off-by: Xin Li Signed-off-by: Thomas Gleixner Tested-by: Shan Kang Acked-by: Masami Hiramatsu (Google) Acked-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20231205105030.8698-2-xin3.li@intel.com --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/include/asm/msr.h | 18 ++++++++++++++++++ arch/x86/lib/x86-opcode-map.txt | 2 +- tools/arch/x86/include/asm/cpufeatures.h | 1 + tools/arch/x86/lib/x86-opcode-map.txt | 2 +- 5 files changed, 22 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 29cb275a219d7..bd05c75c62aa2 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -327,6 +327,7 @@ #define X86_FEATURE_FSRS (12*32+11) /* "" Fast short REP STOSB */ #define X86_FEATURE_FSRC (12*32+12) /* "" Fast short REP {CMPSB,SCASB} */ #define X86_FEATURE_LKGS (12*32+18) /* "" Load "kernel" (userspace) GS */ +#define X86_FEATURE_WRMSRNS (12*32+19) /* "" Non-serializing WRMSR */ #define X86_FEATURE_AMX_FP16 (12*32+21) /* "" AMX fp16 Support */ #define X86_FEATURE_AVX_IFMA (12*32+23) /* "" Support for VPMADD52[H,L]UQ */ #define X86_FEATURE_LAM (12*32+26) /* Linear Address Masking */ diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 65ec1965cd281..c284ff9ebe676 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -97,6 +97,19 @@ static __always_inline void __wrmsr(unsigned int msr, u32 low, u32 high) : : "c" (msr), "a"(low), "d" (high) : "memory"); } +/* + * WRMSRNS behaves exactly like WRMSR with the only difference being + * that it is not a serializing instruction by default. + */ +static __always_inline void __wrmsrns(u32 msr, u32 low, u32 high) +{ + /* Instruction opcode for WRMSRNS; supported in binutils >= 2.40. 
*/ + asm volatile("1: .byte 0x0f,0x01,0xc6\n" + "2:\n" + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR) + : : "c" (msr), "a"(low), "d" (high)); +} + #define native_rdmsr(msr, val1, val2) \ do { \ u64 __val = __rdmsr((msr)); \ @@ -297,6 +310,11 @@ do { \ #endif /* !CONFIG_PARAVIRT_XXL */ +static __always_inline void wrmsrns(u32 msr, u64 val) +{ + __wrmsrns(msr, val, val >> 32); +} + /* * 64-bit version of wrmsr_safe(): */ diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt index 5168ee0360b24..1efe1d9bf5ce4 100644 --- a/arch/x86/lib/x86-opcode-map.txt +++ b/arch/x86/lib/x86-opcode-map.txt @@ -1051,7 +1051,7 @@ GrpTable: Grp6 EndTable GrpTable: Grp7 -0: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B) | PCONFIG (101),(11B) | ENCLV (000),(11B) +0: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B) | PCONFIG (101),(11B) | ENCLV (000),(11B) | WRMSRNS (110),(11B) 1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001),(11B) | CLAC (010),(11B) | STAC (011),(11B) | ENCLS (111),(11B) 2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) | VMFUNC (100),(11B) | XEND (101)(11B) | XTEST (110)(11B) | ENCLU (111),(11B) 3: LIDT Ms diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index f4542d2718f4f..8792841016031 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -322,6 +322,7 @@ #define X86_FEATURE_FSRS (12*32+11) /* "" Fast short REP STOSB */ #define X86_FEATURE_FSRC (12*32+12) /* "" Fast short REP {CMPSB,SCASB} */ #define X86_FEATURE_LKGS (12*32+18) /* "" Load "kernel" (userspace) GS */ +#define X86_FEATURE_WRMSRNS (12*32+19) /* "" Non-serializing WRMSR */ #define X86_FEATURE_AMX_FP16 (12*32+21) /* "" AMX fp16 Support */ #define X86_FEATURE_AVX_IFMA (12*32+23) /* "" Support for VPMADD52[H,L]UQ */ #define X86_FEATURE_LAM (12*32+26) /* Linear Address Masking */ diff --git a/tools/arch/x86/lib/x86-opcode-map.txt b/tools/arch/x86/lib/x86-opcode-map.txt index 5168ee0360b24..1efe1d9bf5ce4 100644 --- a/tools/arch/x86/lib/x86-opcode-map.txt +++ b/tools/arch/x86/lib/x86-opcode-map.txt @@ -1051,7 +1051,7 @@ GrpTable: Grp6 EndTable GrpTable: Grp7 -0: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B) | PCONFIG (101),(11B) | ENCLV (000),(11B) +0: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B) | PCONFIG (101),(11B) | ENCLV (000),(11B) | WRMSRNS (110),(11B) 1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001),(11B) | CLAC (010),(11B) | STAC (011),(11B) | ENCLS (111),(11B) 2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) | VMFUNC (100),(11B) | XEND (101)(11B) | XTEST (110)(11B) | ENCLU (111),(11B) 3: LIDT Ms -- GitLab From 3167b37f82ea8f9da156ff4edf100756bbc9277e Mon Sep 17 00:00:00 2001 From: Xin Li Date: Tue, 5 Dec 2023 02:49:51 -0800 Subject: [PATCH 0094/1519] x86/entry: Remove idtentry_sysvec from entry_{32,64}.S idtentry_sysvec is really just DECLARE_IDTENTRY defined in , no need to define it separately. 
Signed-off-by: Xin Li Signed-off-by: Thomas Gleixner Tested-by: Shan Kang Link: https://lore.kernel.org/r/20231205105030.8698-3-xin3.li@intel.com --- arch/x86/entry/entry_32.S | 4 ---- arch/x86/entry/entry_64.S | 8 -------- arch/x86/include/asm/idtentry.h | 2 +- 3 files changed, 1 insertion(+), 13 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index c73047bf9f4bf..89a7ec0920ec5 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -649,10 +649,6 @@ SYM_CODE_START_LOCAL(asm_\cfunc) SYM_CODE_END(asm_\cfunc) .endm -.macro idtentry_sysvec vector cfunc - idtentry \vector asm_\cfunc \cfunc has_error_code=0 -.endm - /* * Include the defines which emit the idt entries which are shared * shared between 32 and 64 bit and emit the __irqentry_text_* markers diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index c40f89ab1b4c7..29ce68f8ede04 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -370,14 +370,6 @@ SYM_CODE_END(\asmsym) idtentry \vector asm_\cfunc \cfunc has_error_code=1 .endm -/* - * System vectors which invoke their handlers directly and are not - * going through the regular common device interrupt handling code. - */ -.macro idtentry_sysvec vector cfunc - idtentry \vector asm_\cfunc \cfunc has_error_code=0 -.endm - /** * idtentry_mce_db - Macro to generate entry stubs for #MC and #DB * @vector: Vector number diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index 13639e57e1f8a..e9f71b3217c2d 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -447,7 +447,7 @@ __visible noinstr void func(struct pt_regs *regs, \ /* System vector entries */ #define DECLARE_IDTENTRY_SYSVEC(vector, func) \ - idtentry_sysvec vector func + DECLARE_IDTENTRY(vector, func) #ifdef CONFIG_X86_64 # define DECLARE_IDTENTRY_MCE(vector, func) \ -- GitLab From 8df719341e8556f1e2bfa0f78fc433db6eba110b Mon Sep 17 00:00:00 2001 From: Xin Li Date: Tue, 5 Dec 2023 02:49:52 -0800 Subject: [PATCH 0095/1519] x86/trapnr: Add event type macros to Intel VT-x classifies events into eight different types, which is inherited by FRED for event identification. As such, event types becomes a common x86 concept, and should be defined in a common x86 header. Add event type macros to , and use them in . Suggested-by: H. 
Peter Anvin (Intel) Signed-off-by: Xin Li Signed-off-by: Thomas Gleixner Tested-by: Shan Kang Link: https://lore.kernel.org/r/20231205105030.8698-4-xin3.li@intel.com --- arch/x86/include/asm/trapnr.h | 12 ++++++++++++ arch/x86/include/asm/vmx.h | 17 +++++++++-------- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/trapnr.h b/arch/x86/include/asm/trapnr.h index f5d2325aa0b74..8d1154cdf7875 100644 --- a/arch/x86/include/asm/trapnr.h +++ b/arch/x86/include/asm/trapnr.h @@ -2,6 +2,18 @@ #ifndef _ASM_X86_TRAPNR_H #define _ASM_X86_TRAPNR_H +/* + * Event type codes used by FRED, Intel VT-x and AMD SVM + */ +#define EVENT_TYPE_EXTINT 0 // External interrupt +#define EVENT_TYPE_RESERVED 1 +#define EVENT_TYPE_NMI 2 // NMI +#define EVENT_TYPE_HWEXC 3 // Hardware originated traps, exceptions +#define EVENT_TYPE_SWINT 4 // INT n +#define EVENT_TYPE_PRIV_SWEXC 5 // INT1 +#define EVENT_TYPE_SWEXC 6 // INTO, INT3 +#define EVENT_TYPE_OTHER 7 // FRED SYSCALL/SYSENTER, VT-x MTF + /* Interrupts/Exceptions */ #define X86_TRAP_DE 0 /* Divide-by-zero */ diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 0e73616b82f34..4dba173630084 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -17,6 +17,7 @@ #include #include +#include #include #define VMCS_CONTROL_BIT(x) BIT(VMX_FEATURE_##x & 0x1f) @@ -374,14 +375,14 @@ enum vmcs_field { #define VECTORING_INFO_DELIVER_CODE_MASK INTR_INFO_DELIVER_CODE_MASK #define VECTORING_INFO_VALID_MASK INTR_INFO_VALID_MASK -#define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */ -#define INTR_TYPE_RESERVED (1 << 8) /* reserved */ -#define INTR_TYPE_NMI_INTR (2 << 8) /* NMI */ -#define INTR_TYPE_HARD_EXCEPTION (3 << 8) /* processor exception */ -#define INTR_TYPE_SOFT_INTR (4 << 8) /* software interrupt */ -#define INTR_TYPE_PRIV_SW_EXCEPTION (5 << 8) /* ICE breakpoint - undocumented */ -#define INTR_TYPE_SOFT_EXCEPTION (6 << 8) /* software exception */ -#define INTR_TYPE_OTHER_EVENT (7 << 8) /* other event */ +#define INTR_TYPE_EXT_INTR (EVENT_TYPE_EXTINT << 8) /* external interrupt */ +#define INTR_TYPE_RESERVED (EVENT_TYPE_RESERVED << 8) /* reserved */ +#define INTR_TYPE_NMI_INTR (EVENT_TYPE_NMI << 8) /* NMI */ +#define INTR_TYPE_HARD_EXCEPTION (EVENT_TYPE_HWEXC << 8) /* processor exception */ +#define INTR_TYPE_SOFT_INTR (EVENT_TYPE_SWINT << 8) /* software interrupt */ +#define INTR_TYPE_PRIV_SW_EXCEPTION (EVENT_TYPE_PRIV_SWEXC << 8) /* ICE breakpoint */ +#define INTR_TYPE_SOFT_EXCEPTION (EVENT_TYPE_SWEXC << 8) /* software exception */ +#define INTR_TYPE_OTHER_EVENT (EVENT_TYPE_OTHER << 8) /* other event */ /* GUEST_INTERRUPTIBILITY_INFO flags. */ #define GUEST_INTR_STATE_STI 0x00000001 -- GitLab From 51383e741b41748ee80140233ab98ca6b56918b3 Mon Sep 17 00:00:00 2001 From: Xin Li Date: Tue, 5 Dec 2023 02:49:53 -0800 Subject: [PATCH 0096/1519] Documentation/x86/64: Add documentation for FRED Briefly introduce FRED, and its advantages compared to IDT. 
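The documentation added below describes FRED's software-based, two-level event dispatch: first by event type, then by vector. A minimal C sketch of that shape, reusing the event type numbers from the <asm/trapnr.h> patch above but otherwise entirely hypothetical (this is not the kernel's actual entry code), could look like:

typedef void (*handler_t)(void *regs);

#define EVENT_TYPE_EXTINT 0	/* external interrupt */
#define EVENT_TYPE_NMI    2	/* NMI                */
#define EVENT_TYPE_HWEXC  3	/* hardware exception */

static handler_t exc_handlers[256];	/* level 2: pick by vector */

static void handle_extint(void *regs, unsigned int vector)
{
	(void)regs; (void)vector;	/* platform IRQ handling here */
}

static void handle_nmi(void *regs)
{
	(void)regs;
}

/* Level 1: pick by event type. */
void fred_dispatch(void *regs, unsigned int type, unsigned int vector)
{
	switch (type) {
	case EVENT_TYPE_EXTINT:
		handle_extint(regs, vector);
		break;
	case EVENT_TYPE_NMI:
		handle_nmi(regs);
		break;
	case EVENT_TYPE_HWEXC:
		if (exc_handlers[vector])
			exc_handlers[vector](regs);
		break;
	default:
		break;			/* other event types omitted */
	}
}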
Signed-off-by: Xin Li Signed-off-by: Thomas Gleixner Reviewed-by: Bagas Sanjaya Link: https://lore.kernel.org/r/20231205105030.8698-5-xin3.li@intel.com --- Documentation/arch/x86/x86_64/fred.rst | 96 +++++++++++++++++++++++++ Documentation/arch/x86/x86_64/index.rst | 1 + 2 files changed, 97 insertions(+) create mode 100644 Documentation/arch/x86/x86_64/fred.rst diff --git a/Documentation/arch/x86/x86_64/fred.rst b/Documentation/arch/x86/x86_64/fred.rst new file mode 100644 index 0000000000000..9f57e7b91f7e7 --- /dev/null +++ b/Documentation/arch/x86/x86_64/fred.rst @@ -0,0 +1,96 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========================================= +Flexible Return and Event Delivery (FRED) +========================================= + +Overview +======== + +The FRED architecture defines simple new transitions that change +privilege level (ring transitions). The FRED architecture was +designed with the following goals: + +1) Improve overall performance and response time by replacing event + delivery through the interrupt descriptor table (IDT event + delivery) and event return by the IRET instruction with lower + latency transitions. + +2) Improve software robustness by ensuring that event delivery + establishes the full supervisor context and that event return + establishes the full user context. + +The new transitions defined by the FRED architecture are FRED event +delivery and, for returning from events, two FRED return instructions. +FRED event delivery can effect a transition from ring 3 to ring 0, but +it is used also to deliver events incident to ring 0. One FRED +instruction (ERETU) effects a return from ring 0 to ring 3, while the +other (ERETS) returns while remaining in ring 0. Collectively, FRED +event delivery and the FRED return instructions are FRED transitions. + +In addition to these transitions, the FRED architecture defines a new +instruction (LKGS) for managing the state of the GS segment register. +The LKGS instruction can be used by 64-bit operating systems that do +not use the new FRED transitions. + +Furthermore, the FRED architecture is easy to extend for future CPU +architectures. + +Software based event dispatching +================================ + +FRED operates differently from IDT in terms of event handling. Instead +of directly dispatching an event to its handler based on the event +vector, FRED requires the software to dispatch an event to its handler +based on both the event's type and vector. Therefore, an event dispatch +framework must be implemented to facilitate the event-to-handler +dispatch process. The FRED event dispatch framework takes control +once an event is delivered, and employs a two-level dispatch. + +The first level dispatching is event type based, and the second level +dispatching is event vector based. + +Full supervisor/user context +============================ + +FRED event delivery atomically save and restore full supervisor/user +context upon event delivery and return. Thus it avoids the problem of +transient states due to %cr2 and/or %dr6, and it is no longer needed +to handle all the ugly corner cases caused by half baked entry states. + +FRED allows explicit unblock of NMI with new event return instructions +ERETS/ERETU, avoiding the mess caused by IRET which unconditionally +unblocks NMI, e.g., when an exception happens during NMI handling. + +FRED always restores the full value of %rsp, thus ESPFIX is no longer +needed when FRED is enabled. 
+ +LKGS +==== + +LKGS behaves like the MOV to GS instruction except that it loads the +base address into the IA32_KERNEL_GS_BASE MSR instead of the GS +segment's descriptor cache. With LKGS, it ends up with avoiding +mucking with kernel GS, i.e., an operating system can always operate +with its own GS base address. + +Because FRED event delivery from ring 3 and ERETU both swap the value +of the GS base address and that of the IA32_KERNEL_GS_BASE MSR, plus +the introduction of LKGS instruction, the SWAPGS instruction is no +longer needed when FRED is enabled, thus is disallowed (#UD). + +Stack levels +============ + +4 stack levels 0~3 are introduced to replace the nonreentrant IST for +event handling, and each stack level should be configured to use a +dedicated stack. + +The current stack level could be unchanged or go higher upon FRED +event delivery. If unchanged, the CPU keeps using the current event +stack. If higher, the CPU switches to a new event stack specified by +the MSR of the new stack level, i.e., MSR_IA32_FRED_RSP[123]. + +Only execution of a FRED return instruction ERET[US], could lower the +current stack level, causing the CPU to switch back to the stack it was +on before a previous event delivery that promoted the stack level. diff --git a/Documentation/arch/x86/x86_64/index.rst b/Documentation/arch/x86/x86_64/index.rst index a56070fc8e77a..ad15e9bd623f6 100644 --- a/Documentation/arch/x86/x86_64/index.rst +++ b/Documentation/arch/x86/x86_64/index.rst @@ -15,3 +15,4 @@ x86_64 Support cpu-hotplug-spec machinecheck fsgs + fred -- GitLab From 2cce95918d635126098d784c040b59333c464b20 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin (Intel)" Date: Tue, 5 Dec 2023 02:49:54 -0800 Subject: [PATCH 0097/1519] x86/fred: Add Kconfig option for FRED (CONFIG_X86_FRED) Add the configuration option CONFIG_X86_FRED to enable FRED. Signed-off-by: H. Peter Anvin (Intel) Signed-off-by: Xin Li Signed-off-by: Thomas Gleixner Tested-by: Shan Kang Link: https://lore.kernel.org/r/20231205105030.8698-6-xin3.li@intel.com --- arch/x86/Kconfig | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5edec175b9bfc..854ab38a359a2 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -496,6 +496,15 @@ config X86_CPU_RESCTRL Say N if unsure. +config X86_FRED + bool "Flexible Return and Event Delivery" + depends on X86_64 + help + When enabled, try to use Flexible Return and Event Delivery + instead of the legacy SYSCALL/SYSENTER/IDT architecture for + ring transitions and exception/interrupt handling if the + system supports. + if X86_32 config X86_BIGSMP bool "Support for big SMP systems with more than 8 CPUs" -- GitLab From 51c158f7aaccc6f6423a61a1df4a0d4c0d9d22a9 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin (Intel)" Date: Tue, 5 Dec 2023 02:49:55 -0800 Subject: [PATCH 0098/1519] x86/cpufeatures: Add the CPU feature bit for FRED MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Any FRED enabled CPU will always have the following features as its baseline: 1) LKGS, load attributes of the GS segment but the base address into the IA32_KERNEL_GS_BASE MSR instead of the GS segment's descriptor cache. 2) WRMSRNS, non-serializing WRMSR for faster MSR writes. Signed-off-by: H.
Peter Anvin (Intel) Signed-off-by: Xin Li Signed-off-by: Thomas Gleixner Tested-by: Shan Kang Link: https://lore.kernel.org/r/20231205105030.8698-7-xin3.li@intel.com --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/kernel/cpu/cpuid-deps.c | 2 ++ tools/arch/x86/include/asm/cpufeatures.h | 1 + 3 files changed, 4 insertions(+) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index bd05c75c62aa2..ccbf914b3d1a3 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -326,6 +326,7 @@ #define X86_FEATURE_FZRM (12*32+10) /* "" Fast zero-length REP MOVSB */ #define X86_FEATURE_FSRS (12*32+11) /* "" Fast short REP STOSB */ #define X86_FEATURE_FSRC (12*32+12) /* "" Fast short REP {CMPSB,SCASB} */ +#define X86_FEATURE_FRED (12*32+17) /* Flexible Return and Event Delivery */ #define X86_FEATURE_LKGS (12*32+18) /* "" Load "kernel" (userspace) GS */ #define X86_FEATURE_WRMSRNS (12*32+19) /* "" Non-serializing WRMSR */ #define X86_FEATURE_AMX_FP16 (12*32+21) /* "" AMX fp16 Support */ diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index e462c1d3800a6..b7174209d855c 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -82,6 +82,8 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_XFD, X86_FEATURE_XGETBV1 }, { X86_FEATURE_AMX_TILE, X86_FEATURE_XFD }, { X86_FEATURE_SHSTK, X86_FEATURE_XSAVES }, + { X86_FEATURE_FRED, X86_FEATURE_LKGS }, + { X86_FEATURE_FRED, X86_FEATURE_WRMSRNS }, {} }; diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index 8792841016031..953e6efead265 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -321,6 +321,7 @@ #define X86_FEATURE_FZRM (12*32+10) /* "" Fast zero-length REP MOVSB */ #define X86_FEATURE_FSRS (12*32+11) /* "" Fast short REP STOSB */ #define X86_FEATURE_FSRC (12*32+12) /* "" Fast short REP {CMPSB,SCASB} */ +#define X86_FEATURE_FRED (12*32+17) /* Flexible Return and Event Delivery */ #define X86_FEATURE_LKGS (12*32+18) /* "" Load "kernel" (userspace) GS */ #define X86_FEATURE_WRMSRNS (12*32+19) /* "" Non-serializing WRMSR */ #define X86_FEATURE_AMX_FP16 (12*32+21) /* "" AMX fp16 Support */ -- GitLab From e554a8ca49d6d6d782f546ae4d7f036946e7dd87 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin (Intel)" Date: Tue, 5 Dec 2023 02:49:56 -0800 Subject: [PATCH 0099/1519] x86/fred: Disable FRED support if CONFIG_X86_FRED is disabled Add CONFIG_X86_FRED to <asm/disabled-features.h> to make cpu_feature_enabled() work correctly with FRED. Originally-by: Megha Dey Signed-off-by: H.
Peter Anvin (Intel) Signed-off-by: Xin Li Signed-off-by: Thomas Gleixner Tested-by: Shan Kang Link: https://lore.kernel.org/r/20231205105030.8698-8-xin3.li@intel.com --- arch/x86/include/asm/disabled-features.h | 8 +++++++- tools/arch/x86/include/asm/disabled-features.h | 8 +++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index 702d93fdd10e8..f40b29d3abad7 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -117,6 +117,12 @@ #define DISABLE_IBT (1 << (X86_FEATURE_IBT & 31)) #endif +#ifdef CONFIG_X86_FRED +# define DISABLE_FRED 0 +#else +# define DISABLE_FRED (1 << (X86_FEATURE_FRED & 31)) +#endif + /* * Make sure to add features to the correct mask */ @@ -133,7 +139,7 @@ #define DISABLED_MASK10 0 #define DISABLED_MASK11 (DISABLE_RETPOLINE|DISABLE_RETHUNK|DISABLE_UNRET| \ DISABLE_CALL_DEPTH_TRACKING|DISABLE_USER_SHSTK) -#define DISABLED_MASK12 (DISABLE_LAM) +#define DISABLED_MASK12 (DISABLE_FRED|DISABLE_LAM) #define DISABLED_MASK13 0 #define DISABLED_MASK14 0 #define DISABLED_MASK15 0 diff --git a/tools/arch/x86/include/asm/disabled-features.h b/tools/arch/x86/include/asm/disabled-features.h index 702d93fdd10e8..f40b29d3abad7 100644 --- a/tools/arch/x86/include/asm/disabled-features.h +++ b/tools/arch/x86/include/asm/disabled-features.h @@ -117,6 +117,12 @@ #define DISABLE_IBT (1 << (X86_FEATURE_IBT & 31)) #endif +#ifdef CONFIG_X86_FRED +# define DISABLE_FRED 0 +#else +# define DISABLE_FRED (1 << (X86_FEATURE_FRED & 31)) +#endif + /* * Make sure to add features to the correct mask */ @@ -133,7 +139,7 @@ #define DISABLED_MASK10 0 #define DISABLED_MASK11 (DISABLE_RETPOLINE|DISABLE_RETHUNK|DISABLE_UNRET| \ DISABLE_CALL_DEPTH_TRACKING|DISABLE_USER_SHSTK) -#define DISABLED_MASK12 (DISABLE_LAM) +#define DISABLED_MASK12 (DISABLE_FRED|DISABLE_LAM) #define DISABLED_MASK13 0 #define DISABLED_MASK14 0 #define DISABLED_MASK15 0 -- GitLab From 8318d6a6362f5903edb4c904a8dd447e59be4ad1 Mon Sep 17 00:00:00 2001 From: Audra Mitchell Date: Thu, 25 Jan 2024 14:05:32 -0500 Subject: [PATCH 0100/1519] workqueue: Shorten events_freezable_power_efficient name Since we have set the WQ_NAME_LEN to 32, decrease the name of events_freezable_power_efficient so that it does not trip the name length warning when the workqueue is created. Signed-off-by: Audra Mitchell Signed-off-by: Tejun Heo --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 8ca65665efe96..ee6aa1b897e0c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -6706,7 +6706,7 @@ void __init workqueue_init_early(void) WQ_FREEZABLE, 0); system_power_efficient_wq = alloc_workqueue("events_power_efficient", WQ_POWER_EFFICIENT, 0); - system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_power_efficient", + system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_pwr_efficient", WQ_FREEZABLE | WQ_POWER_EFFICIENT, 0); BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq || -- GitLab From 8f588afe6256c50b3d1f8a671828fc4aab421c05 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 25 Jan 2024 09:34:57 -0800 Subject: [PATCH 0101/1519] x86/mm: Get rid of conditional IF flag handling in page fault path We had this nonsensical code that would happily handle kernel page faults with interrupts disabled, which makes no sense at all. 
It turns out that this is legacy code that _used_ to make sense, back when we enabled IRQs as early as possible, and we used to have this code sequence essentially immediately after reading the faulting address from the %cr2 register. Back then, we could have kernel page faults to populate the vmalloc area with interrupts disabled, and they would need to stay disabled for that case. However, the code in question has been moved down in the page fault handling, and is now in the "handle faults in user addresses" section, and apparently nobody ever noticed that it no longer makes sense to handle these page faults with interrupts conditionally disabled. So replace the conditional IRQ enable: if (regs->flags & X86_EFLAGS_IF) local_irq_enable(); with an unconditional one, and add a temporary WARN_ON_ONCE() if some codepath actually does do page faults with interrupts disabled (without also doing a pagefault_disable(), of course). NOTE! We used to allow user space to disable interrupts with iopl(3). That is no longer true since commits: a24ca9976843 ("x86/iopl: Remove legacy IOPL option") b968e84b509d ("x86/iopl: Fake iopl(3) CLI/STI usage") so the WARN_ON_ONCE() is valid for both the kernel and user situation. For some of the history relevant to this code, see particularly commit 8c914cb704a1 ("x86_64: actively synchronize vmalloc area when registering certain callbacks"), which moved this below the vmalloc fault handling. Now that the user_mode() check is irrelevant, we can also move the FAULT_FLAG_USER flag setting down to where the other flag settings are done. Signed-off-by: Linus Torvalds Signed-off-by: Ingo Molnar Acked-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Uros Bizjak Cc: Sean Christopherson Link: https://lore.kernel.org/r/20240125173457.1281880-1-torvalds@linux-foundation.org --- arch/x86/mm/fault.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 679b09cfe241c..150e002e08849 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1302,21 +1302,14 @@ void do_user_addr_fault(struct pt_regs *regs, return; } - /* - * It's safe to allow irq's after cr2 has been saved and the - * vmalloc fault has been handled. - * - * User-mode registers count as a user access even for any - * potential system fault or CPU buglet: - */ - if (user_mode(regs)) { - local_irq_enable(); - flags |= FAULT_FLAG_USER; - } else { - if (regs->flags & X86_EFLAGS_IF) - local_irq_enable(); + /* Legacy check - remove this after verifying that it doesn't trigger */ + if (WARN_ON_ONCE(!(regs->flags & X86_EFLAGS_IF))) { + bad_area_nosemaphore(regs, error_code, address); + return; } + local_irq_enable(); + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); /* @@ -1332,6 +1325,14 @@ void do_user_addr_fault(struct pt_regs *regs, if (error_code & X86_PF_INSTR) flags |= FAULT_FLAG_INSTRUCTION; + /* + * We set FAULT_FLAG_USER based on the register state, not + * based on X86_PF_USER. User space accesses that cause + * system page faults are still user accesses. + */ + if (user_mode(regs)) + flags |= FAULT_FLAG_USER; + #ifdef CONFIG_X86_64 /* * Faults in the vsyscall page might need emulation. 
The -- GitLab From d12a82848eac28d248e67940378fe4a72b0a8cd3 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 22 Jan 2024 13:42:40 +0100 Subject: [PATCH 0102/1519] bitmap: Define a cleanup function for bitmaps Add support for autopointers for bitmaps allocated with bitmap_alloc() et al. Signed-off-by: Bartosz Golaszewski Signed-off-by: Thomas Gleixner Reviewed-by: Andy Shevchenko Acked-by: Yury Norov Link: https://lore.kernel.org/r/20240122124243.44002-2-brgl@bgdev.pl --- include/linux/bitmap.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 99451431e4d65..df24c8fb10096 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -6,6 +6,7 @@ #include #include +#include #include #include #include @@ -127,6 +128,8 @@ unsigned long *bitmap_alloc_node(unsigned int nbits, gfp_t flags, int node); unsigned long *bitmap_zalloc_node(unsigned int nbits, gfp_t flags, int node); void bitmap_free(const unsigned long *bitmap); +DEFINE_FREE(bitmap, unsigned long *, if (_T) bitmap_free(_T)) + /* Managed variants of the above. */ unsigned long *devm_bitmap_alloc(struct device *dev, unsigned int nbits, gfp_t flags); -- GitLab From 3832f390423302e373a1818f6cf8cb29ebf3a195 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 22 Jan 2024 13:42:41 +0100 Subject: [PATCH 0103/1519] genirq/irq_sim: Remove unused field from struct irq_sim_irq_ctx The irqnum field is unused. Remove it. Signed-off-by: Bartosz Golaszewski Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240122124243.44002-3-brgl@bgdev.pl --- kernel/irq/irq_sim.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c index dd76323ea3fd7..f5ebb3ba6f9aa 100644 --- a/kernel/irq/irq_sim.c +++ b/kernel/irq/irq_sim.c @@ -19,7 +19,6 @@ struct irq_sim_work_ctx { }; struct irq_sim_irq_ctx { - int irqnum; bool enabled; struct irq_sim_work_ctx *work_ctx; }; -- GitLab From 8dab7fd47e53865d37fce73c67bac97b41d5d64a Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 22 Jan 2024 13:42:42 +0100 Subject: [PATCH 0104/1519] genirq/irq_sim: Order headers alphabetically For better readability and maintenance keep headers in alphabetical order. Signed-off-by: Bartosz Golaszewski Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240122124243.44002-4-brgl@bgdev.pl --- kernel/irq/irq_sim.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c index f5ebb3ba6f9aa..b0d50b48dbd16 100644 --- a/kernel/irq/irq_sim.c +++ b/kernel/irq/irq_sim.c @@ -4,10 +4,10 @@ * Copyright (C) 2020 Bartosz Golaszewski */ +#include #include #include #include -#include #include struct irq_sim_work_ctx { -- GitLab From ef7e585bf48013baabc00de1a15753dd7b626a2d Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 13 Jan 2024 19:06:15 -0800 Subject: [PATCH 0105/1519] cpu/hotplug: Delete an extraneous kernel-doc description struct cpuhp_cpu_state has an extraneous kernel-doc comment for @cpu. 
There is no struct member by that name, so remove the comment to prevent the kernel-doc warning: kernel/cpu.c:85: warning: Excess struct member 'cpu' description in 'cpuhp_cpu_state' Signed-off-by: Randy Dunlap Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240114030615.30441-1-rdunlap@infradead.org --- kernel/cpu.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/cpu.c b/kernel/cpu.c index e6ec3ba4950b4..f05937581f26b 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -54,7 +54,6 @@ * @rollback: Perform a rollback * @single: Single callback invocation * @bringup: Single callback bringup or teardown selector - * @cpu: CPU number * @node: Remote CPU node; for multi-instance, do a * single entry callback for install/remove * @last: For multi-instance rollback, remember how far we got -- GitLab From effe6d278e06f85289b6ada0402a6d16ebc149a5 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Tue, 16 Jan 2024 12:51:51 +0800 Subject: [PATCH 0106/1519] kernel/cpu: Convert snprintf() to sysfs_emit() Per filesystems/sysfs.rst, show() should only use sysfs_emit() or sysfs_emit_at() when formatting the value to be returned to user space. Coccinelle complains that there are still a couple of functions that use snprintf(). Convert them to sysfs_emit(). No functional change intended. Signed-off-by: Li Zhijian Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240116045151.3940401-40-lizhijian@fujitsu.com --- kernel/cpu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/cpu.c b/kernel/cpu.c index f05937581f26b..ad7d0b00bce99 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -3004,7 +3004,7 @@ static ssize_t control_show(struct device *dev, return sysfs_emit(buf, "%d\n", cpu_smt_num_threads); #endif - return snprintf(buf, PAGE_SIZE - 2, "%s\n", state); + return sysfs_emit(buf, "%s\n", state); } static ssize_t control_store(struct device *dev, struct device_attribute *attr, @@ -3017,7 +3017,7 @@ static DEVICE_ATTR_RW(control); static ssize_t active_show(struct device *dev, struct device_attribute *attr, char *buf) { - return snprintf(buf, PAGE_SIZE - 2, "%d\n", sched_smt_active()); + return sysfs_emit(buf, "%d\n", sched_smt_active()); } static DEVICE_ATTR_RO(active); -- GitLab From 6a229b0e2ff6143b65ba4ef42bd71e29ffc2c16d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 26 Jan 2024 11:55:46 -1000 Subject: [PATCH 0107/1519] workqueue: Drop unnecessary kick_pool() in create_worker() After creating a new worker, create_worker() calls kick_pool() to wake up the new worker task. However, as kick_pool() doesn't do anything if there is no work pending, create_worker() also calls wake_up_process() explicitly. There's no reason to call kick_pool() at all. wake_up_process() is enough by itself. Drop the unnecessary kick_pool() call. Signed-off-by: Tejun Heo --- kernel/workqueue.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ee6aa1b897e0c..b6b690a17f7cb 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2217,12 +2217,11 @@ static struct worker *create_worker(struct worker_pool *pool) worker->pool->nr_workers++; worker_enter_idle(worker); - kick_pool(pool); /* * @worker is waiting on a completion in kthread() and will trigger hung - * check if not woken up soon. As kick_pool() might not have waken it - * up, wake it up explicitly once more. + * check if not woken up soon. As kick_pool() is noop if @pool is empty, + * wake it up explicitly.
*/ wake_up_process(worker->task); -- GitLab From e563d0a7cdc1890ff36bb177b5c8c2854d881e4d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 26 Jan 2024 11:55:50 -1000 Subject: [PATCH 0108/1519] workqueue: Break up enum definitions and give names to the types workqueue is collecting different sorts of enums into a single unnamed enum type which can increase confusion around enum width. Also, unnamed enums can't be accessed from BPF. Let's break up enum definitions according to their purposes and give them type names. Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 41 +++++++++++++++++++++++---------------- kernel/workqueue.c | 6 +++++- 2 files changed, 29 insertions(+), 18 deletions(-) diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 2cc0a9606175f..78047d0d98820 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -22,7 +22,7 @@ */ #define work_data_bits(work) ((unsigned long *)(&(work)->data)) -enum { +enum work_bits { WORK_STRUCT_PENDING_BIT = 0, /* work item is pending execution */ WORK_STRUCT_INACTIVE_BIT= 1, /* work item is inactive */ WORK_STRUCT_PWQ_BIT = 2, /* data points to pwq */ @@ -36,21 +36,6 @@ enum { WORK_STRUCT_COLOR_BITS = 4, - WORK_STRUCT_PENDING = 1 << WORK_STRUCT_PENDING_BIT, - WORK_STRUCT_INACTIVE = 1 << WORK_STRUCT_INACTIVE_BIT, - WORK_STRUCT_PWQ = 1 << WORK_STRUCT_PWQ_BIT, - WORK_STRUCT_LINKED = 1 << WORK_STRUCT_LINKED_BIT, -#ifdef CONFIG_DEBUG_OBJECTS_WORK - WORK_STRUCT_STATIC = 1 << WORK_STRUCT_STATIC_BIT, -#else - WORK_STRUCT_STATIC = 0, -#endif - - WORK_NR_COLORS = (1 << WORK_STRUCT_COLOR_BITS), - - /* not bound to any CPU, prefer the local CPU */ - WORK_CPU_UNBOUND = NR_CPUS, - /* * Reserve 8 bits off of pwq pointer w/ debugobjects turned off. * This makes pwqs aligned to 256 bytes and allows 16 workqueue @@ -74,6 +59,26 @@ enum { WORK_OFFQ_LEFT = BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT, WORK_OFFQ_POOL_BITS = WORK_OFFQ_LEFT <= 31 ? WORK_OFFQ_LEFT : 31, +}; + +enum work_flags { + WORK_STRUCT_PENDING = 1 << WORK_STRUCT_PENDING_BIT, + WORK_STRUCT_INACTIVE = 1 << WORK_STRUCT_INACTIVE_BIT, + WORK_STRUCT_PWQ = 1 << WORK_STRUCT_PWQ_BIT, + WORK_STRUCT_LINKED = 1 << WORK_STRUCT_LINKED_BIT, +#ifdef CONFIG_DEBUG_OBJECTS_WORK + WORK_STRUCT_STATIC = 1 << WORK_STRUCT_STATIC_BIT, +#else + WORK_STRUCT_STATIC = 0, +#endif +}; + +enum wq_misc_consts { + WORK_NR_COLORS = (1 << WORK_STRUCT_COLOR_BITS), + + /* not bound to any CPU, prefer the local CPU */ + WORK_CPU_UNBOUND = NR_CPUS, + /* bit mask for work_busy() return values */ WORK_BUSY_PENDING = 1 << 0, WORK_BUSY_RUNNING = 1 << 1, @@ -347,7 +352,7 @@ static inline unsigned int work_static(struct work_struct *work) { return 0; } * Workqueue flags and constants. For details, please refer to * Documentation/core-api/workqueue.rst. */ -enum { +enum wq_flags { WQ_UNBOUND = 1 << 1, /* not bound to any cpu */ WQ_FREEZABLE = 1 << 2, /* freeze during suspend */ WQ_MEM_RECLAIM = 1 << 3, /* may be used for memory reclaim */ @@ -387,7 +392,9 @@ enum { __WQ_ORDERED = 1 << 17, /* internal: workqueue is ordered */ __WQ_LEGACY = 1 << 18, /* internal: create*_workqueue() */ __WQ_ORDERED_EXPLICIT = 1 << 19, /* internal: alloc_ordered_workqueue() */ +}; +enum wq_consts { WQ_MAX_ACTIVE = 512, /* I like 512, better ideas? 
*/ WQ_UNBOUND_MAX_ACTIVE = WQ_MAX_ACTIVE, WQ_DFL_ACTIVE = WQ_MAX_ACTIVE / 2, diff --git a/kernel/workqueue.c b/kernel/workqueue.c index b6b690a17f7cb..45d0a784ba4f5 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -56,7 +56,7 @@ #include "workqueue_internal.h" -enum { +enum worker_pool_flags { /* * worker_pool flags * @@ -75,7 +75,9 @@ enum { */ POOL_MANAGER_ACTIVE = 1 << 0, /* being managed */ POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */ +}; +enum worker_flags { /* worker flags */ WORKER_DIE = 1 << 1, /* die die die */ WORKER_IDLE = 1 << 2, /* is idle */ @@ -86,7 +88,9 @@ enum { WORKER_NOT_RUNNING = WORKER_PREP | WORKER_CPU_INTENSIVE | WORKER_UNBOUND | WORKER_REBOUND, +}; +enum wq_internal_consts { NR_STD_WORKER_POOLS = 2, /* # standard pools per cpu */ UNBOUND_POOL_HASH_ORDER = 6, /* hashed by pool->attrs */ -- GitLab From 361bb7c961403173be109d8892f3c23096dc098d Mon Sep 17 00:00:00 2001 From: Neil Armstrong Date: Thu, 11 Jan 2024 17:58:49 +0100 Subject: [PATCH 0109/1519] arm64: dts: qcom: sm8650-qrd: add gpio74 as reserved gpio The TLMM gpio74 is also used to communicate with the secure NFC on-board module. Some variants of the SM8650-QRD board require this GPIO to be dedicated to the secure firmware and marked as reserved in order to successfully initialize the TLMM GPIOs from HLOS (Linux). On the other boards this GPIO is unused, so it's still safe to mark the GPIO as reserved. Fixes: a834911d50c1 ("arm64: dts: qcom: sm8650: add initial SM8650 QRD dts") Reported-by: Georgi Djakov Signed-off-by: Neil Armstrong Reviewed-by: Konrad Dybcio Reviewed-by: Elliot Berman Link: https://lore.kernel.org/r/20240111-topic-sm8650-upstream-qrd-fix-gpio-reserved-v1-1-fad39b4c5def@linaro.org Signed-off-by: Bjorn Andersson --- arch/arm64/boot/dts/qcom/sm8650-qrd.dts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/qcom/sm8650-qrd.dts b/arch/arm64/boot/dts/qcom/sm8650-qrd.dts index 592a67a47c782..b9151c2ddf2e5 100644 --- a/arch/arm64/boot/dts/qcom/sm8650-qrd.dts +++ b/arch/arm64/boot/dts/qcom/sm8650-qrd.dts @@ -659,7 +659,7 @@ touchscreen@0 { &tlmm { /* Reserved I/Os for NFC */ - gpio-reserved-ranges = <32 8>; + gpio-reserved-ranges = <32 8>, <74 1>; bt_default: bt-default-state { bt-en-pins { -- GitLab From df77288f7e3accf246785c53cd5f117fc5d81611 Mon Sep 17 00:00:00 2001 From: Neil Armstrong Date: Thu, 11 Jan 2024 17:58:50 +0100 Subject: [PATCH 0110/1519] arm64: dts: qcom: sm8650-mtp: add gpio74 as reserved gpio The TLMM gpio74 is also used to communicate with the secure NFC on-board module. Some variants of the SM8650-MTP board require this GPIO to be dedicated to the secure firmware and marked as reserved in order to successfully initialize the TLMM GPIOs from HLOS (Linux). On the other boards this GPIO is unused, so it's still safe to mark the GPIO as reserved.
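For context on the property being touched here: each gpio-reserved-ranges entry is a <first-gpio count> cell pair, and gpiolib uses the list to clear the named lines from the controller's valid mask so HLOS never claims them. A minimal sketch of the resulting &tlmm node after this pair of changes (surrounding properties omitted; the node layout is illustrative, only the property line itself comes from the patches):

&tlmm {
	/* <first-gpio count> pairs: gpios 32..39 for NFC, plus gpio74 on its own */
	gpio-reserved-ranges = <32 8>, <74 1>;
};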
Fixes: 6fbdb3c1fac7 ("arm64: dts: qcom: sm8650: add initial SM8650 MTP dts") Reported-by: Georgi Djakov Signed-off-by: Neil Armstrong Reviewed-by: Konrad Dybcio Tested-by: Georgi Djakov Reviewed-by: Elliot Berman Link: https://lore.kernel.org/r/20240111-topic-sm8650-upstream-qrd-fix-gpio-reserved-v1-2-fad39b4c5def@linaro.org Signed-off-by: Bjorn Andersson --- arch/arm64/boot/dts/qcom/sm8650-mtp.dts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/qcom/sm8650-mtp.dts b/arch/arm64/boot/dts/qcom/sm8650-mtp.dts index 9d916edb1c73c..be133a3d5cbe0 100644 --- a/arch/arm64/boot/dts/qcom/sm8650-mtp.dts +++ b/arch/arm64/boot/dts/qcom/sm8650-mtp.dts @@ -622,7 +622,7 @@ right_spkr: speaker@0,1 { &tlmm { /* Reserved I/Os for NFC */ - gpio-reserved-ranges = <32 8>; + gpio-reserved-ranges = <32 8>, <74 1>; disp0_reset_n_active: disp0-reset-n-active-state { pins = "gpio133"; -- GitLab From 3e6454177f3a76265cba4901d5caa4eb0c7b6447 Mon Sep 17 00:00:00 2001 From: Alice Ryhl Date: Mon, 8 Jan 2024 14:49:57 +0000 Subject: [PATCH 0111/1519] rust: sync: add `CondVar::notify_sync` Wake up another thread synchronously. This method behaves like `notify_one`, except that it hints to the scheduler that the current thread is about to go to sleep, so it should schedule the target thread on the same CPU. This is used by Rust Binder as a performance optimization. When sending a transaction to a different process, we usually know which thread will handle it, so we can schedule that thread for execution next on this CPU for better cache locality. Reviewed-by: Benno Lossin Reviewed-by: Martin Rodriguez Reboredo Reviewed-by: Tiago Lam Reviewed-by: Boqun Feng Signed-off-by: Alice Ryhl Link: https://lore.kernel.org/r/20240108-rb-new-condvar-methods-v4-1-88e0c871cc05@google.com Signed-off-by: Miguel Ojeda --- rust/kernel/sync/condvar.rs | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/rust/kernel/sync/condvar.rs b/rust/kernel/sync/condvar.rs index 90108cc6da0b6..30fb5cbfb730f 100644 --- a/rust/kernel/sync/condvar.rs +++ b/rust/kernel/sync/condvar.rs @@ -159,6 +159,16 @@ fn notify(&self, count: i32, flags: u32) { }; } + /// Calls the kernel function to notify one thread synchronously. + /// + /// This method behaves like `notify_one`, except that it hints to the scheduler that the + /// current thread is about to go to sleep, so it should schedule the target thread on the same + /// CPU. + pub fn notify_sync(&self) { + // SAFETY: `wait_queue_head` points to valid memory. + unsafe { bindings::__wake_up_sync(self.wait_queue_head.get(), bindings::TASK_NORMAL) }; + } + /// Wakes a single waiter up, if any. /// /// This is not 'sticky' in the sense that if no thread is waiting, the notification is lost -- GitLab From 82e170874849c4c57dba6b4ee1a08fae1c0a5f02 Mon Sep 17 00:00:00 2001 From: Alice Ryhl Date: Mon, 8 Jan 2024 14:49:58 +0000 Subject: [PATCH 0112/1519] rust: time: add msecs to jiffies conversion Defines type aliases and conversions for msecs and jiffies. This is used by Rust Binder for process freezing. There, we want to sleep until the freeze operation completes, but we want to be able to abort the process freezing if it doesn't complete within some timeout. The freeze timeout is supplied in msecs. Note that we need to convert to jiffies in Binder. 
It is not enough to introduce a variant of `CondVar::wait_timeout` that takes the timeout in msecs because we need to be able to restart the sleep with the remaining sleep duration if it is interrupted, and if the API takes msecs rather than jiffies, then that would require a conversion roundtrip jiffies-> msecs->jiffies that is best avoided. Suggested-by: Boqun Feng Reviewed-by: Boqun Feng Reviewed-by: Benno Lossin Reviewed-by: Martin Rodriguez Reboredo Reviewed-by: Tiago Lam Signed-off-by: Alice Ryhl Link: https://lore.kernel.org/r/20240108-rb-new-condvar-methods-v4-2-88e0c871cc05@google.com Signed-off-by: Miguel Ojeda --- rust/bindings/bindings_helper.h | 1 + rust/kernel/lib.rs | 1 + rust/kernel/time.rs | 20 ++++++++++++++++++++ 3 files changed, 22 insertions(+) create mode 100644 rust/kernel/time.rs diff --git a/rust/bindings/bindings_helper.h b/rust/bindings/bindings_helper.h index c0cb4b05b9185..936651110c39a 100644 --- a/rust/bindings/bindings_helper.h +++ b/rust/bindings/bindings_helper.h @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include diff --git a/rust/kernel/lib.rs b/rust/kernel/lib.rs index cb2d024db51fe..b89ecf4e97a07 100644 --- a/rust/kernel/lib.rs +++ b/rust/kernel/lib.rs @@ -48,6 +48,7 @@ pub mod str; pub mod sync; pub mod task; +pub mod time; pub mod types; pub mod workqueue; diff --git a/rust/kernel/time.rs b/rust/kernel/time.rs new file mode 100644 index 0000000000000..25a896eed4689 --- /dev/null +++ b/rust/kernel/time.rs @@ -0,0 +1,20 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Time related primitives. +//! +//! This module contains the kernel APIs related to time and timers that +//! have been ported or wrapped for usage by Rust code in the kernel. + +/// The time unit of Linux kernel. One jiffy equals (1/HZ) second. +pub type Jiffies = core::ffi::c_ulong; + +/// The millisecond time unit. +pub type Msecs = core::ffi::c_uint; + +/// Converts milliseconds to jiffies. +#[inline] +pub fn msecs_to_jiffies(msecs: Msecs) -> Jiffies { + // SAFETY: The `__msecs_to_jiffies` function is always safe to call no + // matter what the argument is. + unsafe { bindings::__msecs_to_jiffies(msecs) } +} -- GitLab From e7b9b1ff1d49e2117f2d78fa5f11b0d006c169f7 Mon Sep 17 00:00:00 2001 From: Alice Ryhl Date: Mon, 8 Jan 2024 14:49:59 +0000 Subject: [PATCH 0113/1519] rust: sync: add `CondVar::wait_timeout` Sleep on a condition variable with a timeout. This is used by Rust Binder for process freezing. There, we want to sleep until the freeze operation completes, but we want to be able to abort the process freezing if it doesn't complete within some timeout. Note that it is not enough to avoid jiffies by introducing a variant of `CondVar::wait_timeout` that takes the timeout in msecs because we need to be able to restart the sleep with the remaining sleep duration if it is interrupted, and if the API takes msecs rather than jiffies, then that would require a conversion roundtrip jiffies->msecs->jiffies that is best avoided. Reviewed-by: Martin Rodriguez Reboredo Reviewed-by: Tiago Lam Reviewed-by: Boqun Feng Signed-off-by: Alice Ryhl Reviewed-by: Benno Lossin Link: https://lore.kernel.org/r/20240108-rb-new-condvar-methods-v4-3-88e0c871cc05@google.com [ Added `CondVarTimeoutResult` re-export and fixed typo. 
] Signed-off-by: Miguel Ojeda --- rust/kernel/sync.rs | 2 +- rust/kernel/sync/condvar.rs | 60 +++++++++++++++++++++++++++++++++---- rust/kernel/sync/lock.rs | 4 +-- rust/kernel/task.rs | 5 +++- 4 files changed, 61 insertions(+), 10 deletions(-) diff --git a/rust/kernel/sync.rs b/rust/kernel/sync.rs index d219ee518eff1..c1fb10fc64f4b 100644 --- a/rust/kernel/sync.rs +++ b/rust/kernel/sync.rs @@ -13,7 +13,7 @@ mod locked_by; pub use arc::{Arc, ArcBorrow, UniqueArc}; -pub use condvar::CondVar; +pub use condvar::{CondVar, CondVarTimeoutResult}; pub use lock::{mutex::Mutex, spinlock::SpinLock}; pub use locked_by::LockedBy; diff --git a/rust/kernel/sync/condvar.rs b/rust/kernel/sync/condvar.rs index 30fb5cbfb730f..3f148d7040bb9 100644 --- a/rust/kernel/sync/condvar.rs +++ b/rust/kernel/sync/condvar.rs @@ -6,7 +6,11 @@ //! variable. use super::{lock::Backend, lock::Guard, LockClassKey}; -use crate::{bindings, init::PinInit, pin_init, str::CStr, types::Opaque}; +use crate::{ + bindings, init::PinInit, pin_init, str::CStr, task::MAX_SCHEDULE_TIMEOUT, time::Jiffies, + types::Opaque, +}; +use core::ffi::c_long; use core::marker::PhantomPinned; use macros::pin_data; @@ -102,7 +106,12 @@ pub fn new(name: &'static CStr, key: &'static LockClassKey) -> impl PinInit(&self, wait_state: u32, guard: &mut Guard<'_, T, B>) { + fn wait_internal( + &self, + wait_state: u32, + guard: &mut Guard<'_, T, B>, + timeout_in_jiffies: c_long, + ) -> c_long { let wait = Opaque::::uninit(); // SAFETY: `wait` points to valid memory. @@ -117,11 +126,13 @@ fn wait_internal(&self, wait_state: u32, guard: &mut Guar ) }; - // SAFETY: No arguments, switches to another thread. - guard.do_unlocked(|| unsafe { bindings::schedule() }); + // SAFETY: Switches to another thread. The timeout can be any number. + let ret = guard.do_unlocked(|| unsafe { bindings::schedule_timeout(timeout_in_jiffies) }); // SAFETY: Both `wait` and `wait_queue_head` point to valid memory. unsafe { bindings::finish_wait(self.wait_queue_head.get(), wait.get()) }; + + ret } /// Releases the lock and waits for a notification in uninterruptible mode. @@ -131,7 +142,7 @@ fn wait_internal(&self, wait_state: u32, guard: &mut Guar /// [`CondVar::notify_one`] or [`CondVar::notify_all`]. Note that it may also wake up /// spuriously. pub fn wait(&self, guard: &mut Guard<'_, T, B>) { - self.wait_internal(bindings::TASK_UNINTERRUPTIBLE, guard); + self.wait_internal(bindings::TASK_UNINTERRUPTIBLE, guard, MAX_SCHEDULE_TIMEOUT); } /// Releases the lock and waits for a notification in interruptible mode. @@ -142,10 +153,31 @@ pub fn wait(&self, guard: &mut Guard<'_, T, B>) { /// Returns whether there is a signal pending. #[must_use = "wait_interruptible returns if a signal is pending, so the caller must check the return value"] pub fn wait_interruptible(&self, guard: &mut Guard<'_, T, B>) -> bool { - self.wait_internal(bindings::TASK_INTERRUPTIBLE, guard); + self.wait_internal(bindings::TASK_INTERRUPTIBLE, guard, MAX_SCHEDULE_TIMEOUT); crate::current!().signal_pending() } + /// Releases the lock and waits for a notification in interruptible mode. + /// + /// Atomically releases the given lock (whose ownership is proven by the guard) and puts the + /// thread to sleep. It wakes up when notified by [`CondVar::notify_one`] or + /// [`CondVar::notify_all`], or when a timeout occurs, or when the thread receives a signal. 
+ #[must_use = "wait_interruptible_timeout returns if a signal is pending, so the caller must check the return value"] + pub fn wait_interruptible_timeout( + &self, + guard: &mut Guard<'_, T, B>, + jiffies: Jiffies, + ) -> CondVarTimeoutResult { + let jiffies = jiffies.try_into().unwrap_or(MAX_SCHEDULE_TIMEOUT); + let res = self.wait_internal(bindings::TASK_INTERRUPTIBLE, guard, jiffies); + + match (res as Jiffies, crate::current!().signal_pending()) { + (jiffies, true) => CondVarTimeoutResult::Signal { jiffies }, + (0, false) => CondVarTimeoutResult::Timeout, + (jiffies, false) => CondVarTimeoutResult::Woken { jiffies }, + } + } + /// Calls the kernel function to notify the appropriate number of threads with the given flags. fn notify(&self, count: i32, flags: u32) { // SAFETY: `wait_queue_head` points to valid memory. @@ -185,3 +217,19 @@ pub fn notify_all(&self) { self.notify(0, 0); } } + +/// The return type of `wait_timeout`. +pub enum CondVarTimeoutResult { + /// The timeout was reached. + Timeout, + /// Somebody woke us up. + Woken { + /// Remaining sleep duration. + jiffies: Jiffies, + }, + /// A signal occurred. + Signal { + /// Remaining sleep duration. + jiffies: Jiffies, + }, +} diff --git a/rust/kernel/sync/lock.rs b/rust/kernel/sync/lock.rs index f12a684bc9579..149a5259d431a 100644 --- a/rust/kernel/sync/lock.rs +++ b/rust/kernel/sync/lock.rs @@ -139,7 +139,7 @@ pub struct Guard<'a, T: ?Sized, B: Backend> { unsafe impl Sync for Guard<'_, T, B> {} impl Guard<'_, T, B> { - pub(crate) fn do_unlocked(&mut self, cb: impl FnOnce()) { + pub(crate) fn do_unlocked(&mut self, cb: impl FnOnce() -> U) -> U { // SAFETY: The caller owns the lock, so it is safe to unlock it. unsafe { B::unlock(self.lock.state.get(), &self.state) }; @@ -147,7 +147,7 @@ pub(crate) fn do_unlocked(&mut self, cb: impl FnOnce()) { let _relock = ScopeGuard::new(|| unsafe { B::relock(self.lock.state.get(), &mut self.state) }); - cb(); + cb() } } diff --git a/rust/kernel/task.rs b/rust/kernel/task.rs index 9451932d5d867..49a8e10d68f5a 100644 --- a/rust/kernel/task.rs +++ b/rust/kernel/task.rs @@ -5,7 +5,10 @@ //! C header: [`include/linux/sched.h`](srctree/include/linux/sched.h). use crate::{bindings, types::Opaque}; -use core::{marker::PhantomData, ops::Deref, ptr}; +use core::{ffi::c_long, marker::PhantomData, ops::Deref, ptr}; + +/// A sentinel value used for infinite timeouts. +pub const MAX_SCHEDULE_TIMEOUT: c_long = c_long::MAX; /// Returns the currently running task. #[macro_export] -- GitLab From f090f0d0eea9666a96702b29bc9a64cbabee85c5 Mon Sep 17 00:00:00 2001 From: Alice Ryhl Date: Mon, 8 Jan 2024 14:50:00 +0000 Subject: [PATCH 0114/1519] rust: sync: update integer types in CondVar Reduce the chances of compilation failures due to integer type mismatches in `CondVar`. When an integer is defined using a #define in C, bindgen doesn't know which integer type it is supposed to be, so it will just use `u32` by default (if it fits in an u32). Whenever the right type is something else, we insert a cast in Rust. However, this means that the code has a lot of extra casts, and sometimes the code will be missing casts if u32 happens to be correct on the developer's machine, even though the type might be something else on a different platform. This patch updates all uses of such constants in `rust/kernel/sync/condvar.rs` to use constants defined with the right type. 
This allows us to remove various unnecessary casts, while also future-proofing for the case where `unsigned int != u32` (even though that is unlikely to ever happen in the kernel). I wrote this patch at the suggestion of Benno in [1]. Link: https://lore.kernel.org/all/nAEg-6vbtX72ZY3oirDhrSEf06TBWmMiTt73EklMzEAzN4FD4mF3TPEyAOxBZgZtjzoiaBYtYr3s8sa9wp1uYH9vEWRf2M-Lf4I0BY9rAgk=@proton.me/ [1] Suggested-by: Benno Lossin Reviewed-by: Tiago Lam Reviewed-by: Boqun Feng Reviewed-by: Benno Lossin Reviewed-by: Martin Rodriguez Reboredo Signed-off-by: Alice Ryhl Link: https://lore.kernel.org/r/20240108-rb-new-condvar-methods-v4-4-88e0c871cc05@google.com [ Added note on the unlikeliness of `sizeof(int)` changing. ] Signed-off-by: Miguel Ojeda --- rust/kernel/sync/condvar.rs | 38 +++++++++++++++++++------------------ rust/kernel/task.rs | 15 ++++++++++++++- 2 files changed, 34 insertions(+), 19 deletions(-) diff --git a/rust/kernel/sync/condvar.rs b/rust/kernel/sync/condvar.rs index 3f148d7040bb9..9f1d83589beb6 100644 --- a/rust/kernel/sync/condvar.rs +++ b/rust/kernel/sync/condvar.rs @@ -7,11 +7,17 @@ use super::{lock::Backend, lock::Guard, LockClassKey}; use crate::{ - bindings, init::PinInit, pin_init, str::CStr, task::MAX_SCHEDULE_TIMEOUT, time::Jiffies, + bindings, + init::PinInit, + pin_init, + str::CStr, + task::{MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE, TASK_NORMAL, TASK_UNINTERRUPTIBLE}, + time::Jiffies, types::Opaque, }; -use core::ffi::c_long; +use core::ffi::{c_int, c_long}; use core::marker::PhantomPinned; +use core::ptr; use macros::pin_data; /// Creates a [`CondVar`] initialiser with the given name and a newly-created lock class. @@ -108,7 +114,7 @@ pub fn new(name: &'static CStr, key: &'static LockClassKey) -> impl PinInit( &self, - wait_state: u32, + wait_state: c_int, guard: &mut Guard<'_, T, B>, timeout_in_jiffies: c_long, ) -> c_long { @@ -119,11 +125,7 @@ fn wait_internal( // SAFETY: Both `wait` and `wait_queue_head` point to valid memory. unsafe { - bindings::prepare_to_wait_exclusive( - self.wait_queue_head.get(), - wait.get(), - wait_state as _, - ) + bindings::prepare_to_wait_exclusive(self.wait_queue_head.get(), wait.get(), wait_state) }; // SAFETY: Switches to another thread. The timeout can be any number. @@ -142,7 +144,7 @@ fn wait_internal( /// [`CondVar::notify_one`] or [`CondVar::notify_all`]. Note that it may also wake up /// spuriously. pub fn wait(&self, guard: &mut Guard<'_, T, B>) { - self.wait_internal(bindings::TASK_UNINTERRUPTIBLE, guard, MAX_SCHEDULE_TIMEOUT); + self.wait_internal(TASK_UNINTERRUPTIBLE, guard, MAX_SCHEDULE_TIMEOUT); } /// Releases the lock and waits for a notification in interruptible mode. @@ -153,7 +155,7 @@ pub fn wait(&self, guard: &mut Guard<'_, T, B>) { /// Returns whether there is a signal pending. 
#[must_use = "wait_interruptible returns if a signal is pending, so the caller must check the return value"] pub fn wait_interruptible(&self, guard: &mut Guard<'_, T, B>) -> bool { - self.wait_internal(bindings::TASK_INTERRUPTIBLE, guard, MAX_SCHEDULE_TIMEOUT); + self.wait_internal(TASK_INTERRUPTIBLE, guard, MAX_SCHEDULE_TIMEOUT); crate::current!().signal_pending() } @@ -169,7 +171,7 @@ pub fn wait_interruptible_timeout( jiffies: Jiffies, ) -> CondVarTimeoutResult { let jiffies = jiffies.try_into().unwrap_or(MAX_SCHEDULE_TIMEOUT); - let res = self.wait_internal(bindings::TASK_INTERRUPTIBLE, guard, jiffies); + let res = self.wait_internal(TASK_INTERRUPTIBLE, guard, jiffies); match (res as Jiffies, crate::current!().signal_pending()) { (jiffies, true) => CondVarTimeoutResult::Signal { jiffies }, @@ -178,15 +180,15 @@ pub fn wait_interruptible_timeout( } } - /// Calls the kernel function to notify the appropriate number of threads with the given flags. - fn notify(&self, count: i32, flags: u32) { + /// Calls the kernel function to notify the appropriate number of threads. + fn notify(&self, count: c_int) { // SAFETY: `wait_queue_head` points to valid memory. unsafe { bindings::__wake_up( self.wait_queue_head.get(), - bindings::TASK_NORMAL, + TASK_NORMAL, count, - flags as _, + ptr::null_mut(), ) }; } @@ -198,7 +200,7 @@ fn notify(&self, count: i32, flags: u32) { /// CPU. pub fn notify_sync(&self) { // SAFETY: `wait_queue_head` points to valid memory. - unsafe { bindings::__wake_up_sync(self.wait_queue_head.get(), bindings::TASK_NORMAL) }; + unsafe { bindings::__wake_up_sync(self.wait_queue_head.get(), TASK_NORMAL) }; } /// Wakes a single waiter up, if any. @@ -206,7 +208,7 @@ pub fn notify_sync(&self) { /// This is not 'sticky' in the sense that if no thread is waiting, the notification is lost /// completely (as opposed to automatically waking up the next waiter). pub fn notify_one(&self) { - self.notify(1, 0); + self.notify(1); } /// Wakes all waiters up, if any. @@ -214,7 +216,7 @@ pub fn notify_one(&self) { /// This is not 'sticky' in the sense that if no thread is waiting, the notification is lost /// completely (as opposed to automatically waking up the next waiter). pub fn notify_all(&self) { - self.notify(0, 0); + self.notify(0); } } diff --git a/rust/kernel/task.rs b/rust/kernel/task.rs index 49a8e10d68f5a..a3a4007db682a 100644 --- a/rust/kernel/task.rs +++ b/rust/kernel/task.rs @@ -5,11 +5,24 @@ //! C header: [`include/linux/sched.h`](srctree/include/linux/sched.h). use crate::{bindings, types::Opaque}; -use core::{ffi::c_long, marker::PhantomData, ops::Deref, ptr}; +use core::{ + ffi::{c_int, c_long, c_uint}, + marker::PhantomData, + ops::Deref, + ptr, +}; /// A sentinel value used for infinite timeouts. pub const MAX_SCHEDULE_TIMEOUT: c_long = c_long::MAX; +/// Bitmask for tasks that are sleeping in an interruptible state. +pub const TASK_INTERRUPTIBLE: c_int = bindings::TASK_INTERRUPTIBLE as c_int; +/// Bitmask for tasks that are sleeping in an uninterruptible state. +pub const TASK_UNINTERRUPTIBLE: c_int = bindings::TASK_UNINTERRUPTIBLE as c_int; +/// Convenience constant for waking up tasks regardless of whether they are in interruptible or +/// uninterruptible sleep. +pub const TASK_NORMAL: c_uint = bindings::TASK_NORMAL as c_uint; + /// Returns the currently running task. #[macro_export] macro_rules! 
current { -- GitLab From 453f0ae797328e675840466c80e5b268d7feb9ba Mon Sep 17 00:00:00 2001 From: Muralidhara M K Date: Sun, 28 Jan 2024 09:59:50 -0600 Subject: [PATCH 0115/1519] RAS/AMD/ATL: Add MI300 support AMD MI300 systems include on-die HBM3 memory and a unique topology. And they fall under Data Fabric version 4.5 in overall design. Generally, topology information (IDs, etc.) is gathered from Data Fabric registers. However, the unique topology for MI300 means that some topology information is fixed in hardware and follows arbitrary mappings. Furthermore, not all hardware instances are software-visible, so register accesses must be adjusted. Recognize and add helper functions for the new MI300 interleave modes. Add lookup tables for fixed values where appropriate. Adjust how Die and Node IDs are found and used. Also, fix some register bitmasks that were mislabeled. Signed-off-by: Muralidhara M K Co-developed-by: Yazen Ghannam Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20240128155950.1434067-1-yazen.ghannam@amd.com --- drivers/ras/amd/atl/access.c | 27 ++++++++ drivers/ras/amd/atl/dehash.c | 95 +++++++++++++++++++++++++++- drivers/ras/amd/atl/denormalize.c | 102 ++++++++++++++++++++++++++++++ drivers/ras/amd/atl/internal.h | 10 ++- drivers/ras/amd/atl/map.c | 17 +++++ drivers/ras/amd/atl/reg_fields.h | 9 ++- drivers/ras/amd/atl/system.c | 3 + drivers/ras/amd/atl/umc.c | 51 +++++++++++++++ 8 files changed, 309 insertions(+), 5 deletions(-) diff --git a/drivers/ras/amd/atl/access.c b/drivers/ras/amd/atl/access.c index f6dd87bb2c35a..ee4661ed28ba7 100644 --- a/drivers/ras/amd/atl/access.c +++ b/drivers/ras/amd/atl/access.c @@ -36,6 +36,32 @@ static DEFINE_MUTEX(df_indirect_mutex); #define DF_FICAA_REG_NUM_LEGACY GENMASK(10, 2) +static u16 get_accessible_node(u16 node) +{ + /* + * On heterogeneous systems, not all AMD Nodes are accessible + * through software-visible registers. The Node ID needs to be + * adjusted for register accesses. But its value should not be + * changed for the translation methods. + */ + if (df_cfg.flags.heterogeneous) { + /* Only Node 0 is accessible on DF3.5 systems. */ + if (df_cfg.rev == DF3p5) + node = 0; + + /* + * Only the first Node in each Socket is accessible on + * DF4.5 systems, and this is visible to software as one + * Fabric per Socket. The Socket ID can be derived from + * the Node ID and global shift values. 
+ */ + if (df_cfg.rev == DF4p5) + node >>= df_cfg.socket_id_shift - df_cfg.node_id_shift; + } + + return node; +} + static int __df_indirect_read(u16 node, u8 func, u16 reg, u8 instance_id, u32 *lo) { u32 ficaa_addr = 0x8C, ficad_addr = 0xB8; @@ -43,6 +69,7 @@ static int __df_indirect_read(u16 node, u8 func, u16 reg, u8 instance_id, u32 *l int err = -ENODEV; u32 ficaa = 0; + node = get_accessible_node(node); if (node >= amd_nb_num()) goto out; diff --git a/drivers/ras/amd/atl/dehash.c b/drivers/ras/amd/atl/dehash.c index 6f414926e6fe1..4ea46262c4f58 100644 --- a/drivers/ras/amd/atl/dehash.c +++ b/drivers/ras/amd/atl/dehash.c @@ -253,7 +253,7 @@ static int df4p5_dehash_addr(struct addr_ctx *ctx) hash_ctl_64k = FIELD_GET(DF4_HASH_CTL_64K, ctx->map.ctl); hash_ctl_2M = FIELD_GET(DF4_HASH_CTL_2M, ctx->map.ctl); hash_ctl_1G = FIELD_GET(DF4_HASH_CTL_1G, ctx->map.ctl); - hash_ctl_1T = FIELD_GET(DF4_HASH_CTL_1T, ctx->map.ctl); + hash_ctl_1T = FIELD_GET(DF4p5_HASH_CTL_1T, ctx->map.ctl); /* * Generate a unique address to determine which bits @@ -343,6 +343,94 @@ static int df4p5_dehash_addr(struct addr_ctx *ctx) return 0; } +/* + * MI300 hash bits + * 4K 64K 2M 1G 1T 1T + * COH_ST_Select[0] = XOR of addr{8, 12, 15, 22, 29, 36, 43} + * COH_ST_Select[1] = XOR of addr{9, 13, 16, 23, 30, 37, 44} + * COH_ST_Select[2] = XOR of addr{10, 14, 17, 24, 31, 38, 45} + * COH_ST_Select[3] = XOR of addr{11, 18, 25, 32, 39, 46} + * COH_ST_Select[4] = XOR of addr{14, 19, 26, 33, 40, 47} aka Stack + * DieID[0] = XOR of addr{12, 20, 27, 34, 41 } + * DieID[1] = XOR of addr{13, 21, 28, 35, 42 } + */ +static int mi300_dehash_addr(struct addr_ctx *ctx) +{ + bool hash_ctl_4k, hash_ctl_64k, hash_ctl_2M, hash_ctl_1G, hash_ctl_1T; + bool hashed_bit, intlv_bit, test_bit; + u8 num_intlv_bits, base_bit, i; + + if (!map_bits_valid(ctx, 8, 8, 4, 1)) + return -EINVAL; + + hash_ctl_4k = FIELD_GET(DF4p5_HASH_CTL_4K, ctx->map.ctl); + hash_ctl_64k = FIELD_GET(DF4_HASH_CTL_64K, ctx->map.ctl); + hash_ctl_2M = FIELD_GET(DF4_HASH_CTL_2M, ctx->map.ctl); + hash_ctl_1G = FIELD_GET(DF4_HASH_CTL_1G, ctx->map.ctl); + hash_ctl_1T = FIELD_GET(DF4p5_HASH_CTL_1T, ctx->map.ctl); + + /* Channel bits */ + num_intlv_bits = ilog2(ctx->map.num_intlv_chan); + + for (i = 0; i < num_intlv_bits; i++) { + base_bit = 8 + i; + + /* COH_ST_Select[4] jumps to a base bit of 14. */ + if (i == 4) + base_bit = 14; + + intlv_bit = BIT_ULL(base_bit) & ctx->ret_addr; + + hashed_bit = intlv_bit; + + /* 4k hash bit only applies to the first 3 bits. */ + if (i <= 2) { + test_bit = BIT_ULL(12 + i) & ctx->ret_addr; + hashed_bit ^= test_bit & hash_ctl_4k; + } + + /* Use temporary 'test_bit' value to avoid Sparse warnings. 
*/ + test_bit = BIT_ULL(15 + i) & ctx->ret_addr; + hashed_bit ^= test_bit & hash_ctl_64k; + test_bit = BIT_ULL(22 + i) & ctx->ret_addr; + hashed_bit ^= test_bit & hash_ctl_2M; + test_bit = BIT_ULL(29 + i) & ctx->ret_addr; + hashed_bit ^= test_bit & hash_ctl_1G; + test_bit = BIT_ULL(36 + i) & ctx->ret_addr; + hashed_bit ^= test_bit & hash_ctl_1T; + test_bit = BIT_ULL(43 + i) & ctx->ret_addr; + hashed_bit ^= test_bit & hash_ctl_1T; + + if (hashed_bit != intlv_bit) + ctx->ret_addr ^= BIT_ULL(base_bit); + } + + /* Die bits */ + num_intlv_bits = ilog2(ctx->map.num_intlv_dies); + + for (i = 0; i < num_intlv_bits; i++) { + base_bit = 12 + i; + + intlv_bit = BIT_ULL(base_bit) & ctx->ret_addr; + + hashed_bit = intlv_bit; + + test_bit = BIT_ULL(20 + i) & ctx->ret_addr; + hashed_bit ^= test_bit & hash_ctl_64k; + test_bit = BIT_ULL(27 + i) & ctx->ret_addr; + hashed_bit ^= test_bit & hash_ctl_2M; + test_bit = BIT_ULL(34 + i) & ctx->ret_addr; + hashed_bit ^= test_bit & hash_ctl_1G; + test_bit = BIT_ULL(41 + i) & ctx->ret_addr; + hashed_bit ^= test_bit & hash_ctl_1T; + + if (hashed_bit != intlv_bit) + ctx->ret_addr ^= BIT_ULL(base_bit); + } + + return 0; +} + int dehash_address(struct addr_ctx *ctx) { switch (ctx->map.intlv_mode) { @@ -400,6 +488,11 @@ int dehash_address(struct addr_ctx *ctx) case DF4p5_NPS1_16CHAN_2K_HASH: return df4p5_dehash_addr(ctx); + case MI3_HASH_8CHAN: + case MI3_HASH_16CHAN: + case MI3_HASH_32CHAN: + return mi300_dehash_addr(ctx); + default: atl_debug_on_bad_intlv_mode(ctx); return -EINVAL; diff --git a/drivers/ras/amd/atl/denormalize.c b/drivers/ras/amd/atl/denormalize.c index 01f1d0fb67995..d5d0e1fda1590 100644 --- a/drivers/ras/amd/atl/denormalize.c +++ b/drivers/ras/amd/atl/denormalize.c @@ -80,6 +80,40 @@ static u64 make_space_for_coh_st_id_split_2_1(struct addr_ctx *ctx) return expand_bits(12, ctx->map.total_intlv_bits - 1, denorm_addr); } +/* + * Make space for CS ID at bits [14:8] as follows: + * + * 8 channels -> bits [10:8] + * 16 channels -> bits [11:8] + * 32 channels -> bits [14,11:8] + * + * 1 die -> N/A + * 2 dies -> bit [12] + * 4 dies -> bits [13:12] + */ +static u64 make_space_for_coh_st_id_mi300(struct addr_ctx *ctx) +{ + u8 num_intlv_bits = ilog2(ctx->map.num_intlv_chan); + u64 denorm_addr; + + if (ctx->map.intlv_bit_pos != 8) { + pr_debug("Invalid interleave bit: %u", ctx->map.intlv_bit_pos); + return ~0ULL; + } + + /* Channel bits. Covers up to 4 bits at [11:8]. */ + denorm_addr = expand_bits(8, min(num_intlv_bits, 4), ctx->ret_addr); + + /* Die bits. Always starts at [12]. */ + denorm_addr = expand_bits(12, ilog2(ctx->map.num_intlv_dies), denorm_addr); + + /* Additional channel bit at [14]. */ + if (num_intlv_bits > 4) + denorm_addr = expand_bits(14, 1, denorm_addr); + + return denorm_addr; +} + /* * Take the current calculated address and shift enough bits in the middle * to make a gap where the interleave bits will be inserted. 
@@ -107,6 +141,12 @@ static u64 make_space_for_coh_st_id(struct addr_ctx *ctx) case DF4p5_NPS1_8CHAN_2K_HASH: case DF4p5_NPS1_16CHAN_2K_HASH: return make_space_for_coh_st_id_split_2_1(ctx); + + case MI3_HASH_8CHAN: + case MI3_HASH_16CHAN: + case MI3_HASH_32CHAN: + return make_space_for_coh_st_id_mi300(ctx); + default: atl_debug_on_bad_intlv_mode(ctx); return ~0ULL; @@ -204,6 +244,32 @@ static u16 get_coh_st_id_df4(struct addr_ctx *ctx) return coh_st_id; } +/* + * MI300 hash has: + * (C)hannel[3:0] = coh_st_id[3:0] + * (S)tack[0] = coh_st_id[4] + * (D)ie[1:0] = coh_st_id[6:5] + * + * Hashed coh_st_id is swizzled so that Stack bit is at the end. + * coh_st_id = SDDCCCC + */ +static u16 get_coh_st_id_mi300(struct addr_ctx *ctx) +{ + u8 channel_bits, die_bits, stack_bit; + u16 die_id; + + /* Subtract the "base" Destination Fabric ID. */ + ctx->coh_st_fabric_id -= get_dst_fabric_id(ctx); + + die_id = (ctx->coh_st_fabric_id & df_cfg.die_id_mask) >> df_cfg.die_id_shift; + + channel_bits = FIELD_GET(GENMASK(3, 0), ctx->coh_st_fabric_id); + stack_bit = FIELD_GET(BIT(4), ctx->coh_st_fabric_id) << 6; + die_bits = die_id << 4; + + return stack_bit | die_bits | channel_bits; +} + /* * Derive the correct Coherent Station ID that represents the interleave bits * used within the system physical address. This accounts for the @@ -237,6 +303,11 @@ static u16 calculate_coh_st_id(struct addr_ctx *ctx) case DF4p5_NPS1_16CHAN_2K_HASH: return get_coh_st_id_df4(ctx); + case MI3_HASH_8CHAN: + case MI3_HASH_16CHAN: + case MI3_HASH_32CHAN: + return get_coh_st_id_mi300(ctx); + /* COH_ST ID is simply the COH_ST Fabric ID adjusted by the Destination Fabric ID. */ case DF4p5_NPS2_4CHAN_1K_HASH: case DF4p5_NPS1_8CHAN_1K_HASH: @@ -287,6 +358,9 @@ static u64 insert_coh_st_id(struct addr_ctx *ctx, u64 denorm_addr, u16 coh_st_id case NOHASH_8CHAN: case NOHASH_16CHAN: case NOHASH_32CHAN: + case MI3_HASH_8CHAN: + case MI3_HASH_16CHAN: + case MI3_HASH_32CHAN: case DF2_2CHAN_HASH: return insert_coh_st_id_at_intlv_bit(ctx, denorm_addr, coh_st_id); @@ -314,6 +388,31 @@ static u64 insert_coh_st_id(struct addr_ctx *ctx, u64 denorm_addr, u16 coh_st_id } } +/* + * MI300 systems have a fixed, hardware-defined physical-to-logical + * Coherent Station mapping. The Remap registers are not used. + */ +static const u16 phy_to_log_coh_st_map_mi300[] = { + 12, 13, 14, 15, + 8, 9, 10, 11, + 4, 5, 6, 7, + 0, 1, 2, 3, + 28, 29, 30, 31, + 24, 25, 26, 27, + 20, 21, 22, 23, + 16, 17, 18, 19, +}; + +static u16 get_logical_coh_st_fabric_id_mi300(struct addr_ctx *ctx) +{ + if (ctx->inst_id >= sizeof(phy_to_log_coh_st_map_mi300)) { + atl_debug(ctx, "Instance ID out of range"); + return ~0; + } + + return phy_to_log_coh_st_map_mi300[ctx->inst_id] | (ctx->node_id << df_cfg.node_id_shift); +} + static u16 get_logical_coh_st_fabric_id(struct addr_ctx *ctx) { u16 component_id, log_fabric_id; @@ -321,6 +420,9 @@ static u16 get_logical_coh_st_fabric_id(struct addr_ctx *ctx) /* Start with the physical COH_ST Fabric ID. */ u16 phys_fabric_id = ctx->coh_st_fabric_id; + if (df_cfg.rev == DF4p5 && df_cfg.flags.heterogeneous) + return get_logical_coh_st_fabric_id_mi300(ctx); + /* Skip logical ID lookup if remapping is disabled. */ if (!FIELD_GET(DF4_REMAP_EN, ctx->map.ctl) && ctx->map.intlv_mode != DF3_6CHAN) diff --git a/drivers/ras/amd/atl/internal.h b/drivers/ras/amd/atl/internal.h index 13f1b6098c968..21d45755e5f20 100644 --- a/drivers/ras/amd/atl/internal.h +++ b/drivers/ras/amd/atl/internal.h @@ -27,8 +27,12 @@ /* PCI ID for Zen4 Server DF Function 0. 
*/ #define DF_FUNC0_ID_ZEN4_SERVER 0x14AD1022 +/* PCI IDs for MI300 DF Function 0. */ +#define DF_FUNC0_ID_MI300 0x15281022 + /* Shift needed for adjusting register values to true values. */ #define DF_DRAM_BASE_LIMIT_LSB 28 +#define MI300_DRAM_LIMIT_LSB 20 enum df_revisions { UNKNOWN, @@ -59,6 +63,9 @@ enum intlv_modes { DF4_NPS1_12CHAN_HASH = 0x15, DF4_NPS2_5CHAN_HASH = 0x16, DF4_NPS1_10CHAN_HASH = 0x17, + MI3_HASH_8CHAN = 0x18, + MI3_HASH_16CHAN = 0x19, + MI3_HASH_32CHAN = 0x1A, DF2_2CHAN_HASH = 0x21, /* DF4.5 modes are all IntLvNumChan + 0x20 */ DF4p5_NPS1_16CHAN_1K_HASH = 0x2C, @@ -86,7 +93,8 @@ enum intlv_modes { struct df_flags { __u8 legacy_ficaa : 1, socket_id_shift_quirk : 1, - __reserved_0 : 6; + heterogeneous : 1, + __reserved_0 : 5; }; struct df_config { diff --git a/drivers/ras/amd/atl/map.c b/drivers/ras/amd/atl/map.c index 33f549b6255a4..8b908e8d7495f 100644 --- a/drivers/ras/amd/atl/map.c +++ b/drivers/ras/amd/atl/map.c @@ -63,6 +63,10 @@ static int df4p5_get_intlv_mode(struct addr_ctx *ctx) if (ctx->map.intlv_mode <= NOHASH_32CHAN) return 0; + if (ctx->map.intlv_mode >= MI3_HASH_8CHAN && + ctx->map.intlv_mode <= MI3_HASH_32CHAN) + return 0; + /* * Modes matching the ranges above are returned as-is. * @@ -125,6 +129,9 @@ static u64 get_hi_addr_offset(u32 reg_dram_offset) atl_debug_on_bad_df_rev(); } + if (df_cfg.rev == DF4p5 && df_cfg.flags.heterogeneous) + shift = MI300_DRAM_LIMIT_LSB; + return hi_addr_offset << shift; } @@ -369,6 +376,13 @@ static int get_coh_st_fabric_id(struct addr_ctx *ctx) { u32 reg; + /* + * On MI300 systems, the Coherent Station Fabric ID is derived + * later. And it does not depend on the register value. + */ + if (df_cfg.rev == DF4p5 && df_cfg.flags.heterogeneous) + return 0; + /* Read D18F0x50 (FabricBlockInstanceInformation3). */ if (df_indirect_read_instance(ctx->node_id, 0, 0x50, ctx->inst_id, ®)) return -EINVAL; @@ -490,6 +504,7 @@ static u8 get_num_intlv_chan(struct addr_ctx *ctx) case NOHASH_8CHAN: case DF3_COD1_8CHAN_HASH: case DF4_NPS1_8CHAN_HASH: + case MI3_HASH_8CHAN: case DF4p5_NPS1_8CHAN_1K_HASH: case DF4p5_NPS1_8CHAN_2K_HASH: return 8; @@ -502,6 +517,7 @@ static u8 get_num_intlv_chan(struct addr_ctx *ctx) case DF4p5_NPS1_12CHAN_2K_HASH: return 12; case NOHASH_16CHAN: + case MI3_HASH_16CHAN: case DF4p5_NPS1_16CHAN_1K_HASH: case DF4p5_NPS1_16CHAN_2K_HASH: return 16; @@ -509,6 +525,7 @@ static u8 get_num_intlv_chan(struct addr_ctx *ctx) case DF4p5_NPS0_24CHAN_2K_HASH: return 24; case NOHASH_32CHAN: + case MI3_HASH_32CHAN: return 32; default: atl_debug_on_bad_intlv_mode(ctx); diff --git a/drivers/ras/amd/atl/reg_fields.h b/drivers/ras/amd/atl/reg_fields.h index 6aaa5093f42ce..9dcdf6e4a856b 100644 --- a/drivers/ras/amd/atl/reg_fields.h +++ b/drivers/ras/amd/atl/reg_fields.h @@ -246,11 +246,11 @@ #define DF3_HASH_CTL_64K BIT(20) #define DF3_HASH_CTL_2M BIT(21) #define DF3_HASH_CTL_1G BIT(22) -#define DF4_HASH_CTL_4K BIT(7) #define DF4_HASH_CTL_64K BIT(8) #define DF4_HASH_CTL_2M BIT(9) #define DF4_HASH_CTL_1G BIT(10) -#define DF4_HASH_CTL_1T BIT(15) +#define DF4p5_HASH_CTL_4K BIT(7) +#define DF4p5_HASH_CTL_1T BIT(15) /* * High Address Offset @@ -268,10 +268,13 @@ * D18F7x140 [DRAM Offset] * DF4 HiAddrOffset [24:1] * DF4p5 HiAddrOffset [24:1] + * MI300 HiAddrOffset [31:1] */ #define DF2_HI_ADDR_OFFSET GENMASK(31, 20) #define DF3_HI_ADDR_OFFSET GENMASK(31, 12) -#define DF4_HI_ADDR_OFFSET GENMASK(24, 1) + +/* Follow reference code by including reserved bits for simplicity. 
*/ +#define DF4_HI_ADDR_OFFSET GENMASK(31, 1) /* * High Address Offset Enable diff --git a/drivers/ras/amd/atl/system.c b/drivers/ras/amd/atl/system.c index af61f2f1d6dea..46493ed405d6f 100644 --- a/drivers/ras/amd/atl/system.c +++ b/drivers/ras/amd/atl/system.c @@ -124,6 +124,9 @@ static int df4_determine_df_rev(u32 reg) if (reg == DF_FUNC0_ID_ZEN4_SERVER) df_cfg.flags.socket_id_shift_quirk = 1; + if (reg == DF_FUNC0_ID_MI300) + df_cfg.flags.heterogeneous = 1; + return df4_get_fabric_id_mask_registers(); } diff --git a/drivers/ras/amd/atl/umc.c b/drivers/ras/amd/atl/umc.c index 9d51e4954687f..7bfa21a582f0a 100644 --- a/drivers/ras/amd/atl/umc.c +++ b/drivers/ras/amd/atl/umc.c @@ -12,8 +12,56 @@ #include "internal.h" +/* + * MI300 has a fixed, model-specific mapping between a UMC instance and + * its related Data Fabric Coherent Station instance. + * + * The MCA_IPID_UMC[InstanceId] field holds a unique identifier for the + * UMC instance within a Node. Use this to find the appropriate Coherent + * Station ID. + * + * Redundant bits were removed from the map below. + */ +static const u16 umc_coh_st_map[32] = { + 0x393, 0x293, 0x193, 0x093, + 0x392, 0x292, 0x192, 0x092, + 0x391, 0x291, 0x191, 0x091, + 0x390, 0x290, 0x190, 0x090, + 0x793, 0x693, 0x593, 0x493, + 0x792, 0x692, 0x592, 0x492, + 0x791, 0x691, 0x591, 0x491, + 0x790, 0x690, 0x590, 0x490, +}; + +#define UMC_ID_MI300 GENMASK(23, 12) +static u8 get_coh_st_inst_id_mi300(struct atl_err *err) +{ + u16 umc_id = FIELD_GET(UMC_ID_MI300, err->ipid); + u8 i; + + for (i = 0; i < ARRAY_SIZE(umc_coh_st_map); i++) { + if (umc_id == umc_coh_st_map[i]) + break; + } + + WARN_ON_ONCE(i >= ARRAY_SIZE(umc_coh_st_map)); + + return i; +} + +#define MCA_IPID_INST_ID_HI GENMASK_ULL(47, 44) static u8 get_die_id(struct atl_err *err) { + /* + * AMD Node ID is provided in MCA_IPID[InstanceIdHi], and this + * needs to be divided by 4 to get the internal Die ID. + */ + if (df_cfg.rev == DF4p5 && df_cfg.flags.heterogeneous) { + u8 node_id = FIELD_GET(MCA_IPID_INST_ID_HI, err->ipid); + + return node_id >> 2; + } + /* * For CPUs, this is the AMD Node ID modulo the number * of AMD Nodes per socket. @@ -24,6 +72,9 @@ static u8 get_die_id(struct atl_err *err) #define UMC_CHANNEL_NUM GENMASK(31, 20) static u8 get_coh_st_inst_id(struct atl_err *err) { + if (df_cfg.rev == DF4p5 && df_cfg.flags.heterogeneous) + return get_coh_st_inst_id_mi300(err); + return FIELD_GET(UMC_CHANNEL_NUM, err->ipid); } -- GitLab From aafd753555c0ecb9c7ce11ff14429a34c8c0a14b Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 22 Jan 2024 13:42:43 +0100 Subject: [PATCH 0116/1519] genirq/irq_sim: Shrink code by using helpers Use the new __free() mechanism to remove all gotos and simplify the error paths. 
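As a rough sketch of the cleanup pattern being adopted (illustrative only: struct foo and setup_foo() are made-up names, not code from this patch), __free(kfree) from <linux/cleanup.h> ties the allocation's lifetime to the variable's scope, and no_free_ptr() disarms the automatic cleanup to transfer ownership out on success:

#include <linux/cleanup.h>
#include <linux/err.h>
#include <linux/slab.h>

struct foo { int val; };		/* made-up type for the example */

static int setup_foo(struct foo *f)	/* made-up helper that may fail */
{
	f->val = 42;
	return 0;
}

static struct foo *alloc_foo(void)
{
	/* kfree()d automatically on any return path that still owns it */
	struct foo *f __free(kfree) = kzalloc(sizeof(*f), GFP_KERNEL);

	if (!f)
		return ERR_PTR(-ENOMEM);

	if (setup_foo(f))
		return ERR_PTR(-EINVAL);	/* cleanup fires, no goto needed */

	/* success: no_free_ptr() disarms the cleanup and transfers ownership */
	return no_free_ptr(f);
}

This is why the error labels disappear in the diff below: every early return frees whatever it still owns, and only the success path detaches the pointers.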
Signed-off-by: Bartosz Golaszewski Signed-off-by: Ingo Molnar Cc: Thomas Gleixner Cc: Nathan Chancellor Link: https://lore.kernel.org/r/20240122124243.44002-5-brgl@bgdev.pl --- kernel/irq/irq_sim.c | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c index b0d50b48dbd16..38d6ae651ac75 100644 --- a/kernel/irq/irq_sim.c +++ b/kernel/irq/irq_sim.c @@ -4,6 +4,7 @@ * Copyright (C) 2020 Bartosz Golaszewski */ +#include #include #include #include @@ -163,33 +164,27 @@ static const struct irq_domain_ops irq_sim_domain_ops = { struct irq_domain *irq_domain_create_sim(struct fwnode_handle *fwnode, unsigned int num_irqs) { - struct irq_sim_work_ctx *work_ctx; + struct irq_sim_work_ctx *work_ctx __free(kfree) = + kmalloc(sizeof(*work_ctx), GFP_KERNEL); - work_ctx = kmalloc(sizeof(*work_ctx), GFP_KERNEL); if (!work_ctx) - goto err_out; + return ERR_PTR(-ENOMEM); - work_ctx->pending = bitmap_zalloc(num_irqs, GFP_KERNEL); - if (!work_ctx->pending) - goto err_free_work_ctx; + unsigned long *pending __free(bitmap) = bitmap_zalloc(num_irqs, GFP_KERNEL); + if (!pending) + return ERR_PTR(-ENOMEM); work_ctx->domain = irq_domain_create_linear(fwnode, num_irqs, &irq_sim_domain_ops, work_ctx); if (!work_ctx->domain) - goto err_free_bitmap; + return ERR_PTR(-ENOMEM); work_ctx->irq_count = num_irqs; work_ctx->work = IRQ_WORK_INIT_HARD(irq_sim_handle_irq); + work_ctx->pending = no_free_ptr(pending); - return work_ctx->domain; - -err_free_bitmap: - bitmap_free(work_ctx->pending); -err_free_work_ctx: - kfree(work_ctx); -err_out: - return ERR_PTR(-ENOMEM); + return no_free_ptr(work_ctx)->domain; } EXPORT_SYMBOL_GPL(irq_domain_create_sim); -- GitLab From a045a272d887575da17ad86d6573e82871b50c27 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 29 Jan 2024 08:11:24 -1000 Subject: [PATCH 0117/1519] workqueue: Move pwq->max_active to wq->max_active max_active is a workqueue-wide setting and the configured value is stored in wq->saved_max_active; however, the effective value was stored in pwq->max_active. While this is harmless, it makes the max_active update process more complicated and gets in the way of the planned max_active semantic updates for unbound workqueues. This patch moves pwq->max_active to wq->max_active. This simplifies the code and makes freezing and noop max_active updates cheaper too. No user-visible behavior change is intended. As wq->max_active is updated while holding wq mutex but read without any locking, it now uses WRITE/READ_ONCE(). A new locking rule WO is added for it. v2: wq->max_active now uses WRITE/READ_ONCE() as suggested by Lai. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- kernel/workqueue.c | 133 ++++++++++++++++++++++----------------------- 1 file changed, 66 insertions(+), 67 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 45d0a784ba4f5..a23a35dbcd74a 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -147,6 +147,9 @@ enum wq_internal_consts { * * WR: wq->mutex protected for writes. RCU protected for reads. * + * WO: wq->mutex protected for writes. Updated with WRITE_ONCE() and can be read + * with READ_ONCE() without locking. + * * MD: wq_mayday_lock protected. * * WD: Used internally by the watchdog. * @@ -254,7 +257,6 @@ struct pool_workqueue { * is marked with WORK_STRUCT_INACTIVE iff it is in pwq->inactive_works.
*/ int nr_active; /* L: nr of active works */ - int max_active; /* L: max active works */ struct list_head inactive_works; /* L: inactive works */ struct list_head pwqs_node; /* WR: node on wq->pwqs */ struct list_head mayday_node; /* MD: node on wq->maydays */ @@ -302,7 +304,8 @@ struct workqueue_struct { struct worker *rescuer; /* MD: rescue worker */ int nr_drainers; /* WQ: drain in progress */ - int saved_max_active; /* WQ: saved pwq max_active */ + int max_active; /* WO: max active works */ + int saved_max_active; /* WQ: saved max_active */ struct workqueue_attrs *unbound_attrs; /* PW: only for unbound wqs */ struct pool_workqueue *dfl_pwq; /* PW: only for unbound wqs */ @@ -1496,7 +1499,7 @@ static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, unsigned long work_ pwq->nr_active--; if (!list_empty(&pwq->inactive_works)) { /* one down, submit an inactive one */ - if (pwq->nr_active < pwq->max_active) + if (pwq->nr_active < READ_ONCE(pwq->wq->max_active)) pwq_activate_first_inactive(pwq); } } @@ -1797,7 +1800,13 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, pwq->nr_in_flight[pwq->work_color]++; work_flags = work_color_to_flags(pwq->work_color); - if (likely(pwq->nr_active < pwq->max_active)) { + /* + * Limit the number of concurrently active work items to max_active. + * @work must also queue behind existing inactive work items to maintain + * ordering when max_active changes. See wq_adjust_max_active(). + */ + if (list_empty(&pwq->inactive_works) && + pwq->nr_active < READ_ONCE(pwq->wq->max_active)) { if (list_empty(&pool->worklist)) pool->watchdog_ts = jiffies; @@ -4146,50 +4155,6 @@ static void pwq_release_workfn(struct kthread_work *work) } } -/** - * pwq_adjust_max_active - update a pwq's max_active to the current setting - * @pwq: target pool_workqueue - * - * If @pwq isn't freezing, set @pwq->max_active to the associated - * workqueue's saved_max_active and activate inactive work items - * accordingly. If @pwq is freezing, clear @pwq->max_active to zero. - */ -static void pwq_adjust_max_active(struct pool_workqueue *pwq) -{ - struct workqueue_struct *wq = pwq->wq; - bool freezable = wq->flags & WQ_FREEZABLE; - unsigned long flags; - - /* for @wq->saved_max_active */ - lockdep_assert_held(&wq->mutex); - - /* fast exit for non-freezable wqs */ - if (!freezable && pwq->max_active == wq->saved_max_active) - return; - - /* this function can be called during early boot w/ irq disabled */ - raw_spin_lock_irqsave(&pwq->pool->lock, flags); - - /* - * During [un]freezing, the caller is responsible for ensuring that - * this function is called at least once after @workqueue_freezing - * is updated and visible. 
- */ - if (!freezable || !workqueue_freezing) { - pwq->max_active = wq->saved_max_active; - - while (!list_empty(&pwq->inactive_works) && - pwq->nr_active < pwq->max_active) - pwq_activate_first_inactive(pwq); - - kick_pool(pwq->pool); - } else { - pwq->max_active = 0; - } - - raw_spin_unlock_irqrestore(&pwq->pool->lock, flags); -} - /* initialize newly allocated @pwq which is associated with @wq and @pool */ static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq, struct worker_pool *pool) @@ -4222,9 +4187,6 @@ static void link_pwq(struct pool_workqueue *pwq) /* set the matching work_color */ pwq->work_color = wq->work_color; - /* sync max_active to the current setting */ - pwq_adjust_max_active(pwq); - /* link in @pwq */ list_add_rcu(&pwq->pwqs_node, &wq->pwqs); } @@ -4665,6 +4627,52 @@ static int init_rescuer(struct workqueue_struct *wq) return 0; } +/** + * wq_adjust_max_active - update a wq's max_active to the current setting + * @wq: target workqueue + * + * If @wq isn't freezing, set @wq->max_active to the saved_max_active and + * activate inactive work items accordingly. If @wq is freezing, clear + * @wq->max_active to zero. + */ +static void wq_adjust_max_active(struct workqueue_struct *wq) +{ + struct pool_workqueue *pwq; + + lockdep_assert_held(&wq->mutex); + + if ((wq->flags & WQ_FREEZABLE) && workqueue_freezing) { + WRITE_ONCE(wq->max_active, 0); + return; + } + + if (wq->max_active == wq->saved_max_active) + return; + + /* + * Update @wq->max_active and then kick inactive work items if more + * active work items are allowed. This doesn't break work item ordering + * because new work items are always queued behind existing inactive + * work items if there are any. + */ + WRITE_ONCE(wq->max_active, wq->saved_max_active); + + for_each_pwq(pwq, wq) { + unsigned long flags; + + /* this function can be called during early boot w/ irq disabled */ + raw_spin_lock_irqsave(&pwq->pool->lock, flags); + + while (!list_empty(&pwq->inactive_works) && + pwq->nr_active < wq->max_active) + pwq_activate_first_inactive(pwq); + + kick_pool(pwq->pool); + + raw_spin_unlock_irqrestore(&pwq->pool->lock, flags); + } +} + __printf(1, 4) struct workqueue_struct *alloc_workqueue(const char *fmt, unsigned int flags, @@ -4672,7 +4680,6 @@ struct workqueue_struct *alloc_workqueue(const char *fmt, { va_list args; struct workqueue_struct *wq; - struct pool_workqueue *pwq; int len; /* @@ -4711,6 +4718,7 @@ struct workqueue_struct *alloc_workqueue(const char *fmt, /* init wq */ wq->flags = flags; + wq->max_active = max_active; wq->saved_max_active = max_active; mutex_init(&wq->mutex); atomic_set(&wq->nr_pwqs_to_flush, 0); @@ -4739,8 +4747,7 @@ struct workqueue_struct *alloc_workqueue(const char *fmt, mutex_lock(&wq_pool_mutex); mutex_lock(&wq->mutex); - for_each_pwq(pwq, wq) - pwq_adjust_max_active(pwq); + wq_adjust_max_active(wq); mutex_unlock(&wq->mutex); list_add_tail_rcu(&wq->list, &workqueues); @@ -4878,8 +4885,6 @@ EXPORT_SYMBOL_GPL(destroy_workqueue); */ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) { - struct pool_workqueue *pwq; - /* disallow meddling with max_active for ordered workqueues */ if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT)) return; @@ -4890,9 +4895,7 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) wq->flags &= ~__WQ_ORDERED; wq->saved_max_active = max_active; - - for_each_pwq(pwq, wq) - pwq_adjust_max_active(pwq); + wq_adjust_max_active(wq); mutex_unlock(&wq->mutex); } @@ -5139,8 +5142,8 @@ static void 
show_pwq(struct pool_workqueue *pwq) pr_info(" pwq %d:", pool->id); pr_cont_pool_info(pool); - pr_cont(" active=%d/%d refcnt=%d%s\n", - pwq->nr_active, pwq->max_active, pwq->refcnt, + pr_cont(" active=%d refcnt=%d%s\n", + pwq->nr_active, pwq->refcnt, !list_empty(&pwq->mayday_node) ? " MAYDAY" : ""); hash_for_each(pool->busy_hash, bkt, worker, hentry) { @@ -5688,7 +5691,6 @@ EXPORT_SYMBOL_GPL(work_on_cpu_safe_key); void freeze_workqueues_begin(void) { struct workqueue_struct *wq; - struct pool_workqueue *pwq; mutex_lock(&wq_pool_mutex); @@ -5697,8 +5699,7 @@ void freeze_workqueues_begin(void) list_for_each_entry(wq, &workqueues, list) { mutex_lock(&wq->mutex); - for_each_pwq(pwq, wq) - pwq_adjust_max_active(pwq); + wq_adjust_max_active(wq); mutex_unlock(&wq->mutex); } @@ -5763,7 +5764,6 @@ bool freeze_workqueues_busy(void) void thaw_workqueues(void) { struct workqueue_struct *wq; - struct pool_workqueue *pwq; mutex_lock(&wq_pool_mutex); @@ -5775,8 +5775,7 @@ void thaw_workqueues(void) /* restore max_active and repopulate worklist */ list_for_each_entry(wq, &workqueues, list) { mutex_lock(&wq->mutex); - for_each_pwq(pwq, wq) - pwq_adjust_max_active(pwq); + wq_adjust_max_active(wq); mutex_unlock(&wq->mutex); } -- GitLab From afa87ce85379e2d93863fce595afdb5771a84004 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 29 Jan 2024 08:11:24 -1000 Subject: [PATCH 0118/1519] workqueue: Factor out pwq_is_empty() "!pwq->nr_active && list_empty(&pwq->inactive_works)" test is repeated multiple times. Let's factor it out into pwq_is_empty(). Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- kernel/workqueue.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index a23a35dbcd74a..171d0e6d29f6c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1460,6 +1460,11 @@ static void put_pwq_unlocked(struct pool_workqueue *pwq) } } +static bool pwq_is_empty(struct pool_workqueue *pwq) +{ + return !pwq->nr_active && list_empty(&pwq->inactive_works); +} + static void pwq_activate_inactive_work(struct work_struct *work) { struct pool_workqueue *pwq = get_work_pwq(work); @@ -3329,7 +3334,7 @@ void drain_workqueue(struct workqueue_struct *wq) bool drained; raw_spin_lock_irq(&pwq->pool->lock); - drained = !pwq->nr_active && list_empty(&pwq->inactive_works); + drained = pwq_is_empty(pwq); raw_spin_unlock_irq(&pwq->pool->lock); if (drained) @@ -4779,7 +4784,7 @@ static bool pwq_busy(struct pool_workqueue *pwq) if ((pwq != pwq->wq->dfl_pwq) && (pwq->refcnt > 1)) return true; - if (pwq->nr_active || !list_empty(&pwq->inactive_works)) + if (!pwq_is_empty(pwq)) return true; return false; @@ -5217,7 +5222,7 @@ void show_one_workqueue(struct workqueue_struct *wq) unsigned long flags; for_each_pwq(pwq, wq) { - if (pwq->nr_active || !list_empty(&pwq->inactive_works)) { + if (!pwq_is_empty(pwq)) { idle = false; break; } @@ -5229,7 +5234,7 @@ void show_one_workqueue(struct workqueue_struct *wq) for_each_pwq(pwq, wq) { raw_spin_lock_irqsave(&pwq->pool->lock, flags); - if (pwq->nr_active || !list_empty(&pwq->inactive_works)) { + if (!pwq_is_empty(pwq)) { /* * Defer printing to avoid deadlocks in console * drivers that queue work while holding locks -- GitLab From 4c6380305d21e36581b451f7337a36c93b64e050 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 29 Jan 2024 08:11:24 -1000 Subject: [PATCH 0119/1519] workqueue: Replace pwq_activate_inactive_work() with [__]pwq_activate_work() To prepare for unbound nr_active handling improvements, move work 
activation part of pwq_activate_inactive_work() into __pwq_activate_work() and add pwq_activate_work() which tests WORK_STRUCT_INACTIVE and updates nr_active. pwq_activate_first_inactive() and try_to_grab_pending() are updated to use pwq_activate_work(). The latter conversion is functionally identical. For the former, this conversion adds an unnecessary WORK_STRUCT_INACTIVE test. This is temporary and will be removed by the next patch. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- kernel/workqueue.c | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 171d0e6d29f6c..403f3a14166d9 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1465,16 +1465,36 @@ static bool pwq_is_empty(struct pool_workqueue *pwq) return !pwq->nr_active && list_empty(&pwq->inactive_works); } -static void pwq_activate_inactive_work(struct work_struct *work) +static void __pwq_activate_work(struct pool_workqueue *pwq, + struct work_struct *work) { - struct pool_workqueue *pwq = get_work_pwq(work); - trace_workqueue_activate_work(work); if (list_empty(&pwq->pool->worklist)) pwq->pool->watchdog_ts = jiffies; move_linked_works(work, &pwq->pool->worklist, NULL); __clear_bit(WORK_STRUCT_INACTIVE_BIT, work_data_bits(work)); +} + +/** + * pwq_activate_work - Activate a work item if inactive + * @pwq: pool_workqueue @work belongs to + * @work: work item to activate + * + * Returns %true if activated. %false if already active. + */ +static bool pwq_activate_work(struct pool_workqueue *pwq, + struct work_struct *work) +{ + struct worker_pool *pool = pwq->pool; + + lockdep_assert_held(&pool->lock); + + if (!(*work_data_bits(work) & WORK_STRUCT_INACTIVE)) + return false; + pwq->nr_active++; + __pwq_activate_work(pwq, work); + return true; } static void pwq_activate_first_inactive(struct pool_workqueue *pwq) @@ -1482,7 +1502,7 @@ static void pwq_activate_first_inactive(struct pool_workqueue *pwq) struct work_struct *work = list_first_entry(&pwq->inactive_works, struct work_struct, entry); - pwq_activate_inactive_work(work); + pwq_activate_work(pwq, work); } /** @@ -1620,8 +1640,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, * management later on and cause stall. Make sure the work * item is activated before grabbing. */ - if (*work_data_bits(work) & WORK_STRUCT_INACTIVE) - pwq_activate_inactive_work(work); + pwq_activate_work(pwq, work); list_del_init(&work->entry); pwq_dec_nr_in_flight(pwq, *work_data_bits(work)); -- GitLab From 1c270b79ce0b8290f146255ea9057243f6dd3c17 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 29 Jan 2024 08:11:24 -1000 Subject: [PATCH 0120/1519] workqueue: Move nr_active handling into helpers __queue_work(), pwq_dec_nr_in_flight() and wq_adjust_max_active() were open-coding nr_active handling, which is fine given that the operations are trivial. However, the planned unbound nr_active update will make them more complicated, so let's move them into helpers. - pwq_tryinc_nr_active() is added. It increments nr_active if under the max_active limit and returns a boolean indicating whether the increment was successful. Note that the function is structured to accommodate future changes. __queue_work() is updated to use the new helper. - pwq_activate_first_inactive() is updated to use pwq_tryinc_nr_active() and thus no longer assumes that nr_active is under max_active and returns a boolean to indicate whether a work item has been activated.
- wq_adjust_max_active() no longer tests directly whether a work item can be activated. Instead, it's updated to use the return value of pwq_activate_first_inactive() to tell whether a work item has been activated. - nr_active decrement and activating the first inactive work item is factored into pwq_dec_nr_active(). v3: - WARN_ON_ONCE(!WORK_STRUCT_INACTIVE) added to __pwq_activate_work() as now we're calling the function unconditionally from pwq_activate_first_inactive(). v2: - wq->max_active now uses WRITE/READ_ONCE() as suggested by Lai. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- kernel/workqueue.c | 86 ++++++++++++++++++++++++++++++++++++---------- 1 file changed, 67 insertions(+), 19 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 403f3a14166d9..b3d818d46e995 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1468,11 +1468,14 @@ static bool pwq_is_empty(struct pool_workqueue *pwq) static void __pwq_activate_work(struct pool_workqueue *pwq, struct work_struct *work) { + unsigned long *wdb = work_data_bits(work); + + WARN_ON_ONCE(!(*wdb & WORK_STRUCT_INACTIVE)); trace_workqueue_activate_work(work); if (list_empty(&pwq->pool->worklist)) pwq->pool->watchdog_ts = jiffies; move_linked_works(work, &pwq->pool->worklist, NULL); - __clear_bit(WORK_STRUCT_INACTIVE_BIT, work_data_bits(work)); + __clear_bit(WORK_STRUCT_INACTIVE_BIT, wdb); } /** @@ -1497,12 +1500,66 @@ static bool pwq_activate_work(struct pool_workqueue *pwq, return true; } -static void pwq_activate_first_inactive(struct pool_workqueue *pwq) +/** + * pwq_tryinc_nr_active - Try to increment nr_active for a pwq + * @pwq: pool_workqueue of interest + * + * Try to increment nr_active for @pwq. Returns %true if an nr_active count is + * successfully obtained. %false otherwise. + */ +static bool pwq_tryinc_nr_active(struct pool_workqueue *pwq) +{ + struct workqueue_struct *wq = pwq->wq; + struct worker_pool *pool = pwq->pool; + bool obtained; + + lockdep_assert_held(&pool->lock); + + obtained = pwq->nr_active < READ_ONCE(wq->max_active); + + if (obtained) + pwq->nr_active++; + return obtained; +} + +/** + * pwq_activate_first_inactive - Activate the first inactive work item on a pwq + * @pwq: pool_workqueue of interest + * + * Activate the first inactive work item of @pwq if available and allowed by + * max_active limit. + * + * Returns %true if an inactive work item has been activated. %false if no + * inactive work item is found or max_active limit is reached. + */ +static bool pwq_activate_first_inactive(struct pool_workqueue *pwq) +{ + struct work_struct *work = + list_first_entry_or_null(&pwq->inactive_works, + struct work_struct, entry); + + if (work && pwq_tryinc_nr_active(pwq)) { + __pwq_activate_work(pwq, work); + return true; + } else { + return false; + } +} + +/** + * pwq_dec_nr_active - Retire an active count + * @pwq: pool_workqueue of interest + * + * Decrement @pwq's nr_active and try to activate the first inactive work item. 
+ */ +static void pwq_dec_nr_active(struct pool_workqueue *pwq) { - struct work_struct *work = list_first_entry(&pwq->inactive_works, - struct work_struct, entry); + struct worker_pool *pool = pwq->pool; - pwq_activate_work(pwq, work); + lockdep_assert_held(&pool->lock); + + pwq->nr_active--; + pwq_activate_first_inactive(pwq); } /** @@ -1520,14 +1577,8 @@ static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, unsigned long work_ { int color = get_work_color(work_data); - if (!(work_data & WORK_STRUCT_INACTIVE)) { - pwq->nr_active--; - if (!list_empty(&pwq->inactive_works)) { - /* one down, submit an inactive one */ - if (pwq->nr_active < READ_ONCE(pwq->wq->max_active)) - pwq_activate_first_inactive(pwq); - } - } + if (!(work_data & WORK_STRUCT_INACTIVE)) + pwq_dec_nr_active(pwq); pwq->nr_in_flight[color]--; @@ -1829,13 +1880,11 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, * @work must also queue behind existing inactive work items to maintain * ordering when max_active changes. See wq_adjust_max_active(). */ - if (list_empty(&pwq->inactive_works) && - pwq->nr_active < READ_ONCE(pwq->wq->max_active)) { + if (list_empty(&pwq->inactive_works) && pwq_tryinc_nr_active(pwq)) { if (list_empty(&pool->worklist)) pool->watchdog_ts = jiffies; trace_workqueue_activate_work(work); - pwq->nr_active++; insert_work(pwq, work, &pool->worklist, work_flags); kick_pool(pool); } else { @@ -4687,9 +4736,8 @@ static void wq_adjust_max_active(struct workqueue_struct *wq) /* this function can be called during early boot w/ irq disabled */ raw_spin_lock_irqsave(&pwq->pool->lock, flags); - while (!list_empty(&pwq->inactive_works) && - pwq->nr_active < wq->max_active) - pwq_activate_first_inactive(pwq); + while (pwq_activate_first_inactive(pwq)) + ; kick_pool(pwq->pool); -- GitLab From c5404d4e6df6faba1007544b5f4e62c7c14416dd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 29 Jan 2024 08:11:24 -1000 Subject: [PATCH 0121/1519] workqueue: Make wq_adjust_max_active() round-robin pwqs while activating wq_adjust_max_active() needs to activate work items after max_active is increased. Previously, it did that by visiting each pwq once activating all that could be activated. While this makes sense with per-pwq nr_active, nr_active will be shared across multiple pwqs for unbound wqs. Then, we'd want to round-robin through pwqs to be fairer. In preparation, this patch makes wq_adjust_max_active() round-robin pwqs while activating. While the activation ordering changes, this shouldn't cause user-noticeable behavior changes. 
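As a concrete illustration of the fairness that round-robin buys (an editorial example, not part of the patch): once nr_active is shared across pwqs later in this series, an activation budget of four split over two pwqs that each hold three inactive work items would come out 3+1 if each pwq were drained in turn, but 2+2 with the round-robin pass, spreading the available concurrency evenly across the pwqs.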
Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- kernel/workqueue.c | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index b3d818d46e995..489f70846ac9d 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4710,7 +4710,7 @@ static int init_rescuer(struct workqueue_struct *wq) */ static void wq_adjust_max_active(struct workqueue_struct *wq) { - struct pool_workqueue *pwq; + bool activated; lockdep_assert_held(&wq->mutex); @@ -4730,19 +4730,26 @@ static void wq_adjust_max_active(struct workqueue_struct *wq) */ WRITE_ONCE(wq->max_active, wq->saved_max_active); - for_each_pwq(pwq, wq) { - unsigned long flags; - - /* this function can be called during early boot w/ irq disabled */ - raw_spin_lock_irqsave(&pwq->pool->lock, flags); - - while (pwq_activate_first_inactive(pwq)) - ; + /* + * Round-robin through pwq's activating the first inactive work item + * until max_active is filled. + */ + do { + struct pool_workqueue *pwq; - kick_pool(pwq->pool); + activated = false; + for_each_pwq(pwq, wq) { + unsigned long flags; - raw_spin_unlock_irqrestore(&pwq->pool->lock, flags); - } + /* can be called during early boot w/ irq disabled */ + raw_spin_lock_irqsave(&pwq->pool->lock, flags); + if (pwq_activate_first_inactive(pwq)) { + activated = true; + kick_pool(pwq->pool); + } + raw_spin_unlock_irqrestore(&pwq->pool->lock, flags); + } + } while (activated); } __printf(1, 4) -- GitLab From 9f66cff212bb3c1cd25996aaa0dfd0c9e9d8baab Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 29 Jan 2024 08:11:24 -1000 Subject: [PATCH 0122/1519] workqueue: RCU protect wq->dfl_pwq and implement accessors for it wq->cpu_pwq is RCU protected but wq->dfl_pwq isn't. This is okay because currently wq->dfl_pwq is only accessed to install it into wq->cpu_pwq, which doesn't require RCU access. However, we want to be able to access wq->dfl_pwq under RCU in the future to access its __pod_cpumask, and the code can be made easier to read by making the two pwq fields behave in the same way. - Make wq->dfl_pwq RCU protected. - Add unbound_pwq_slot() and unbound_pwq() which can access both ->dfl_pwq and ->cpu_pwq. The former returns the double pointer that can be used to access and update the pwqs. The latter performs a locking check and dereferences the double pointer. - pwq accesses and updates are converted to use unbound_pwq[_slot]().
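The accessor pair follows the usual RCU double-pointer pattern. A rough usage sketch built on the helpers this patch adds (an editorial illustration, not code from the patch; wq, cpu and the pool_workqueue pointers are assumed to be set up by the caller):

	/* read side: under RCU or one of the mutexes that unbound_pwq() checks */
	rcu_read_lock();
	pwq = unbound_pwq(wq, cpu);	/* cpu >= 0: per-CPU slot, cpu < 0: ->dfl_pwq */
	/* ... use pwq ... */
	rcu_read_unlock();

	/* update side: with wq_pool_mutex held, publish through the slot */
	old_pwq = rcu_access_pointer(*unbound_pwq_slot(wq, cpu));
	rcu_assign_pointer(*unbound_pwq_slot(wq, cpu), new_pwq);

This is exactly the shape of install_unbound_pwq() in the diff below.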
Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- kernel/workqueue.c | 64 +++++++++++++++++++++++++++++----------------- 1 file changed, 40 insertions(+), 24 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 489f70846ac9d..1579b8c9a5791 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -308,7 +308,7 @@ struct workqueue_struct { int saved_max_active; /* WQ: saved max_active */ struct workqueue_attrs *unbound_attrs; /* PW: only for unbound wqs */ - struct pool_workqueue *dfl_pwq; /* PW: only for unbound wqs */ + struct pool_workqueue __rcu *dfl_pwq; /* PW: only for unbound wqs */ #ifdef CONFIG_SYSFS struct wq_device *wq_dev; /* I: for sysfs interface */ @@ -639,6 +639,23 @@ static int worker_pool_assign_id(struct worker_pool *pool) return ret; } +static struct pool_workqueue __rcu ** +unbound_pwq_slot(struct workqueue_struct *wq, int cpu) +{ + if (cpu >= 0) + return per_cpu_ptr(wq->cpu_pwq, cpu); + else + return &wq->dfl_pwq; +} + +/* @cpu < 0 for dfl_pwq */ +static struct pool_workqueue *unbound_pwq(struct workqueue_struct *wq, int cpu) +{ + return rcu_dereference_check(*unbound_pwq_slot(wq, cpu), + lockdep_is_held(&wq_pool_mutex) || + lockdep_is_held(&wq->mutex)); +} + static unsigned int work_color_to_flags(int color) { return color << WORK_STRUCT_COLOR_SHIFT; @@ -4328,10 +4345,11 @@ static void wq_calc_pod_cpumask(struct workqueue_attrs *attrs, int cpu, "possible intersect\n"); } -/* install @pwq into @wq's cpu_pwq and return the old pwq */ +/* install @pwq into @wq and return the old pwq, @cpu < 0 for dfl_pwq */ static struct pool_workqueue *install_unbound_pwq(struct workqueue_struct *wq, int cpu, struct pool_workqueue *pwq) { + struct pool_workqueue __rcu **slot = unbound_pwq_slot(wq, cpu); struct pool_workqueue *old_pwq; lockdep_assert_held(&wq_pool_mutex); @@ -4340,8 +4358,8 @@ static struct pool_workqueue *install_unbound_pwq(struct workqueue_struct *wq, /* link_pwq() can handle duplicate calls */ link_pwq(pwq); - old_pwq = rcu_access_pointer(*per_cpu_ptr(wq->cpu_pwq, cpu)); - rcu_assign_pointer(*per_cpu_ptr(wq->cpu_pwq, cpu), pwq); + old_pwq = rcu_access_pointer(*slot); + rcu_assign_pointer(*slot, pwq); return old_pwq; } @@ -4441,14 +4459,11 @@ static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx) copy_workqueue_attrs(ctx->wq->unbound_attrs, ctx->attrs); - /* save the previous pwq and install the new one */ + /* save the previous pwqs and install the new ones */ for_each_possible_cpu(cpu) ctx->pwq_tbl[cpu] = install_unbound_pwq(ctx->wq, cpu, ctx->pwq_tbl[cpu]); - - /* @dfl_pwq might not have been used, ensure it's linked */ - link_pwq(ctx->dfl_pwq); - swap(ctx->wq->dfl_pwq, ctx->dfl_pwq); + ctx->dfl_pwq = install_unbound_pwq(ctx->wq, -1, ctx->dfl_pwq); mutex_unlock(&ctx->wq->mutex); } @@ -4558,9 +4573,7 @@ static void wq_update_pod(struct workqueue_struct *wq, int cpu, /* nothing to do if the target cpumask matches the current pwq */ wq_calc_pod_cpumask(target_attrs, cpu, off_cpu); - pwq = rcu_dereference_protected(*per_cpu_ptr(wq->cpu_pwq, cpu), - lockdep_is_held(&wq_pool_mutex)); - if (wqattrs_equal(target_attrs, pwq->pool->attrs)) + if (wqattrs_equal(target_attrs, unbound_pwq(wq, cpu)->pool->attrs)) return; /* create a new pwq */ @@ -4578,10 +4591,11 @@ static void wq_update_pod(struct workqueue_struct *wq, int cpu, use_dfl_pwq: mutex_lock(&wq->mutex); - raw_spin_lock_irq(&wq->dfl_pwq->pool->lock); - get_pwq(wq->dfl_pwq); - raw_spin_unlock_irq(&wq->dfl_pwq->pool->lock); - old_pwq = install_unbound_pwq(wq, cpu, wq->dfl_pwq); + pwq 
= unbound_pwq(wq, -1); + raw_spin_lock_irq(&pwq->pool->lock); + get_pwq(pwq); + raw_spin_unlock_irq(&pwq->pool->lock); + old_pwq = install_unbound_pwq(wq, cpu, pwq); out_unlock: mutex_unlock(&wq->mutex); put_pwq_unlocked(old_pwq); @@ -4619,10 +4633,13 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) cpus_read_lock(); if (wq->flags & __WQ_ORDERED) { + struct pool_workqueue *dfl_pwq; + ret = apply_workqueue_attrs(wq, ordered_wq_attrs[highpri]); /* there should only be single pwq for ordering guarantee */ - WARN(!ret && (wq->pwqs.next != &wq->dfl_pwq->pwqs_node || - wq->pwqs.prev != &wq->dfl_pwq->pwqs_node), + dfl_pwq = rcu_access_pointer(wq->dfl_pwq); + WARN(!ret && (wq->pwqs.next != &dfl_pwq->pwqs_node || + wq->pwqs.prev != &dfl_pwq->pwqs_node), "ordering guarantee broken for workqueue %s\n", wq->name); } else { ret = apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]); @@ -4856,7 +4873,7 @@ static bool pwq_busy(struct pool_workqueue *pwq) if (pwq->nr_in_flight[i]) return true; - if ((pwq != pwq->wq->dfl_pwq) && (pwq->refcnt > 1)) + if ((pwq != rcu_access_pointer(pwq->wq->dfl_pwq)) && (pwq->refcnt > 1)) return true; if (!pwq_is_empty(pwq)) return true; @@ -4940,13 +4957,12 @@ void destroy_workqueue(struct workqueue_struct *wq) rcu_read_lock(); for_each_possible_cpu(cpu) { - pwq = rcu_access_pointer(*per_cpu_ptr(wq->cpu_pwq, cpu)); - RCU_INIT_POINTER(*per_cpu_ptr(wq->cpu_pwq, cpu), NULL); - put_pwq_unlocked(pwq); + put_pwq_unlocked(unbound_pwq(wq, cpu)); + RCU_INIT_POINTER(*unbound_pwq_slot(wq, cpu), NULL); } - put_pwq_unlocked(wq->dfl_pwq); - wq->dfl_pwq = NULL; + put_pwq_unlocked(unbound_pwq(wq, -1)); + RCU_INIT_POINTER(*unbound_pwq_slot(wq, -1), NULL); rcu_read_unlock(); } -- GitLab From dd6c3c5441263723305a9c52c5ccc899a4653000 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 29 Jan 2024 08:11:24 -1000 Subject: [PATCH 0123/1519] workqueue: Move pwq_dec_nr_in_flight() to the end of work item handling The planned shared nr_active handling for unbound workqueues will make pwq_dec_nr_active() sometimes drop the pool lock temporarily to acquire other pool locks, which is necessary as retirement of an nr_active count from one pool may need to kick off an inactive work item in another pool. This patch moves the pwq_dec_nr_in_flight() call in try_to_grab_pending() to the end of work item handling so that work item state changes stay atomic. process_one_work() which is the other user of pwq_dec_nr_in_flight() already calls it at the end of work item handling. Comments are added to both call sites and pwq_dec_nr_in_flight(). This shouldn't cause any behavior changes. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- kernel/workqueue.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 1579b8c9a5791..b5aba0e5a6990 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1587,6 +1587,11 @@ static void pwq_dec_nr_active(struct pool_workqueue *pwq) * A work either has completed or is removed from pending queue, * decrement nr_in_flight of its pwq and handle workqueue flushing. * + * NOTE: + * For unbound workqueues, this function may temporarily drop @pwq->pool->lock + * and thus should be called after all other state updates for the in-flight + * work item is complete. + * * CONTEXT: * raw_spin_lock_irq(pool->lock).
*/ @@ -1711,11 +1716,13 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, pwq_activate_work(pwq, work); list_del_init(&work->entry); - pwq_dec_nr_in_flight(pwq, *work_data_bits(work)); /* work->data points to pwq iff queued, point to pool */ set_work_pool_and_keep_pending(work, pool->id); + /* must be the last step, see the function comment */ + pwq_dec_nr_in_flight(pwq, *work_data_bits(work)); + raw_spin_unlock(&pool->lock); rcu_read_unlock(); return 1; @@ -2780,6 +2787,8 @@ __acquires(&pool->lock) worker->current_func = NULL; worker->current_pwq = NULL; worker->current_color = INT_MAX; + + /* must be the last step, see the function comment */ pwq_dec_nr_in_flight(pwq, work_data); } -- GitLab From 91ccc6e7233bb10a9c176aa4cc70d6f432a441a5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 29 Jan 2024 08:11:24 -1000 Subject: [PATCH 0124/1519] workqueue: Introduce struct wq_node_nr_active Currently, for both percpu and unbound workqueues, max_active applies per-cpu, which is a recent change for unbound workqueues. The change for unbound workqueues was a significant departure from the previous behavior of per-node application. It made some use cases create an undesirable number of concurrent work items and left no good way of fixing them. To address the problem, workqueue is implementing a NUMA node segmented global nr_active mechanism, which will be explained further in the next patch. As a preparation, this patch introduces struct wq_node_nr_active. It's a data structure allocated for each workqueue and NUMA node pair and currently only tracks the workqueue's number of active work items on the node. This is split out from the next patch to make it easier to understand and review. Note that there is an extra wq_node_nr_active allocated for the invalid node nr_node_ids which is used to track nr_active for pools which don't have a NUMA node associated, such as the default fallback system-wide pool. This doesn't cause any behavior changes visible to userland yet. The next patch will expand on this to implement the control mechanism on top. v4: - Fixed out-of-bound access when freeing per-cpu workqueues. v3: - Use flexible array for wq->node_nr_active as suggested by Lai. v2: - wq->max_active now uses WRITE/READ_ONCE() as suggested by Lai. - Lai pointed out that pwq_tryinc_nr_active() incorrectly dropped pwq->max_active check. Restored. As the next patch replaces the max_active enforcement mechanism, this doesn't change the end result. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- kernel/workqueue.c | 142 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 135 insertions(+), 7 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index b5aba0e5a6990..8d465478adb92 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -284,6 +284,16 @@ struct wq_flusher { struct wq_device; +/* + * Unlike in a per-cpu workqueue where max_active limits its concurrency level + * on each CPU, in an unbound workqueue, max_active applies to the whole system. + * As sharing a single nr_active across multiple sockets can be very expensive, + * the counting and enforcement is per NUMA node. + */ +struct wq_node_nr_active { + atomic_t nr; /* per-node nr_active count */ +}; + /* * The externally visible workqueue. It relays the issued work items to * the appropriate worker_pool through its pool_workqueues.
@@ -330,6 +340,7 @@ struct workqueue_struct { /* hot fields used during command issue, aligned to cacheline */ unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */ struct pool_workqueue __percpu __rcu **cpu_pwq; /* I: per-cpu pwqs */ + struct wq_node_nr_active *node_nr_active[]; /* I: per-node nr_active */ }; static struct kmem_cache *pwq_cache; @@ -1425,6 +1436,31 @@ work_func_t wq_worker_last_func(struct task_struct *task) return worker->last_func; } +/** + * wq_node_nr_active - Determine wq_node_nr_active to use + * @wq: workqueue of interest + * @node: NUMA node, can be %NUMA_NO_NODE + * + * Determine wq_node_nr_active to use for @wq on @node. Returns: + * + * - %NULL for per-cpu workqueues as they don't need to use shared nr_active. + * + * - node_nr_active[nr_node_ids] if @node is %NUMA_NO_NODE. + * + * - Otherwise, node_nr_active[@node]. + */ +static struct wq_node_nr_active *wq_node_nr_active(struct workqueue_struct *wq, + int node) +{ + if (!(wq->flags & WQ_UNBOUND)) + return NULL; + + if (node == NUMA_NO_NODE) + node = nr_node_ids; + + return wq->node_nr_active[node]; +} + /** * get_pwq - get an extra reference on the specified pool_workqueue * @pwq: pool_workqueue to get @@ -1506,12 +1542,17 @@ static bool pwq_activate_work(struct pool_workqueue *pwq, struct work_struct *work) { struct worker_pool *pool = pwq->pool; + struct wq_node_nr_active *nna; lockdep_assert_held(&pool->lock); if (!(*work_data_bits(work) & WORK_STRUCT_INACTIVE)) return false; + nna = wq_node_nr_active(pwq->wq, pool->node); + if (nna) + atomic_inc(&nna->nr); + pwq->nr_active++; __pwq_activate_work(pwq, work); return true; @@ -1528,14 +1569,18 @@ static bool pwq_tryinc_nr_active(struct pool_workqueue *pwq) { struct workqueue_struct *wq = pwq->wq; struct worker_pool *pool = pwq->pool; + struct wq_node_nr_active *nna = wq_node_nr_active(wq, pool->node); bool obtained; lockdep_assert_held(&pool->lock); obtained = pwq->nr_active < READ_ONCE(wq->max_active); - if (obtained) + if (obtained) { pwq->nr_active++; + if (nna) + atomic_inc(&nna->nr); + } return obtained; } @@ -1572,10 +1617,26 @@ static bool pwq_activate_first_inactive(struct pool_workqueue *pwq) static void pwq_dec_nr_active(struct pool_workqueue *pwq) { struct worker_pool *pool = pwq->pool; + struct wq_node_nr_active *nna = wq_node_nr_active(pwq->wq, pool->node); lockdep_assert_held(&pool->lock); + /* + * @pwq->nr_active should be decremented for both percpu and unbound + * workqueues. + */ pwq->nr_active--; + + /* + * For a percpu workqueue, it's simple. Just need to kick the first + * inactive work item on @pwq itself. + */ + if (!nna) { + pwq_activate_first_inactive(pwq); + return; + } + + atomic_dec(&nna->nr); pwq_activate_first_inactive(pwq); } @@ -4039,11 +4100,63 @@ static void wq_free_lockdep(struct workqueue_struct *wq) } #endif +static void free_node_nr_active(struct wq_node_nr_active **nna_ar) +{ + int node; + + for_each_node(node) { + kfree(nna_ar[node]); + nna_ar[node] = NULL; + } + + kfree(nna_ar[nr_node_ids]); + nna_ar[nr_node_ids] = NULL; +} + +static void init_node_nr_active(struct wq_node_nr_active *nna) +{ + atomic_set(&nna->nr, 0); +} + +/* + * Each node's nr_active counter will be accessed mostly from its own node and + * should be allocated in the node. 
+ */ +static int alloc_node_nr_active(struct wq_node_nr_active **nna_ar) +{ + struct wq_node_nr_active *nna; + int node; + + for_each_node(node) { + nna = kzalloc_node(sizeof(*nna), GFP_KERNEL, node); + if (!nna) + goto err_free; + init_node_nr_active(nna); + nna_ar[node] = nna; + } + + /* [nr_node_ids] is used as the fallback */ + nna = kzalloc_node(sizeof(*nna), GFP_KERNEL, NUMA_NO_NODE); + if (!nna) + goto err_free; + init_node_nr_active(nna); + nna_ar[nr_node_ids] = nna; + + return 0; + +err_free: + free_node_nr_active(nna_ar); + return -ENOMEM; +} + static void rcu_free_wq(struct rcu_head *rcu) { struct workqueue_struct *wq = container_of(rcu, struct workqueue_struct, rcu); + if (wq->flags & WQ_UNBOUND) + free_node_nr_active(wq->node_nr_active); + wq_free_lockdep(wq); free_percpu(wq->cpu_pwq); free_workqueue_attrs(wq->unbound_attrs); @@ -4785,7 +4898,8 @@ struct workqueue_struct *alloc_workqueue(const char *fmt, { va_list args; struct workqueue_struct *wq; - int len; + size_t wq_size; + int name_len; /* * Unbound && max_active == 1 used to imply ordered, which is no longer @@ -4801,7 +4915,12 @@ struct workqueue_struct *alloc_workqueue(const char *fmt, flags |= WQ_UNBOUND; /* allocate wq and format name */ - wq = kzalloc(sizeof(*wq), GFP_KERNEL); + if (flags & WQ_UNBOUND) + wq_size = struct_size(wq, node_nr_active, nr_node_ids + 1); + else + wq_size = sizeof(*wq); + + wq = kzalloc(wq_size, GFP_KERNEL); if (!wq) return NULL; @@ -4812,11 +4931,12 @@ struct workqueue_struct *alloc_workqueue(const char *fmt, } va_start(args, max_active); - len = vsnprintf(wq->name, sizeof(wq->name), fmt, args); + name_len = vsnprintf(wq->name, sizeof(wq->name), fmt, args); va_end(args); - if (len >= WQ_NAME_LEN) - pr_warn_once("workqueue: name exceeds WQ_NAME_LEN. Truncating to: %s\n", wq->name); + if (name_len >= WQ_NAME_LEN) + pr_warn_once("workqueue: name exceeds WQ_NAME_LEN. Truncating to: %s\n", + wq->name); max_active = max_active ?: WQ_DFL_ACTIVE; max_active = wq_clamp_max_active(max_active, flags, wq->name); @@ -4835,8 +4955,13 @@ struct workqueue_struct *alloc_workqueue(const char *fmt, wq_init_lockdep(wq); INIT_LIST_HEAD(&wq->list); + if (flags & WQ_UNBOUND) { + if (alloc_node_nr_active(wq->node_nr_active) < 0) + goto err_unreg_lockdep; + } + if (alloc_and_link_pwqs(wq) < 0) - goto err_unreg_lockdep; + goto err_free_node_nr_active; if (wq_online && init_rescuer(wq) < 0) goto err_destroy; @@ -4861,6 +4986,9 @@ struct workqueue_struct *alloc_workqueue(const char *fmt, return wq; +err_free_node_nr_active: + if (wq->flags & WQ_UNBOUND) + free_node_nr_active(wq->node_nr_active); err_unreg_lockdep: wq_unregister_lockdep(wq); wq_free_lockdep(wq); -- GitLab From 5797b1c18919cd9c289ded7954383e499f729ce0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 29 Jan 2024 08:11:25 -1000 Subject: [PATCH 0125/1519] workqueue: Implement system-wide nr_active enforcement for unbound workqueues A pool_workqueue (pwq) represents the connection between a workqueue and a worker_pool. One of the roles that a pwq plays is enforcement of the max_active concurrency limit. Before 636b927eba5b ("workqueue: Make unbound workqueues to use per-cpu pool_workqueues"), there was one pwq per each CPU for per-cpu workqueues and per each NUMA node for unbound workqueues, which was a natural result of per-cpu workqueues being served by per-cpu pools and unbound by per-NUMA pools. In terms of max_active enforcement, this was, while not perfect, workable. For per-cpu workqueues, it was fine. 
For unbound, it wasn't great in that NUMA machines would get max_active multiplied by the number of nodes, but this didn't cause huge problems because NUMA machines are relatively rare and the node count is usually pretty low. However, cache layouts are more complex now and sharing a worker pool across a whole node didn't really work well for unbound workqueues. Thus, a series of commits culminating in 8639ecebc9b1 ("workqueue: Implement non-strict affinity scope for unbound workqueues") implemented a more flexible affinity mechanism for unbound workqueues which enables using e.g. last-level-cache aligned pools. In the process, 636b927eba5b ("workqueue: Make unbound workqueues to use per-cpu pool_workqueues") made unbound workqueues use per-cpu pwqs like per-cpu workqueues. While the change was necessary to enable more flexible affinity scopes, this came with the side effect of blowing up the effective max_active for unbound workqueues. Before, the effective max_active for unbound workqueues was multiplied by the number of nodes. After, by the number of CPUs. 636b927eba5b ("workqueue: Make unbound workqueues to use per-cpu pool_workqueues") claims that this should generally be okay. It is okay for users that self-regulate their concurrency level, which are the vast majority; however, there are enough use cases which actually depend on max_active to prevent the level of concurrency from going bonkers, including several IO handling workqueues that can issue a work item for each in-flight IO. With targeted benchmarks, the misbehavior can easily be exposed as reported in http://lkml.kernel.org/r/dbu6wiwu3sdhmhikb2w6lns7b27gbobfavhjj57kwi2quafgwl@htjcc5oikcr3. Unfortunately, there is no way to express what these use cases need using per-cpu max_active. A CPU may issue most of the in-flight IOs, so we don't want to set max_active too low, but as soon as we increase max_active a bit, we can end up with an unreasonable number of in-flight work items when many CPUs issue IOs at the same time. i.e. the lowest acceptable max_active is higher than the highest acceptable max_active. Ideally, max_active for an unbound workqueue should be system-wide so that the users can regulate the total level of concurrency regardless of node and cache layout. The reasons workqueue hasn't implemented that yet are: - Once max_active enforcement decouples from pool boundaries, chaining execution after a work item finishes requires inter-pool operations which would require lock dancing, which is nasty. - Sharing a single nr_active count across the whole system can be pretty expensive on NUMA machines. - Per-pwq enforcement had been more or less okay while we were using per-node pools. It looks like we can no longer avoid decoupling max_active enforcement from pool boundaries. This patch implements a system-wide nr_active mechanism with the following design characteristics: - To avoid sharing a single counter across multiple nodes, the configured max_active is split across nodes according to the proportion of each workqueue's online effective CPUs per node. e.g. A node with twice as many online effective CPUs will get twice the portion of max_active. - Workqueue used to be able to process a chain of interdependent work items which is as long as max_active. We can't do this anymore as max_active is distributed across the nodes. Instead, a new parameter min_active is introduced which determines the minimum level of concurrency within a node regardless of how max_active distribution comes out to be.
It is set to the smaller of max_active and WQ_DFL_MIN_ACTIVE which is 8. This can lead to a higher effective max_active than configured and also deadlocks if a workqueue was depending on being able to handle chains of interdependent work items that are longer than 8. I believe these should be fine given that the number of CPUs in each NUMA node is usually higher than 8 and a work item chain longer than 8 is pretty unlikely. However, if these assumptions turn out to be wrong, we'll need to add an interface to adjust min_active. - Each unbound wq has an array of struct wq_node_nr_active which tracks per-node nr_active. When its pwq wants to run a work item, it has to obtain the matching node's nr_active. If over the node's max_active, the pwq is queued on wq_node_nr_active->pending_pwqs. As work items finish, the completion path round-robins the pending pwqs, activating the first inactive work item of each, which involves some pool lock dancing and kicking other pools. It's not the simplest code but doesn't look too bad. v4: - wq_adjust_max_active() updated to invoke wq_update_node_max_active(). - wq_adjust_max_active() is now protected by wq->mutex instead of wq_pool_mutex. v3: - wq_node_max_active() used to calculate per-node max_active on the fly based on system-wide CPU online states. Lai pointed out that this can lead to skewed distributions for workqueues with restricted cpumasks. Update the max_active distribution to use per-workqueue effective online CPU counts instead of system-wide and cache the calculation results in node_nr_active->max. v2: - wq->min/max_active now uses WRITE/READ_ONCE() as suggested by Lai. Signed-off-by: Tejun Heo Reported-by: Naohiro Aota Link: http://lkml.kernel.org/r/dbu6wiwu3sdhmhikb2w6lns7b27gbobfavhjj57kwi2quafgwl@htjcc5oikcr3 Fixes: 636b927eba5b ("workqueue: Make unbound workqueues to use per-cpu pool_workqueues") Reviewed-by: Lai Jiangshan --- include/linux/workqueue.h | 35 +++++++++- kernel/workqueue.c | 341 ++++++++++++++++++++++++++++++++++---- 2 files changed, 341 insertions(+), 35 deletions(-) diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 78047d0d98820..232baea90a1d6 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -398,6 +398,13 @@ enum wq_consts { WQ_MAX_ACTIVE = 512, /* I like 512, better ideas? */ WQ_UNBOUND_MAX_ACTIVE = WQ_MAX_ACTIVE, WQ_DFL_ACTIVE = WQ_MAX_ACTIVE / 2, + + /* + * Per-node default cap on min_active. Unless explicitly set, min_active + * is set to min(max_active, WQ_DFL_MIN_ACTIVE). For more details, see + * workqueue_struct->min_active definition. + */ + WQ_DFL_MIN_ACTIVE = 8, }; /* @@ -440,11 +447,33 @@ extern struct workqueue_struct *system_freezable_power_efficient_wq; /** * alloc_workqueue - allocate a workqueue * @fmt: printf format for the name of the workqueue * @flags: WQ_* flags - * @max_active: max in-flight work items per CPU, 0 for default + * @max_active: max in-flight work items, 0 for default * remaining args: args for @fmt * - * Allocate a workqueue with the specified parameters. For detailed - * information on WQ_* flags, please refer to + * For a per-cpu workqueue, @max_active limits the number of in-flight work + * items for each CPU. e.g. @max_active of 1 indicates that each CPU can be + * executing at most one work item for the workqueue. + * + * For unbound workqueues, @max_active limits the number of in-flight work items + * for the whole system. e.g.
@max_active of 16 indicates that there can be + * at most 16 work items executing for the workqueue in the whole system. + * + * As sharing the same active counter for an unbound workqueue across multiple + * NUMA nodes can be expensive, @max_active is distributed to each NUMA node + * according to the proportion of the number of online CPUs and enforced + * independently. + * + * Depending on online CPU distribution, a node may end up with per-node + * max_active which is significantly lower than @max_active, which can lead to + * deadlocks if the per-node concurrency limit is lower than the maximum number + * of interdependent work items for the workqueue. + * + * To guarantee forward progress regardless of online CPU distribution, the + * concurrency limit on every node is guaranteed to be equal to or greater than + * min_active which is set to min(@max_active, %WQ_DFL_MIN_ACTIVE). This means + * that the sum of per-node max_active's may be larger than @max_active. + * + * For detailed information on %WQ_* flags, please refer to * Documentation/core-api/workqueue.rst. * * RETURNS: diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 8d465478adb92..903be39bd2d19 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -126,6 +126,9 @@ enum wq_internal_consts { * * L: pool->lock protected. Access with pool->lock held. * + * LN: pool->lock and wq_node_nr_active->lock protected for writes. Either for + * reads. + * * K: Only modified by worker while holding pool->lock. Can be safely read by * self, while holding pool->lock or from IRQ context if %current is the * kworker. @@ -247,17 +250,18 @@ struct pool_workqueue { * pwq->inactive_works instead of pool->worklist and marked with * WORK_STRUCT_INACTIVE. * - * All work items marked with WORK_STRUCT_INACTIVE do not participate - * in pwq->nr_active and all work items in pwq->inactive_works are - * marked with WORK_STRUCT_INACTIVE. But not all WORK_STRUCT_INACTIVE - * work items are in pwq->inactive_works. Some of them are ready to - * run in pool->worklist or worker->scheduled. Those work items are - * only struct wq_barrier which is used for flush_work() and should - * not participate in pwq->nr_active. For non-barrier work item, it - * is marked with WORK_STRUCT_INACTIVE iff it is in pwq->inactive_works. + * All work items marked with WORK_STRUCT_INACTIVE do not participate in + * nr_active and all work items in pwq->inactive_works are marked with + * WORK_STRUCT_INACTIVE. But not all WORK_STRUCT_INACTIVE work items are + * in pwq->inactive_works. Some of them are ready to run in + * pool->worklist or worker->scheduled. Those work items are only struct + * wq_barrier which is used for flush_work() and should not participate + * in nr_active. For non-barrier work item, it is marked with + * WORK_STRUCT_INACTIVE iff it is in pwq->inactive_works. */ int nr_active; /* L: nr of active works */ struct list_head inactive_works; /* L: inactive works */ + struct list_head pending_node; /* LN: node on wq_node_nr_active->pending_pwqs */ struct list_head pwqs_node; /* WR: node on wq->pwqs */ struct list_head mayday_node; /* MD: node on wq->maydays */ @@ -289,9 +293,19 @@ struct wq_device; /* * Unlike in a per-cpu workqueue where max_active limits its concurrency level * on each CPU, in an unbound workqueue, max_active applies to the whole system. * As sharing a single nr_active across multiple sockets can be very expensive, * the counting and enforcement is per NUMA node. + * + * The following struct is used to enforce per-node max_active. When a pwq wants
When a pwq wants + * to start executing a work item, it should increment ->nr using + * tryinc_node_nr_active(). If acquisition fails due to ->nr already being over + * ->max, the pwq is queued on ->pending_pwqs. As in-flight work items finish + * and decrement ->nr, node_activate_pending_pwq() activates the pending pwqs in + * round-robin order. */ struct wq_node_nr_active { - atomic_t nr; /* per-node nr_active count */ + int max; /* per-node max_active */ + atomic_t nr; /* per-node nr_active */ + raw_spinlock_t lock; /* nests inside pool locks */ + struct list_head pending_pwqs; /* LN: pwqs with inactive works */ }; /* @@ -314,8 +328,12 @@ struct workqueue_struct { struct worker *rescuer; /* MD: rescue worker */ int nr_drainers; /* WQ: drain in progress */ + + /* See alloc_workqueue() function comment for info on min/max_active */ int max_active; /* WO: max active works */ + int min_active; /* WO: min active works */ int saved_max_active; /* WQ: saved max_active */ + int saved_min_active; /* WQ: saved min_active */ struct workqueue_attrs *unbound_attrs; /* PW: only for unbound wqs */ struct pool_workqueue __rcu *dfl_pwq; /* PW: only for unbound wqs */ @@ -667,6 +685,19 @@ static struct pool_workqueue *unbound_pwq(struct workqueue_struct *wq, int cpu) lockdep_is_held(&wq->mutex)); } +/** + * unbound_effective_cpumask - effective cpumask of an unbound workqueue + * @wq: workqueue of interest + * + * @wq->unbound_attrs->cpumask contains the cpumask requested by the user which + * is masked with wq_unbound_cpumask to determine the effective cpumask. The + * default pwq is always mapped to the pool with the current effective cpumask. + */ +static struct cpumask *unbound_effective_cpumask(struct workqueue_struct *wq) +{ + return unbound_pwq(wq, -1)->pool->attrs->__pod_cpumask; +} + static unsigned int work_color_to_flags(int color) { return color << WORK_STRUCT_COLOR_SHIFT; @@ -1461,6 +1492,46 @@ static struct wq_node_nr_active *wq_node_nr_active(struct workqueue_struct *wq, return wq->node_nr_active[node]; } +/** + * wq_update_node_max_active - Update per-node max_actives to use + * @wq: workqueue to update + * @off_cpu: CPU that's going down, -1 if a CPU is not going down + * + * Update @wq->node_nr_active[]->max. @wq must be unbound. max_active is + * distributed among nodes according to the proportions of numbers of online + * cpus. The result is always between @wq->min_active and max_active. 
+ */ +static void wq_update_node_max_active(struct workqueue_struct *wq, int off_cpu) +{ + struct cpumask *effective = unbound_effective_cpumask(wq); + int min_active = READ_ONCE(wq->min_active); + int max_active = READ_ONCE(wq->max_active); + int total_cpus, node; + + lockdep_assert_held(&wq->mutex); + + if (!cpumask_test_cpu(off_cpu, effective)) + off_cpu = -1; + + total_cpus = cpumask_weight_and(effective, cpu_online_mask); + if (off_cpu >= 0) + total_cpus--; + + for_each_node(node) { + int node_cpus; + + node_cpus = cpumask_weight_and(effective, cpumask_of_node(node)); + if (off_cpu >= 0 && cpu_to_node(off_cpu) == node) + node_cpus--; + + wq_node_nr_active(wq, node)->max = + clamp(DIV_ROUND_UP(max_active * node_cpus, total_cpus), + min_active, max_active); + } + + wq_node_nr_active(wq, NUMA_NO_NODE)->max = min_active; +} + /** * get_pwq - get an extra reference on the specified pool_workqueue * @pwq: pool_workqueue to get @@ -1558,35 +1629,98 @@ static bool pwq_activate_work(struct pool_workqueue *pwq, return true; } +static bool tryinc_node_nr_active(struct wq_node_nr_active *nna) +{ + int max = READ_ONCE(nna->max); + + while (true) { + int old, tmp; + + old = atomic_read(&nna->nr); + if (old >= max) + return false; + tmp = atomic_cmpxchg_relaxed(&nna->nr, old, old + 1); + if (tmp == old) + return true; + } +} + /** * pwq_tryinc_nr_active - Try to increment nr_active for a pwq * @pwq: pool_workqueue of interest + * @fill: max_active may have increased, try to increase concurrency level * * Try to increment nr_active for @pwq. Returns %true if an nr_active count is * successfully obtained. %false otherwise. */ -static bool pwq_tryinc_nr_active(struct pool_workqueue *pwq) +static bool pwq_tryinc_nr_active(struct pool_workqueue *pwq, bool fill) { struct workqueue_struct *wq = pwq->wq; struct worker_pool *pool = pwq->pool; struct wq_node_nr_active *nna = wq_node_nr_active(wq, pool->node); - bool obtained; + bool obtained = false; lockdep_assert_held(&pool->lock); - obtained = pwq->nr_active < READ_ONCE(wq->max_active); + if (!nna) { + /* per-cpu workqueue, pwq->nr_active is sufficient */ + obtained = pwq->nr_active < READ_ONCE(wq->max_active); + goto out; + } + + /* + * Unbound workqueue uses per-node shared nr_active $nna. If @pwq is + * already waiting on $nna, pwq_dec_nr_active() will maintain the + * concurrency level. Don't jump the line. + * + * We need to ignore the pending test after max_active has increased as + * pwq_dec_nr_active() can only maintain the concurrency level but not + * increase it. This is indicated by @fill. + */ + if (!list_empty(&pwq->pending_node) && likely(!fill)) + goto out; + + obtained = tryinc_node_nr_active(nna); + if (obtained) + goto out; + + /* + * Lockless acquisition failed. Lock, add ourself to $nna->pending_pwqs + * and try again. The smp_mb() is paired with the implied memory barrier + * of atomic_dec_return() in pwq_dec_nr_active() to ensure that either + * we see the decremented $nna->nr or they see non-empty + * $nna->pending_pwqs. + */ + raw_spin_lock(&nna->lock); + + if (list_empty(&pwq->pending_node)) + list_add_tail(&pwq->pending_node, &nna->pending_pwqs); + else if (likely(!fill)) + goto out_unlock; + + smp_mb(); + + obtained = tryinc_node_nr_active(nna); - if (obtained) { + /* + * If @fill, @pwq might have already been pending. Being spuriously + * pending in cold paths doesn't affect anything. Let's leave it be. 
+ */ + if (obtained && likely(!fill)) + list_del_init(&pwq->pending_node); + +out_unlock: + raw_spin_unlock(&nna->lock); +out: + if (obtained) pwq->nr_active++; - if (nna) - atomic_inc(&nna->nr); - } return obtained; } /** * pwq_activate_first_inactive - Activate the first inactive work item on a pwq * @pwq: pool_workqueue of interest + * @fill: max_active may have increased, try to increase concurrency level * * Activate the first inactive work item of @pwq if available and allowed by * max_active limit. @@ -1594,13 +1728,13 @@ static bool pwq_tryinc_nr_active(struct pool_workqueue *pwq) * Returns %true if an inactive work item has been activated. %false if no * inactive work item is found or max_active limit is reached. */ -static bool pwq_activate_first_inactive(struct pool_workqueue *pwq) +static bool pwq_activate_first_inactive(struct pool_workqueue *pwq, bool fill) { struct work_struct *work = list_first_entry_or_null(&pwq->inactive_works, struct work_struct, entry); - if (work && pwq_tryinc_nr_active(pwq)) { + if (work && pwq_tryinc_nr_active(pwq, fill)) { __pwq_activate_work(pwq, work); return true; } else { @@ -1608,11 +1742,93 @@ static bool pwq_activate_first_inactive(struct pool_workqueue *pwq) } } +/** + * node_activate_pending_pwq - Activate a pending pwq on a wq_node_nr_active + * @nna: wq_node_nr_active to activate a pending pwq for + * @caller_pool: worker_pool the caller is locking + * + * Activate a pwq in @nna->pending_pwqs. Called with @caller_pool locked. + * @caller_pool may be unlocked and relocked to lock other worker_pools. + */ +static void node_activate_pending_pwq(struct wq_node_nr_active *nna, + struct worker_pool *caller_pool) +{ + struct worker_pool *locked_pool = caller_pool; + struct pool_workqueue *pwq; + struct work_struct *work; + + lockdep_assert_held(&caller_pool->lock); + + raw_spin_lock(&nna->lock); +retry: + pwq = list_first_entry_or_null(&nna->pending_pwqs, + struct pool_workqueue, pending_node); + if (!pwq) + goto out_unlock; + + /* + * If @pwq is for a different pool than @locked_pool, we need to lock + * @pwq->pool->lock. Let's trylock first. If unsuccessful, do the unlock + * / lock dance. For that, we also need to release @nna->lock as it's + * nested inside pool locks. + */ + if (pwq->pool != locked_pool) { + raw_spin_unlock(&locked_pool->lock); + locked_pool = pwq->pool; + if (!raw_spin_trylock(&locked_pool->lock)) { + raw_spin_unlock(&nna->lock); + raw_spin_lock(&locked_pool->lock); + raw_spin_lock(&nna->lock); + goto retry; + } + } + + /* + * $pwq may not have any inactive work items due to e.g. cancellations. + * Drop it from pending_pwqs and see if there's another one. + */ + work = list_first_entry_or_null(&pwq->inactive_works, + struct work_struct, entry); + if (!work) { + list_del_init(&pwq->pending_node); + goto retry; + } + + /* + * Acquire an nr_active count and activate the inactive work item. If + * $pwq still has inactive work items, rotate it to the end of the + * pending_pwqs so that we round-robin through them. This means that + * inactive work items are not activated in queueing order which is fine + * given that there has never been any ordering across different pwqs. 
+ */ + if (likely(tryinc_node_nr_active(nna))) { + pwq->nr_active++; + __pwq_activate_work(pwq, work); + + if (list_empty(&pwq->inactive_works)) + list_del_init(&pwq->pending_node); + else + list_move_tail(&pwq->pending_node, &nna->pending_pwqs); + + /* if activating a foreign pool, make sure it's running */ + if (pwq->pool != caller_pool) + kick_pool(pwq->pool); + } + +out_unlock: + raw_spin_unlock(&nna->lock); + if (locked_pool != caller_pool) { + raw_spin_unlock(&locked_pool->lock); + raw_spin_lock(&caller_pool->lock); + } +} + /** * pwq_dec_nr_active - Retire an active count * @pwq: pool_workqueue of interest * * Decrement @pwq's nr_active and try to activate the first inactive work item. + * For unbound workqueues, this function may temporarily drop @pwq->pool->lock. */ static void pwq_dec_nr_active(struct pool_workqueue *pwq) { @@ -1632,12 +1848,29 @@ static void pwq_dec_nr_active(struct pool_workqueue *pwq) * inactive work item on @pwq itself. */ if (!nna) { - pwq_activate_first_inactive(pwq); + pwq_activate_first_inactive(pwq, false); return; } - atomic_dec(&nna->nr); - pwq_activate_first_inactive(pwq); + /* + * If @pwq is for an unbound workqueue, it's more complicated because + * multiple pwqs and pools may be sharing the nr_active count. When a + * pwq needs to wait for an nr_active count, it puts itself on + * $nna->pending_pwqs. The following atomic_dec_return()'s implied + * memory barrier is paired with smp_mb() in pwq_tryinc_nr_active() to + * guarantee that either we see non-empty pending_pwqs or they see + * decremented $nna->nr. + * + * $nna->max may change as CPUs come online/offline and @pwq->wq's + * max_active gets updated. However, it is guaranteed to be equal to or + * larger than @pwq->wq->min_active which is above zero unless freezing. + * This maintains the forward progress guarantee. + */ + if (atomic_dec_return(&nna->nr) >= READ_ONCE(nna->max)) + return; + + if (!list_empty(&nna->pending_pwqs)) + node_activate_pending_pwq(nna, pool); } /** @@ -1965,7 +2198,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, * @work must also queue behind existing inactive work items to maintain * ordering when max_active changes. See wq_adjust_max_active(). */ - if (list_empty(&pwq->inactive_works) && pwq_tryinc_nr_active(pwq)) { + if (list_empty(&pwq->inactive_works) && pwq_tryinc_nr_active(pwq, false)) { if (list_empty(&pool->worklist)) pool->watchdog_ts = jiffies; @@ -3200,7 +3433,7 @@ static void insert_wq_barrier(struct pool_workqueue *pwq, barr->task = current; - /* The barrier work item does not participate in pwq->nr_active. */ + /* The barrier work item does not participate in nr_active. 
*/ work_flags |= WORK_STRUCT_INACTIVE; /* @@ -4116,6 +4349,8 @@ static void free_node_nr_active(struct wq_node_nr_active **nna_ar) static void init_node_nr_active(struct wq_node_nr_active *nna) { atomic_set(&nna->nr, 0); + raw_spin_lock_init(&nna->lock); + INIT_LIST_HEAD(&nna->pending_pwqs); } /* @@ -4355,6 +4590,15 @@ static void pwq_release_workfn(struct kthread_work *work) mutex_unlock(&wq_pool_mutex); } + if (!list_empty(&pwq->pending_node)) { + struct wq_node_nr_active *nna = + wq_node_nr_active(pwq->wq, pwq->pool->node); + + raw_spin_lock_irq(&nna->lock); + list_del_init(&pwq->pending_node); + raw_spin_unlock_irq(&nna->lock); + } + call_rcu(&pwq->rcu, rcu_free_pwq); /* @@ -4380,6 +4624,7 @@ static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq, pwq->flush_color = -1; pwq->refcnt = 1; INIT_LIST_HEAD(&pwq->inactive_works); + INIT_LIST_HEAD(&pwq->pending_node); INIT_LIST_HEAD(&pwq->pwqs_node); INIT_LIST_HEAD(&pwq->mayday_node); kthread_init_work(&pwq->release_work, pwq_release_workfn); @@ -4587,6 +4832,9 @@ static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx) ctx->pwq_tbl[cpu]); ctx->dfl_pwq = install_unbound_pwq(ctx->wq, -1, ctx->dfl_pwq); + /* update node_nr_active->max */ + wq_update_node_max_active(ctx->wq, -1); + mutex_unlock(&ctx->wq->mutex); } @@ -4850,24 +5098,35 @@ static int init_rescuer(struct workqueue_struct *wq) static void wq_adjust_max_active(struct workqueue_struct *wq) { bool activated; + int new_max, new_min; lockdep_assert_held(&wq->mutex); if ((wq->flags & WQ_FREEZABLE) && workqueue_freezing) { - WRITE_ONCE(wq->max_active, 0); - return; + new_max = 0; + new_min = 0; + } else { + new_max = wq->saved_max_active; + new_min = wq->saved_min_active; } - if (wq->max_active == wq->saved_max_active) + if (wq->max_active == new_max && wq->min_active == new_min) return; /* - * Update @wq->max_active and then kick inactive work items if more + * Update @wq->max/min_active and then kick inactive work items if more * active work items are allowed. This doesn't break work item ordering * because new work items are always queued behind existing inactive * work items if there are any. */ - WRITE_ONCE(wq->max_active, wq->saved_max_active); + WRITE_ONCE(wq->max_active, new_max); + WRITE_ONCE(wq->min_active, new_min); + + if (wq->flags & WQ_UNBOUND) + wq_update_node_max_active(wq, -1); + + if (new_max == 0) + return; /* * Round-robin through pwq's activating the first inactive work item @@ -4882,7 +5141,7 @@ static void wq_adjust_max_active(struct workqueue_struct *wq) /* can be called during early boot w/ irq disabled */ raw_spin_lock_irqsave(&pwq->pool->lock, flags); - if (pwq_activate_first_inactive(pwq)) { + if (pwq_activate_first_inactive(pwq, true)) { activated = true; kick_pool(pwq->pool); } @@ -4944,7 +5203,9 @@ struct workqueue_struct *alloc_workqueue(const char *fmt, /* init wq */ wq->flags = flags; wq->max_active = max_active; - wq->saved_max_active = max_active; + wq->min_active = min(max_active, WQ_DFL_MIN_ACTIVE); + wq->saved_max_active = wq->max_active; + wq->saved_min_active = wq->min_active; mutex_init(&wq->mutex); atomic_set(&wq->nr_pwqs_to_flush, 0); INIT_LIST_HEAD(&wq->pwqs); @@ -5110,7 +5371,8 @@ EXPORT_SYMBOL_GPL(destroy_workqueue); * @wq: target workqueue * @max_active: new max_active value. * - * Set max_active of @wq to @max_active. + * Set max_active of @wq to @max_active. See the alloc_workqueue() function + * comment. * * CONTEXT: * Don't call from IRQ context. 
@@ -5127,6 +5389,9 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) wq->flags &= ~__WQ_ORDERED; wq->saved_max_active = max_active; + if (wq->flags & WQ_UNBOUND) + wq->saved_min_active = min(wq->saved_min_active, max_active); + wq_adjust_max_active(wq); mutex_unlock(&wq->mutex); @@ -5808,6 +6073,10 @@ int workqueue_online_cpu(unsigned int cpu) for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]]) wq_update_pod(wq, tcpu, cpu, true); + + mutex_lock(&wq->mutex); + wq_update_node_max_active(wq, -1); + mutex_unlock(&wq->mutex); } } @@ -5836,6 +6105,10 @@ int workqueue_offline_cpu(unsigned int cpu) for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]]) wq_update_pod(wq, tcpu, cpu, false); + + mutex_lock(&wq->mutex); + wq_update_node_max_active(wq, cpu); + mutex_unlock(&wq->mutex); } } mutex_unlock(&wq_pool_mutex); @@ -7127,8 +7400,12 @@ void __init workqueue_init_topology(void) * combinations to apply per-pod sharing. */ list_for_each_entry(wq, &workqueues, list) { - for_each_online_cpu(cpu) { + for_each_online_cpu(cpu) wq_update_pod(wq, cpu, cpu, true); + if (wq->flags & WQ_UNBOUND) { + mutex_lock(&wq->mutex); + wq_update_node_max_active(wq, -1); + mutex_unlock(&wq->mutex); } } -- GitLab From 07daa99b7fd7adfffa22180184e39ec124e73013 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 29 Jan 2024 08:11:25 -1000 Subject: [PATCH 0126/1519] tools/workqueue/wq_dump.py: Add node_nr/max_active dump Print out per-node nr/max_active numbers to improve visibility into node_nr_active operations. Signed-off-by: Tejun Heo --- tools/workqueue/wq_dump.py | 41 +++++++++++++++++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/tools/workqueue/wq_dump.py b/tools/workqueue/wq_dump.py index 333b2fc00b829..bd381511bd9a2 100644 --- a/tools/workqueue/wq_dump.py +++ b/tools/workqueue/wq_dump.py @@ -50,6 +50,7 @@ import drgn from drgn.helpers.linux.list import list_for_each_entry,list_empty from drgn.helpers.linux.percpu import per_cpu_ptr from drgn.helpers.linux.cpumask import for_each_cpu,for_each_possible_cpu +from drgn.helpers.linux.nodemask import for_each_node from drgn.helpers.linux.idr import idr_for_each import argparse @@ -107,7 +108,6 @@ WQ_AFFN_NUMA = prog['WQ_AFFN_NUMA'] WQ_AFFN_SYSTEM = prog['WQ_AFFN_SYSTEM'] WQ_NAME_LEN = prog['WQ_NAME_LEN'].value_() - cpumask_str_len = len(cpumask_str(wq_unbound_cpumask)) print('Affinity Scopes') @@ -205,3 +205,42 @@ for wq in list_for_each_entry('struct workqueue_struct', workqueues.address_of_( print(f' {wq.rescuer.task.pid.value_():6}', end='') print(f' {cpumask_str(wq.rescuer.task.cpus_ptr):{rcpus_len}}', end='') print('') + +print('') +print('Unbound workqueue -> node_nr/max_active') +print('=======================================') + +if 'node_to_cpumask_map' in prog: + __cpu_online_mask = prog['__cpu_online_mask'] + node_to_cpumask_map = prog['node_to_cpumask_map'] + nr_node_ids = prog['nr_node_ids'].value_() + + print(f'online_cpus={cpumask_str(__cpu_online_mask.address_of_())}') + for node in for_each_node(): + print(f'NODE[{node:02}]={cpumask_str(node_to_cpumask_map[node])}') + print('') + + print(f'[{"workqueue":^{WQ_NAME_LEN-2}}\\ min max', end='') + first = True + for node in for_each_node(): + if first: + print(f' NODE {node}', end='') + first = False + else: + print(f' {node:7}', end='') + print(f' {"dfl":>7} ]') + print('') + + for wq in list_for_each_entry('struct workqueue_struct', workqueues.address_of_(), 'list'): + if not (wq.flags & WQ_UNBOUND): + continue + + 
+        print(f'{wq.name.string_().decode():{WQ_NAME_LEN}} ', end='')
+        print(f'{wq.min_active.value_():3} {wq.max_active.value_():3}', end='')
+        for node in for_each_node():
+            nna = wq.node_nr_active[node]
+            print(f' {nna.nr.counter.value_():3}/{nna.max.value_():3}', end='')
+        nna = wq.node_nr_active[nr_node_ids]
+        print(f' {nna.nr.counter.value_():3}/{nna.max.value_():3}')
+else:
+    print('node_to_cpumask_map not present, is NUMA enabled?')
--
GitLab

From aae17ebb53cd3da37f5dfbde937acd091eb4340c Mon Sep 17 00:00:00 2001
From: Leonardo Bras
Date: Mon, 29 Jan 2024 22:00:46 -0300
Subject: [PATCH 0127/1519] workqueue: Avoid using isolated cpus' timers on
 queue_delayed_work

When __queue_delayed_work() is called, it chooses a cpu for handling the
timer interrupt. As of today, it will pick either the cpu passed as
parameter or the last cpu used for this.

This is not good if a system uses CPU isolation, because it can take away
some valuable cpu time to:
1 - deal with the timer interrupt,
2 - schedule-out the desired task,
3 - queue work on a random workqueue, and
4 - schedule the desired task back to the cpu.

To fix this, during __queue_delayed_work(), if cpu isolation is in place,
pick a housekeeping (non-isolated) cpu to handle the timer interrupt.

As an optimization, if the current cpu is not isolated, use it instead of
looking for another candidate.

Signed-off-by: Leonardo Bras
Signed-off-by: Tejun Heo
---
 kernel/workqueue.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 903be39bd2d19..9221a4c57ae1c 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2362,10 +2362,18 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
 	dwork->cpu = cpu;
 	timer->expires = jiffies + delay;

-	if (unlikely(cpu != WORK_CPU_UNBOUND))
+	if (housekeeping_enabled(HK_TYPE_TIMER)) {
+		/* If the current cpu is a housekeeping cpu, use it. */
+		cpu = smp_processor_id();
+		if (!housekeeping_test_cpu(cpu, HK_TYPE_TIMER))
+			cpu = housekeeping_any_cpu(HK_TYPE_TIMER);
 		add_timer_on(timer, cpu);
-	else
-		add_timer(timer);
+	} else {
+		if (likely(cpu == WORK_CPU_UNBOUND))
+			add_timer(timer);
+		else
+			add_timer_on(timer, cpu);
+	}
 }

 /**
--
GitLab

From efd7def00406ac57400501cdc76d0d95d4691927 Mon Sep 17 00:00:00 2001
From: Thomas Zimmermann
Date: Fri, 12 Jan 2024 10:44:36 +0100
Subject: [PATCH 0128/1519] x86/setup: Move UAPI setup structures into
 setup_data.h

The type definition of struct pci_setup_rom in <asm/pci.h> requires
struct setup_data from <asm/bootparam.h>. Many drivers include
<asm/pci.h>, but do not use boot parameters. Changes to bootparam.h or
its included header files could easily trigger a large, unnecessary
rebuild of the kernel.

Moving struct setup_data and related code into its own header file avoids
including <asm/bootparam.h> in <asm/pci.h>. Instead include the new
header <asm/setup_data.h> and remove the include statement for
x86_init.h, which is unnecessary but pulls in bootparam.h.
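For reference, the list node being moved is shown below (a sketch abridged
from the uapi definition; the new header also carries struct setup_indirect
and the SETUP_* type constants visible in the diff below):

/* extensible setup data list node */
struct setup_data {
	__u64 next;	/* physical address of the next node; 0 ends the list */
	__u32 type;	/* payload type, e.g. SETUP_E820_EXT or SETUP_PCI */
	__u32 len;	/* length of data[] in bytes */
	__u8  data[];
};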
Suggested-by: Ard Biesheuvel
Signed-off-by: Thomas Zimmermann
Signed-off-by: Borislav Petkov (AMD)
Reviewed-by: Ard Biesheuvel
Link: https://lore.kernel.org/r/20240112095000.8952-2-tzimmermann@suse.de
---
 arch/x86/include/asm/pci.h             |  2 +-
 arch/x86/include/uapi/asm/bootparam.h  | 72 +---------------------
 arch/x86/include/uapi/asm/setup_data.h | 83 ++++++++++++++++++++++++++
 3 files changed, 85 insertions(+), 72 deletions(-)
 create mode 100644 arch/x86/include/uapi/asm/setup_data.h

diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index b40c462b4af36..f6100df3652e1 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -10,7 +10,7 @@
 #include <linux/numa.h>
 #include <asm/io.h>
 #include <asm/memtype.h>
-#include <asm/x86_init.h>
+#include <asm/setup_data.h>

 struct pci_sysdata {
 	int domain;		/* PCI domain */
diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h
index 01d19fc223463..4a38e7917756f 100644
--- a/arch/x86/include/uapi/asm/bootparam.h
+++ b/arch/x86/include/uapi/asm/bootparam.h
@@ -2,21 +2,7 @@
 #ifndef _ASM_X86_BOOTPARAM_H
 #define _ASM_X86_BOOTPARAM_H

-/* setup_data/setup_indirect types */
-#define SETUP_NONE			0
-#define SETUP_E820_EXT			1
-#define SETUP_DTB			2
-#define SETUP_PCI			3
-#define SETUP_EFI			4
-#define SETUP_APPLE_PROPERTIES		5
-#define SETUP_JAILHOUSE			6
-#define SETUP_CC_BLOB			7
-#define SETUP_IMA			8
-#define SETUP_RNG_SEED			9
-#define SETUP_ENUM_MAX			SETUP_RNG_SEED
-
-#define SETUP_INDIRECT			(1<<31)
-#define SETUP_TYPE_MAX			(SETUP_ENUM_MAX | SETUP_INDIRECT)
+#include <asm/setup_data.h>

 /* ram_size flags */
 #define RAMDISK_IMAGE_START_MASK	0x07FF
@@ -48,22 +34,6 @@

 #include <linux/types.h>
 #include <linux/screen_info.h>
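For context, consumers of struct setup_data walk a physically linked list
rooted at boot_params.hdr.setup_data, which is why so many headers ended up
pulling in the boot-parameter definitions. A minimal sketch of that
traversal (modeled on parse_setup_data() in arch/x86/kernel/setup.c;
assumes an early-boot context where early_memremap() is usable, error
handling omitted):

	u64 pa_data = boot_params.hdr.setup_data;

	while (pa_data) {
		struct setup_data *data;

		/* map only the header; data->len gives the payload size */
		data = early_memremap(pa_data, sizeof(*data));
		pa_data = data->next;	/* 0 terminates the list */
		early_memunmap(data, sizeof(*data));
	}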