
[PATCH 3.11 006/137] fuse: timeout comparison fix


Luis Henriques

Aug 18, 2014, 5:40:02 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Miklos Szeredi <msze...@suse.cz>

commit 126b9d4365b110c157bc4cbc32540dfa66c9c85a upstream.

As suggested by checkpatch.pl, use time_before64() instead of direct
comparison of jiffies64 values.
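
For context, a minimal sketch (not from the patch) of the idiom. In
jiffies.h, time_before64() is defined roughly as

	#define time_before64(a, b)	((__s64)((a) - (b)) < 0)

so the signed difference stays correct across a wrap of the counter,
whereas a plain '<' inverts its meaning at the wrap point:

	u64 now = (u64)-10;		/* counter about to wrap */
	u64 expires = now + 100;	/* wraps to a small value */

	/* expires < now               -> true: looks already expired */
	/* time_before64(expires, now) -> false: correctly "not yet"  */

(A 64-bit jiffies counter will not wrap in practice, so this is mainly
about using the canonical, checkpatch-clean idiom.)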

Signed-off-by: Miklos Szeredi <msze...@suse.cz>
[ luis: backported to 3.11: adjusted context ]
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
fs/fuse/dir.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 8fec28ff4a0d..ac00468b93d5 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -186,7 +186,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
inode = ACCESS_ONCE(entry->d_inode);
if (inode && is_bad_inode(inode))
return 0;
- else if (fuse_dentry_time(entry) < get_jiffies_64()) {
+ else if (time_before64(fuse_dentry_time(entry), get_jiffies_64())) {
int err;
struct fuse_entry_out outarg;
struct fuse_req *req;
@@ -937,7 +937,7 @@ int fuse_update_attributes(struct inode *inode, struct kstat *stat,
int err;
bool r;

- if (fi->i_time < get_jiffies_64()) {
+ if (time_before64(fi->i_time, get_jiffies_64())) {
r = true;
err = fuse_do_getattr(inode, stat, file);
} else {
@@ -1121,7 +1121,7 @@ static int fuse_permission(struct inode *inode, int mask)
((mask & MAY_EXEC) && S_ISREG(inode->i_mode))) {
struct fuse_inode *fi = get_fuse_inode(inode);

- if (fi->i_time < get_jiffies_64()) {
+ if (time_before64(fi->i_time, get_jiffies_64())) {
refreshed = true;

err = fuse_perm_getattr(inode, mask);
--
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majo...@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Luis Henriques

Aug 18, 2014, 5:40:03 AM
This is the start of the review cycle for the Linux 3.11.10.15 stable kernel.

PLEASE NOTE:

This will be the LAST 3.11.y.z extended stable kernel release.

This version contains 137 new patches, summarized below. The new patches are
posted as replies to this message and also available in this git branch:

http://kernel.ubuntu.com/git?p=ubuntu/linux.git;h=linux-3.11.y-review;a=shortlog

git://kernel.ubuntu.com/ubuntu/linux.git linux-3.11.y-review

The review period for version 3.11.10.15 will be open for the next three days.
To report a problem, please reply to the relevant follow-up patch message.

For more information about the Linux 3.11.y.z extended stable kernel version,
see https://wiki.ubuntu.com/Kernel/Dev/ExtendedStable .

-Luis

--
Documentation/x86/x86_64/mm.txt | 2 +
Makefile | 2 +
arch/arm/mm/idmap.c | 7 +
arch/parisc/include/uapi/asm/signal.h | 2 -
arch/powerpc/perf/core-book3s.c | 6 +-
arch/s390/kernel/ptrace.c | 9 +-
arch/sparc/include/asm/pgtable_64.h | 6 +-
arch/sparc/include/asm/tlbflush_64.h | 12 +-
arch/sparc/kernel/ldc.c | 2 +-
arch/sparc/kernel/smp_64.c | 6 +-
arch/sparc/kernel/sys32.S | 2 +-
arch/sparc/kernel/unaligned_64.c | 12 +-
arch/sparc/lib/NG2memcpy.S | 1 +
arch/sparc/math-emu/math_32.c | 2 +-
arch/sparc/mm/fault_64.c | 102 ++++++------
arch/sparc/mm/init_64.c | 27 +++
arch/sparc/mm/tsb.c | 14 +-
arch/x86/Kconfig | 25 ++-
arch/x86/boot/header.S | 26 ++-
arch/x86/boot/tools/build.c | 37 ++++-
arch/x86/include/asm/espfix.h | 16 ++
arch/x86/include/asm/irqflags.h | 2 +-
arch/x86/include/asm/pgtable_64_types.h | 2 +
arch/x86/include/asm/setup.h | 2 +
arch/x86/kernel/Makefile | 1 +
arch/x86/kernel/cpu/perf_event_intel.c | 9 +
arch/x86/kernel/entry_32.S | 21 ++-
arch/x86/kernel/entry_64.S | 77 ++++++++-
arch/x86/kernel/espfix_64.c | 208 +++++++++++++++++++++++
arch/x86/kernel/ldt.c | 10 +-
arch/x86/kernel/paravirt_patch_64.c | 2 -
arch/x86/kernel/smpboot.c | 7 +
arch/x86/mm/dump_pagetables.c | 40 +++--
arch/x86/vdso/vdso32-setup.c | 8 -
block/blk-cgroup.c | 7 +
block/blk-tag.c | 33 +---
block/compat_ioctl.c | 1 +
crypto/af_alg.c | 2 +
drivers/ata/ahci.c | 1 +
drivers/ata/libata-core.c | 12 +-
drivers/bluetooth/hci_h5.c | 1 +
drivers/gpu/drm/qxl/qxl_irq.c | 3 +
drivers/gpu/drm/radeon/atombios_encoders.c | 10 +-
drivers/gpu/drm/radeon/cik.c | 1 +
drivers/gpu/drm/radeon/evergreen.c | 1 +
drivers/gpu/drm/radeon/r600.c | 1 +
drivers/gpu/drm/radeon/radeon_display.c | 5 +
drivers/gpu/drm/radeon/si.c | 1 +
drivers/hv/hv_kvp.c | 16 +-
drivers/hv/hv_util.c | 2 +-
drivers/hwmon/adt7470.c | 6 +-
drivers/hwmon/da9052-hwmon.c | 2 +-
drivers/hwmon/da9055-hwmon.c | 2 +-
drivers/hwmon/smsc47m192.c | 4 +-
drivers/iio/industrialio-buffer.c | 2 +-
drivers/input/input.c | 6 +-
drivers/irqchip/irq-gic.c | 6 +-
drivers/md/dm-cache-metadata.c | 9 +
drivers/md/dm-cache-target.c | 13 +-
drivers/md/dm-thin-metadata.c | 9 +
drivers/media/dvb-frontends/tda10071.c | 6 +-
drivers/media/usb/gspca/pac7302.c | 1 +
drivers/media/usb/hdpvr/hdpvr-video.c | 6 +-
drivers/net/can/c_can/c_can_platform.c | 3 +-
drivers/net/ethernet/broadcom/bnx2x/bnx2x.h | 1 +
drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c | 12 +-
drivers/net/ethernet/emulex/benet/be_main.c | 2 +-
drivers/net/ethernet/intel/igb/igb_main.c | 2 +
drivers/net/ethernet/marvell/mvneta.c | 211 +++++++++++++-----------
drivers/net/ethernet/sun/sunvnet.c | 20 ++-
drivers/net/macvlan.c | 1 +
drivers/net/ppp/pppoe.c | 2 +-
drivers/net/ppp/pptp.c | 2 +-
drivers/net/usb/qmi_wwan.c | 3 +
drivers/net/wireless/iwlwifi/dvm/rxon.c | 12 --
drivers/net/wireless/mwifiex/main.c | 1 +
drivers/rapidio/devices/tsi721_dma.c | 8 +-
drivers/sbus/char/bbc_envctrl.c | 6 +
drivers/sbus/char/bbc_i2c.c | 11 +-
drivers/scsi/scsi_lib.c | 8 +
drivers/staging/vt6655/bssdb.c | 2 +-
drivers/staging/vt6655/device_main.c | 7 +-
drivers/tty/serial/sunsab.c | 9 +
drivers/usb/chipidea/udc.c | 4 +-
drivers/usb/core/hub.c | 19 +++
drivers/xen/manage.c | 5 +-
fs/coredump.c | 2 +-
fs/fuse/dir.c | 7 +-
fs/fuse/inode.c | 20 ++-
fs/namei.c | 183 ++++++++++++++++++++
fs/namespace.c | 2 +-
include/linux/libata.h | 1 +
include/linux/namei.h | 1 +
include/linux/printk.h | 6 +-
include/net/inetpeer.h | 16 +-
include/net/ip.h | 31 ++--
include/net/ipv6.h | 11 +-
include/net/secure_seq.h | 2 -
include/net/sock.h | 16 +-
init/main.c | 4 +
kernel/power/process.c | 1 +
kernel/printk/printk.c | 2 +-
kernel/sched/core.c | 2 +-
kernel/sched/debug.c | 2 +-
kernel/sched/rt.c | 2 +-
kernel/time/alarmtimer.c | 20 ++-
kernel/time/clockevents.c | 10 +-
kernel/trace/ftrace.c | 4 +-
kernel/trace/ring_buffer.c | 4 -
kernel/trace/trace.c | 20 ++-
kernel/trace/trace_clock.c | 9 +-
lib/btree.c | 1 +
mm/kmemleak.c | 4 +-
mm/page_alloc.c | 16 +-
mm/shmem.c | 104 ++++++++++--
mm/slab_common.c | 2 +
mm/vmalloc.c | 14 +-
net/8021q/vlan_core.c | 5 +-
net/appletalk/ddp.c | 3 -
net/compat.c | 9 +-
net/core/dst.c | 16 +-
net/core/iovec.c | 10 +-
net/core/secure_seq.c | 25 ---
net/core/skbuff.c | 2 +-
net/dns_resolver/dns_query.c | 4 +-
net/ipv4/icmp.c | 2 -
net/ipv4/igmp.c | 14 +-
net/ipv4/inetpeer.c | 18 --
net/ipv4/ip_options.c | 4 +
net/ipv4/ip_output.c | 7 +-
net/ipv4/ip_tunnel.c | 12 +-
net/ipv4/ip_tunnel_core.c | 2 +-
net/ipv4/ipmr.c | 2 +-
net/ipv4/raw.c | 2 +-
net/ipv4/route.c | 84 ++++++----
net/ipv4/tcp.c | 3 +-
net/ipv4/tcp_input.c | 10 +-
net/ipv4/tcp_output.c | 6 +-
net/ipv4/tcp_vegas.c | 3 +-
net/ipv4/tcp_veno.c | 2 +-
net/ipv4/xfrm4_mode_tunnel.c | 2 +-
net/ipv6/addrconf.c | 14 +-
net/ipv6/ip6_output.c | 17 ++
net/ipv6/output_core.c | 24 ---
net/l2tp/l2tp_ppp.c | 4 +-
net/mac80211/tx.c | 26 +--
net/netfilter/ipvs/ip_vs_xmit.c | 2 +-
net/netlink/af_netlink.c | 4 +-
net/sctp/associola.c | 1 +
net/sctp/output.c | 2 +-
net/sctp/sysctl.c | 3 +-
net/sctp/ulpevent.c | 122 ++------------
net/tipc/bcast.c | 1 +
net/wireless/trace.h | 3 +-
154 files changed, 1519 insertions(+), 697 deletions(-)

Abbas Raza (1):
usb: chipidea: udc: Disable auto ZLP generation on ep0

Alex Deucher (2):
drm/radeon: avoid leaking edid data
drm/radeon: set default bl level to something reasonable

Alexandre Bounine (1):
rapidio/tsi721_dma: fix failure to obtain transaction descriptor

Amitkumar Karwar (1):
mwifiex: fix Tx timeout issue

Anand Avati (1):
fuse: ignore entry-timeout on LOOKUP_REVAL

Andrey Ryabinin (1):
net: sendmsg: fix NULL pointer dereference

Andrey Utkin (2):
appletalk: Fix socket referencing in skb
arch/sparc/math-emu/math_32.c: drop stray break operator

Andy Lutomirski (1):
x86_64/entry/xen: Do not invoke espfix64 on Xen

Anssi Hannula (1):
dm cache: fix race affecting dirty block count

Antti Palosaari (1):
[media] tda10071: force modulation to QPSK on DVB-S

Axel Lin (2):
hwmon: (da9052) Don't use dash in the name attribute
hwmon: (da9055) Don't use dash in the name attribute

Ben Hutchings (1):
dns_resolver: Null-terminate the right string

Ben Pfaff (1):
netlink: Fix handling of error from netlink_dump().

Bernd Wachter (1):
net: qmi_wwan: Add ID for Telewell TW-LTE 4G v2

Bjørn Mork (1):
net: qmi_wwan: add two Sierra Wireless/Netgear devices

Boris Ostrovsky (1):
x86/espfix/xen: Fix allocation of pages for paravirt page tables

Catalin Marinas (1):
mm: kmemleak: avoid false negatives on vmalloc'ed objects

Christian König (1):
drm/radeon: fix irq ring buffer overflow handling

Christoph Hellwig (1):
block: don't assume last put of shared tags is for the host

Christoph Lameter (1):
slab_common: Do not check for duplicate slab names

Christoph Paasch (3):
tcp: Fix divide by zero when pushing during tcp-repair
tcp: Fix integer-overflows in TCP veno
tcp: Fix integer-overflow in TCP vegas

Christoph Schulz (1):
net: pppoe: use correct channel MTU when using Multilink PPP

Christopher Alexander Tobias Schulze (2):
bbc-i2c: Fix BBC I2C envctrl on SunBlade 2000
sunsab: Fix detection of BREAK on sunsab serial console

Daniel Borkmann (3):
net: sctp: check proc_dointvec result in proc_sctp_do_auth
net: sctp: fix information leaks in ulpevent layer
net: sctp: inherit auth_capable on INIT collisions

David Rientjes (1):
mm, thp: do not allow thp faults to avoid cpuset restrictions

David S. Miller (8):
sparc64: Fix argument sign extension for compat_sys_futex().
sparc64: Handle 32-bit tasks properly in compute_effective_address().
sparc64: Fix top-level fault handling bugs.
sparc64: Don't bark so loudly about 32-bit tasks generating 64-bit fault addresses.
sparc64: Fix huge TSB mapping on pre-UltraSPARC-III cpus.
sparc64: Add membar to Niagara2 memcpy code.
sparc64: Do not insert non-valid PTEs into the TSB hash table.
sparc64: Guard against flushing openfirmware mappings.

David Vrabel (1):
xen/manage: fix potential deadlock when resuming the console

Dmitry Kravkov (1):
bnx2x: fix crash during TSO tunneling

Dmitry Popov (1):
ip_tunnel: fix ip_tunnel_lookup

Dmitry Torokhov (1):
Input: fix defuzzing logic

Edward Allcutt (1):
ipv4: icmp: Fix pMTU handling for rare case

Eliad Peller (1):
cfg80211: fix mic_failure tracing

Emmanuel Grumbach (1):
iwlwifi: dvm: don't enable CTS to self

Eric Dumazet (8):
ipv4: fix dst race in sk_dst_get()
ipv4: irq safe sk_dst_[re]set() and ipv4_sk_update_pmtu() fix
net: fix sparse warning in sk_dst_set()
bnx2x: fix possible panic under memory stress
ipv4: fix buffer overflow in ip_options_compile()
inetpeer: get rid of ip_id_count
ip: make IP identifiers less predictable
sctp: fix possible seqlock seadlock in sctp_packet_transmit()

Gao feng (1):
ipv6: reallocate addrconf router for ipv6 address when lo device up

Gavin Guo (1):
usb: Check if port status is equal to RxDetect

George Cherian (1):
can: c_can_platform: Fix raminit, use devm_ioremap() instead of devm_ioremap_resource()

Guenter Roeck (2):
hwmon: (adt7470) Fix writes to temperature limit registers
hwmon: (smsc47m192) Fix temperature limit and vrm write operations

H. Peter Anvin (6):
Revert "x86-64, modify_ldt: Make support for 16-bit segments a runtime option"
x86-64, espfix: Don't leak bits 31:16 of %esp returning to 16-bit stack
x86, espfix: Move espfix definitions into a separate header file
x86, espfix: Fix broken header guard
x86, espfix: Make espfix64 a Kconfig option, fix UML
x86, espfix: Make it possible to disable 16-bit support

HATAYAMA Daisuke (1):
perf/x86/intel: ignore CondChgd bit to avoid false NMI handling

Hans Verkuil (1):
[media] hdpvr: fix two audio bugs

Hans de Goede (1):
[media] gspca_pac7302: Add new usb-id for Genius i-Look 317

Hugh Dickins (3):
shmem: fix faulting into a hole while it's punched
shmem: fix faulting into a hole, not taking i_mutex
shmem: fix splicing from a hole while it's punched

James Bottomley (1):
scsi: handle flush errors properly

Jan Kara (1):
timer: Fix lock inversion between hrtimer_bases.lock and scheduler locks

Jason Wang (1):
drm/qxl: return IRQ_NONE if it was not our irq

Jeff Layton (1):
vfs: allow umount to handle mountpoints without revalidating them

Johannes Berg (1):
Revert "mac80211: move "bufferable MMPDU" check to fix AP mode scan"

John David Anglin (1):
parisc: Remove SA_RESTORER define

John Stultz (2):
alarmtimer: Fix bug where relative alarm timers were treated as absolute
printk: rename printk_sched to printk_deferred

Jon Paul Maloy (1):
tipc: clear 'next'-pointer of message fragments before reassembly

K. Y. Srinivasan (1):
Drivers: hv: util: Fix a bug in the KVP code

Kevin Hao (1):
libata: support the ata host which implements a queue depth less than 32

Kirill Tkhai (1):
sparc64: Make itc_sync_lock raw

Konstantin Khlebnikov (1):
ARM: 8115/1: LPAE: reduce damage caused by idmap to virtual memory layout

Lars-Peter Clausen (1):
iio: buffer: Fix demux table creation

Li RongQing (1):
8021q: fix a potential memory leak

Linus Torvalds (1):
Fix gcc-4.9.0 miscompilation of load_balance() in scheduler

Loic Poulain (1):
Bluetooth: Ignore H5 non-link packets in non-active state

Malcolm Priestley (2):
staging: vt6655: Fix Warning on boot handle_irq_event_percpu.
staging: vt6655: Fix disassociated messages every 10 seconds

Manuel Schölling (1):
dns_resolver: assure that dns_query() result is null-terminated

Martin Lau (1):
ring-buffer: Fix polling on trace_pipe

Martin Schwidefsky (1):
s390/ptrace: fix PSW mask check

Mateusz Guzik (1):
sched: Fix possible divide by zero in avg_atom() calculation

Matthias Brugger (1):
irqchip: gic: Add support for cortex a7 compatible string

Michael Brown (1):
x86/efi: Include a .bss section within the PE/COFF headers

Michael Ellerman (1):
powerpc/perf: Fix MMCR2 handling for EBB

Mike Snitzer (2):
dm thin metadata: do not allow the data block size to change
dm cache metadata: do not allow the data block size to change

Miklos Szeredi (2):
fuse: timeout comparison fix
fuse: handle large user and group ID

Mikulas Patocka (2):
slab_common: fix the check for duplicate slab names
block: provide compat ioctl for BLKZEROOUT

Milan Broz (1):
crypto: af_alg - properly label AF_ALG socket

Minfei Huang (1):
lib/btree.c: fix leak of whole btree nodes

Neal Cardwell (1):
tcp: fix tcp_match_skb_to_sack() for unaligned SACK at end of an skb

Romain Degez (1):
ahci: add support for the Promise FastTrak TX8660 SATA HBA (ahci mode)

Sasha Levin (2):
net/l2tp: don't fall back on UDP [get|set]sockopt
iovec: make sure the caller actually wants anything in memcpy_fromiovecend

Silesh C V (1):
coredump: fix the setting of PF_DUMPCORE

Sowmini Varadhan (2):
sunvnet: clean up objects created in vnet_new() on vnet_exit()
sparc64: ldc_connect() should not return EINVAL when handshake is in progress.

Stefan Assmann (1):
igb: do a reset on SR-IOV re-init if device is down

Steven Rostedt (Red Hat) (1):
tracing: Fix graph tracer with stack tracer on other archs

Suresh Reddy (1):
be2net: set EQ DB clear-intr bit in be_open()

Sven Wegener (1):
x86_32, entry: Store badsys error code in %eax

Takashi Iwai (1):
PM / sleep: Fix request_firmware() error at resume

Tejun Heo (2):
blkcg: don't call into policy draining if root_blkg is already gone
libata: introduce ata_host->n_tags to avoid oops on SAS controllers

Thomas Fitzsimmons (1):
net: mvneta: Fix big endian issue in mvneta_txq_desc_csum()

Thomas Petazzoni (1):
net: mvneta: fix operation in 10 Mbit/s mode

Tomasz Figa (1):
irqchip: gic: Fix core ID calculation when topology is read from DT

Tony Luck (1):
tracing: Fix wraparound problems in "uptime" trace clock

Vasily Averin (1):
fs: umount on symlink leaks mnt count

Vlad Yasevich (2):
macvlan: Initialize vlan_features to turn on offload support.
net: Correctly set segment mac_len in skb_segment().

Yuchung Cheng (1):
tcp: fix false undo corner cases

dingtianhong (1):
igmp: fix the problem when mc leave group

willy tarreau (5):
net: mvneta: increase the 64-bit rx/tx stats out of the hot path
net: mvneta: use per_cpu stats to fix an SMP lock up
net: mvneta: do not schedule in mvneta_tx_timeout
net: mvneta: add missing bit descriptions for interrupt masks and causes
net: mvneta: replace Tx timer with a real interrupt

zhangwei(Jovi) (2):
tracing: Add ftrace_trace_stack into __trace_puts/__trace_bputs
tracing: Add TRACE_ITER_PRINTK flag check in __trace_puts/__trace_bputs

Luis Henriques

Aug 18, 2014, 5:50:01 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Thomas Fitzsimmons <fit...@fitzsim.org>

commit 0a1985879437d14bda8c90d0dae3455c467d7642 upstream.

This commit fixes the command value generated for CSUM calculation
when running in big endian mode. The Ethernet protocol ID for IP was
being unconditionally byte-swapped in the layer 3 protocol check (with
swab16), which caused the mvneta driver to not function correctly in
big endian mode. This patch byte-swaps the ID conditionally with
htons.
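
For context, a minimal sketch (not from the patch) of the byte-order
reasoning; l3_proto carries the EtherType in network (big-endian) order:

	/* ETH_P_IP is 0x0800 in host byte order.
	 *
	 * Little-endian host: htons(0x0800) == swab16(0x0800) == 0x0008,
	 * so both comparisons happen to match the on-wire value.
	 * Big-endian host: htons(0x0800) == 0x0800 (a no-op) and matches,
	 * but swab16(0x0800) == 0x0008 never matches, so the driver never
	 * set MVNETA_TXD_IP_CSUM and checksum offload silently broke.
	 */
	if (l3_proto == htons(ETH_P_IP))  /* correct on either endianness */
		command |= MVNETA_TXD_IP_CSUM;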

Signed-off-by: Thomas Fitzsimmons <fit...@fitzsim.org>
Signed-off-by: David S. Miller <da...@davemloft.net>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
drivers/net/ethernet/marvell/mvneta.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c
index c762b805164c..1594c61d80de 100644
--- a/drivers/net/ethernet/marvell/mvneta.c
+++ b/drivers/net/ethernet/marvell/mvneta.c
@@ -1149,7 +1149,7 @@ static u32 mvneta_txq_desc_csum(int l3_offs, int l3_proto,
command = l3_offs << MVNETA_TX_L3_OFF_SHIFT;
command |= ip_hdr_len << MVNETA_TX_IP_HLEN_SHIFT;

- if (l3_proto == swab16(ETH_P_IP))
+ if (l3_proto == htons(ETH_P_IP))
command |= MVNETA_TXD_IP_CSUM;
else
command |= MVNETA_TX_L3_IP6;
--
1.9.1

Luis Henriques

Aug 18, 2014, 5:50:01 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Christoph Paasch <christop...@uclouvain.be>

commit 1f74e613ded11517db90b2bd57e9464d9e0fb161 upstream.

In vegas we do a multiplication of the cwnd and the rtt. This
may overflow and thus their result is stored in a u64. However, we first
need to cast the cwnd so that the arithmetic is actually done in 64 bits.

Then, we need to use do_div() so that the division also works on 32-bit arches.
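
For context, a minimal sketch (not from the patch) of both points:

	u32 snd_cwnd = 0x10000, base_rtt = 0x10000, rtt = 100;
	u64 target_cwnd;

	target_cwnd = snd_cwnd * base_rtt;	/* 32x32 multiply wraps to 0
						 * before the u64 store */
	target_cwnd = (u64)snd_cwnd * base_rtt;	/* 64-bit multiply, correct */

	/* A plain "target_cwnd / rtt" is a 64/32 division that pulls in
	 * __udivdi3 on 32-bit arches; do_div() divides in place instead: */
	do_div(target_cwnd, rtt);		/* target_cwnd /= rtt */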

Cc: Stephen Hemminger <ste...@networkplumber.org>
Cc: Neal Cardwell <ncar...@google.com>
Cc: Eric Dumazet <eric.d...@gmail.com>
Cc: David Laight <David....@ACULAB.COM>
Cc: Doug Leith <doug....@nuim.ie>
Fixes: 8d3a564da34e (tcp: tcp_vegas cong avoid fix)
Signed-off-by: Christoph Paasch <christop...@uclouvain.be>
Signed-off-by: David S. Miller <da...@davemloft.net>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
net/ipv4/tcp_vegas.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index 80fa2bfd7ede..c042e529a11e 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -218,7 +218,8 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
* This is:
* (actual rate in segments) * baseRTT
*/
- target_cwnd = tp->snd_cwnd * vegas->baseRTT / rtt;
+ target_cwnd = (u64)tp->snd_cwnd * vegas->baseRTT;
+ do_div(target_cwnd, rtt);

/* Calculate the difference between the window we had,
* and the window we would like to have. This quantity
--
1.9.1

Luis Henriques

Aug 18, 2014, 5:50:01 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Thomas Petazzoni <thomas.p...@free-electrons.com>

commit 4d12bc63ab5e48c1d78fa13883cf6fefcea3afb1 upstream.

As reported by Maggie Mae Roxas, the mvneta driver doesn't behave
properly in 10 Mbit/s mode. This is due to a misconfiguration of the
MVNETA_GMAC_AUTONEG_CONFIG register: bit MVNETA_GMAC_CONFIG_MII_SPEED
must be set for a 100 Mbit/s speed, but cleared for a 10 Mbit/s speed,
which the driver was not properly doing. This commit adjusts that by
setting the MVNETA_GMAC_CONFIG_MII_SPEED bit only in 100 Mbit/s mode,
and relying on the fact that all the speed related bits of this
register are cleared at the beginning of the mvneta_adjust_link()
function.
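
For context, the resulting three-way selection looks like this (all the
speed-related bits were cleared earlier in mvneta_adjust_link(), so
10 Mbit/s is encoded by leaving both bits unset):

	if (phydev->speed == SPEED_1000)
		val |= MVNETA_GMAC_CONFIG_GMII_SPEED;	/* 1000 Mbit/s */
	else if (phydev->speed == SPEED_100)
		val |= MVNETA_GMAC_CONFIG_MII_SPEED;	/* 100 Mbit/s */
	/* SPEED_10: both speed bits stay cleared */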

This problem exists since c5aff18204da0 ("net: mvneta: driver for
Marvell Armada 370/XP network unit") which is the commit that
introduced the mvneta driver in the kernel.

Fixes: c5aff18204da0 ("net: mvneta: driver for Marvell Armada 370/XP network unit")
Reported-by: Maggie Mae Roxas <maggie.m...@gmail.com>
Cc: Maggie Mae Roxas <maggie.m...@gmail.com>
Signed-off-by: Thomas Petazzoni <thomas.p...@free-electrons.com>
Signed-off-by: David S. Miller <da...@davemloft.net>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
drivers/net/ethernet/marvell/mvneta.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c
index b8aea9aa3566..c762b805164c 100644
--- a/drivers/net/ethernet/marvell/mvneta.c
+++ b/drivers/net/ethernet/marvell/mvneta.c
@@ -2325,7 +2325,7 @@ static void mvneta_adjust_link(struct net_device *ndev)

if (phydev->speed == SPEED_1000)
val |= MVNETA_GMAC_CONFIG_GMII_SPEED;
- else
+ else if (phydev->speed == SPEED_100)
val |= MVNETA_GMAC_CONFIG_MII_SPEED;

mvreg_write(pp, MVNETA_GMAC_AUTONEG_CONFIG, val);
--
1.9.1

Luis Henriques

Aug 18, 2014, 5:50:02 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Christoph Paasch <christop...@uclouvain.be>

commit 5924f17a8a30c2ae18d034a86ee7581b34accef6 upstream.

When in repair-mode and TCP_RECV_QUEUE is set, we end up calling
tcp_push with mss_now being 0. If data is in the send-queue and
tcp_set_skb_tso_segs gets called, we crash because it will divide by
mss_now:

[ 347.151939] divide error: 0000 [#1] SMP
[ 347.152907] Modules linked in:
[ 347.152907] CPU: 1 PID: 1123 Comm: packetdrill Not tainted 3.16.0-rc2 #4
[ 347.152907] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2007
[ 347.152907] task: f5b88540 ti: f3c82000 task.ti: f3c82000
[ 347.152907] EIP: 0060:[<c1601359>] EFLAGS: 00210246 CPU: 1
[ 347.152907] EIP is at tcp_set_skb_tso_segs+0x49/0xa0
[ 347.152907] EAX: 00000b67 EBX: f5acd080 ECX: 00000000 EDX: 00000000
[ 347.152907] ESI: f5a28f40 EDI: f3c88f00 EBP: f3c83d10 ESP: f3c83d00
[ 347.152907] DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068
[ 347.152907] CR0: 80050033 CR2: 083158b0 CR3: 35146000 CR4: 000006b0
[ 347.152907] Stack:
[ 347.152907] c167f9d9 f5acd080 000005b4 00000002 f3c83d20 c16013e6 f3c88f00 f5acd080
[ 347.152907] f3c83da0 c1603b5a f3c83d38 c10a0188 00000000 00000000 f3c83d84 c10acc85
[ 347.152907] c1ad5ec0 00000000 00000000 c1ad679c 010003e0 00000000 00000000 f3c88fc8
[ 347.152907] Call Trace:
[ 347.152907] [<c167f9d9>] ? apic_timer_interrupt+0x2d/0x34
[ 347.152907] [<c16013e6>] tcp_init_tso_segs+0x36/0x50
[ 347.152907] [<c1603b5a>] tcp_write_xmit+0x7a/0xbf0
[ 347.152907] [<c10a0188>] ? up+0x28/0x40
[ 347.152907] [<c10acc85>] ? console_unlock+0x295/0x480
[ 347.152907] [<c10ad24f>] ? vprintk_emit+0x1ef/0x4b0
[ 347.152907] [<c1605716>] __tcp_push_pending_frames+0x36/0xd0
[ 347.152907] [<c15f4860>] tcp_push+0xf0/0x120
[ 347.152907] [<c15f7641>] tcp_sendmsg+0xf1/0xbf0
[ 347.152907] [<c116d920>] ? kmem_cache_free+0xf0/0x120
[ 347.152907] [<c106a682>] ? __sigqueue_free+0x32/0x40
[ 347.152907] [<c106a682>] ? __sigqueue_free+0x32/0x40
[ 347.152907] [<c114f0f0>] ? do_wp_page+0x3e0/0x850
[ 347.152907] [<c161c36a>] inet_sendmsg+0x4a/0xb0
[ 347.152907] [<c1150269>] ? handle_mm_fault+0x709/0xfb0
[ 347.152907] [<c15a006b>] sock_aio_write+0xbb/0xd0
[ 347.152907] [<c1180b79>] do_sync_write+0x69/0xa0
[ 347.152907] [<c1181023>] vfs_write+0x123/0x160
[ 347.152907] [<c1181d55>] SyS_write+0x55/0xb0
[ 347.152907] [<c167f0d8>] sysenter_do_call+0x12/0x28

This can easily be reproduced with the following packetdrill-script (the
"magic" with netem, sk_pacing and limit_output_bytes is done to prevent
the kernel from pushing all segments, because hitting the limit without
doing this is not so easy with packetdrill):

0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0

+0 bind(3, ..., ...) = 0
+0 listen(3, 1) = 0

+0 < S 0:0(0) win 32792 <mss 1460>
+0 > S. 0:0(0) ack 1 <mss 1460>
+0.1 < . 1:1(0) ack 1 win 65000

+0 accept(3, ..., ...) = 4

// This forces that not all segments of the snd-queue will be pushed
+0 `tc qdisc add dev tun0 root netem delay 10ms`
+0 `sysctl -w net.ipv4.tcp_limit_output_bytes=2`
+0 setsockopt(4, SOL_SOCKET, 47, [2], 4) = 0

+0 write(4,...,10000) = 10000
+0 write(4,...,10000) = 10000

// Set tcp-repair stuff, particularly TCP_RECV_QUEUE
+0 setsockopt(4, SOL_TCP, 19, [1], 4) = 0
+0 setsockopt(4, SOL_TCP, 20, [1], 4) = 0

// This now will make the write push the remaining segments
+0 setsockopt(4, SOL_SOCKET, 47, [20000], 4) = 0
+0 `sysctl -w net.ipv4.tcp_limit_output_bytes=130000`

// Now we will crash
+0 write(4,...,1000) = 1000

This happens since ec3423257508 (tcp: fix retransmission in repair
mode). Prior to that, the call to tcp_push was prevented by a check for
tp->repair.

The patch fixes it by adding a new goto label, out_nopush. When exiting
tcp_sendmsg and a push is not required, which is the case for tp->repair,
we go to this label.

When repairing and calling send() with TCP_RECV_QUEUE, the data is
actually put in the receive-queue. So, no push is required because no
data has been added to the send-queue.

Cc: Andrew Vagin <ava...@openvz.org>
Cc: Pavel Emelyanov <xe...@parallels.com>
Fixes: ec3423257508 (tcp: fix retransmission in repair mode)
Signed-off-by: Christoph Paasch <christop...@uclouvain.be>
Acked-by: Andrew Vagin <ava...@openvz.org>
Acked-by: Pavel Emelyanov <xe...@parallels.com>
Signed-off-by: David S. Miller <da...@davemloft.net>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
net/ipv4/tcp.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 8e5188639d0b..1ee87b1a5126 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1068,7 +1068,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
if (unlikely(tp->repair)) {
if (tp->repair_queue == TCP_RECV_QUEUE) {
copied = tcp_send_rcvq(sk, msg, size);
- goto out;
+ goto out_nopush;
}

err = -EINVAL;
@@ -1241,6 +1241,7 @@ wait_for_memory:
out:
if (copied)
tcp_push(sk, flags, mss_now, tp->nonagle);
+out_nopush:
release_sock(sk);
return copied + copied_syn;

--
1.9.1

Luis Henriques

Aug 18, 2014, 5:50:02 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Christoph Lameter <c...@linux.com>

commit 3e374919b314f20e2a04f641ebc1093d758f66a4 upstream.

SLUB can alias multiple kmem_cache_create() requests to one slab cache to save
memory and increase cache hotness. As a result the name of the slab can be
stale. Only check the name for duplicates if we are in debug mode where we do
not merge multiple caches.

This fixes the following problem reported by Jonathan Brassow:

The problem with kmem_cache* is this:

*) Assume CONFIG_SLUB is set
1) kmem_cache_create(name="foo-a")
- creates new kmem_cache structure
2) kmem_cache_create(name="foo-b")
- If identical cache characteristics, it will be merged with the previously
created cache associated with "foo-a". The cache's refcount will be
incremented and an alias will be created via sysfs_slab_alias().
3) kmem_cache_destroy(<ptr>)
- Attempting to destroy cache associated with "foo-a", but instead the
refcount is simply decremented. I don't even think the sysfs aliases are
ever removed...
4) kmem_cache_create(name="foo-a")
- This FAILS because kmem_cache_sanity_check() collides with the existing
name ("foo-a") associated with the non-removed cache.

This is a problem for RAID (specifically dm-raid) because the name used
for the kmem_cache_create is ("raid%d-%p", level, mddev). If the cache
persists for long enough, the memory address of an old mddev will be
reused for a new mddev - causing an identical formulation of the cache
name. Even though kmem_cache_destroy() had long ago been used to delete
the old cache, the merging of caches has caused the name and cache of that
old instance to be preserved and causes a collision (and thus failure) in
kmem_cache_create(). I see this regularly in my testing.
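
To make the sequence concrete, a hedged sketch of the collision under
SLUB merging (names and sizes are illustrative only):

	struct kmem_cache *a, *b;

	a = kmem_cache_create("foo-a", 128, 0, 0, NULL);
	b = kmem_cache_create("foo-b", 128, 0, 0, NULL); /* merged with 'a' */

	kmem_cache_destroy(a);	/* only drops the refcount; the cache and
				 * the name "foo-a" live on */

	a = kmem_cache_create("foo-a", 128, 0, 0, NULL);
				/* before this patch: fails the sanity
				 * check, because "foo-a" still names the
				 * merged, never-freed cache */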

Reported-by: Jonathan Brassow <jbra...@redhat.com>
Signed-off-by: Christoph Lameter <c...@linux.com>
Signed-off-by: Pekka Enberg <pen...@kernel.org>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
mm/slab_common.c | 2 ++
1 file changed, 2 insertions(+)

diff --git a/mm/slab_common.c b/mm/slab_common.c
index 538bade6df7d..d43477196b22 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -55,6 +55,7 @@ static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name,
continue;
}

+#if !defined(CONFIG_SLUB) || !defined(CONFIG_SLUB_DEBUG_ON)
/*
* For simplicity, we won't check this in the list of memcg
* caches. We have control over memcg naming, and if there
@@ -68,6 +69,7 @@ static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name,
s = NULL;
return -EINVAL;
}
+#endif
}

WARN_ON(strchr(name, ' ')); /* It confuses parsers */
--
1.9.1

Luis Henriques

Aug 18, 2014, 5:50:02 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Vlad Yasevich <vyas...@redhat.com>

commit fcdfe3a7fa4cb74391d42b6a26dc07c20dab1d82 upstream.

When performing segmentation, the mac_len value is copied right
out of the original skb. However, this value is not always set correctly
(like when the packet is VLAN-tagged) and we'll end up copying a bad
value.

One way to demonstrate this is to configure a VM which tags
packets internally and turn off VLAN acceleration on the forwarding
bridge port. The packets show up corrupt like this:
16:18:24.985548 52:54:00:ab:be:25 > 52:54:00:26:ce:a3, ethertype 802.1Q
(0x8100), length 1518: vlan 100, p 0, ethertype 0x05e0,
0x0000: 8cdb 1c7c 8cdb 0064 4006 b59d 0a00 6402 ...|...d@.....d.
0x0010: 0a00 6401 9e0d b441 0a5e 64ec 0330 14fa ..d....A.^d..0..
0x0020: 29e3 01c9 f871 0000 0101 080a 000a e833)....q.........3
0x0030: 000f 8c75 6e65 7470 6572 6600 6e65 7470 ...unetperf.netp
0x0040: 6572 6600 6e65 7470 6572 6600 6e65 7470 erf.netperf.netp
0x0050: 6572 6600 6e65 7470 6572 6600 6e65 7470 erf.netperf.netp
0x0060: 6572 6600 6e65 7470 6572 6600 6e65 7470 erf.netperf.netp
...

This also leads to awful throughput as GSO packets are dropped and
cause retransmissions.

The solution is to set the mac_len using the values already available
in the new skb. We've already adjusted all of the header offsets, so we
might as well correctly figure out the mac_len using skb_reset_mac_len().
After this change, packets are segmented correctly and performance
is restored.
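
For reference, skb_reset_mac_len() is defined in skbuff.h roughly as:

	static inline void skb_reset_mac_len(struct sk_buff *skb)
	{
		skb->mac_len = skb->network_header - skb->mac_header;
	}

so the new mac_len is derived from the offsets skb_segment() has just set
on nskb, and an inserted VLAN header is accounted for instead of copying
a possibly stale value from the original skb.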

CC: Eric Dumazet <edum...@google.com>
Signed-off-by: Vlad Yasevich <vyas...@redhat.com>
Signed-off-by: David S. Miller <da...@davemloft.net>
[ luis: backported to 3.11: used davem's backport to 3.10 ]
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
net/core/skbuff.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 39766e4077d6..a8976de2e1a0 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -2804,7 +2804,6 @@ struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features)
tail = nskb;

__copy_skb_header(nskb, skb);
- nskb->mac_len = skb->mac_len;

/* nskb and skb might have different headroom */
if (nskb->ip_summed == CHECKSUM_PARTIAL)
@@ -2814,6 +2813,7 @@ struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features)
skb_set_network_header(nskb, skb->mac_len);
nskb->transport_header = (nskb->network_header +
skb_network_header_len(skb));
+ skb_reset_mac_len(nskb);

skb_copy_from_linear_data_offset(skb, -tnl_hlen,
nskb->data - tnl_hlen,
--
1.9.1

Luis Henriques

Aug 18, 2014, 5:50:02 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Anssi Hannula <anssi....@iki.fi>

commit 44fa816bb778edbab6b6ddaaf24908dd6295937e upstream.

nr_dirty is updated without locking, causing it to drift so that it is
non-zero (either a small positive integer, or a very large one when an
underflow occurs) even when there are no actual dirty blocks. This was
due to a race between the workqueue and map function accessing nr_dirty
in parallel without proper protection.

People were seeing underruns due to a race on increment/decrement of
nr_dirty, see: https://lkml.org/lkml/2014/6/3/648

Fix this by using an atomic_t for nr_dirty.
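
For context, a minimal sketch (not from the patch; the dm names are as
in the driver) of why the counter must be atomic; in particular the
decrement and the zero test have to be one indivisible step:

	atomic_t nr_dirty = ATOMIC_INIT(0);

	/* set_dirty() path: a plain "nr_dirty++" is a read-modify-write
	 * that two CPUs can interleave, losing an update and letting the
	 * counter drift or underflow. */
	atomic_inc(&nr_dirty);

	/* clear_dirty() path: atomic_dec_return() guarantees exactly one
	 * caller observes the transition to zero. */
	if (atomic_dec_return(&nr_dirty) == 0)
		dm_table_event(cache->ti->table);	/* fires once */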

Reported-by: roma...@gmail.com
Signed-off-by: Anssi Hannula <anssi....@iki.fi>
Signed-off-by: Joe Thornber <e...@redhat.com>
Signed-off-by: Mike Snitzer <sni...@redhat.com>
[ luis: backported to 3.11: adjusted context ]
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
drivers/md/dm-cache-target.c | 13 ++++++-------
1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index cb51c1d05417..1ab8cd335d01 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -157,7 +157,7 @@ struct cache {
/*
* cache_size entries, dirty if set
*/
- dm_cblock_t nr_dirty;
+ atomic_t nr_dirty;
unsigned long *dirty_bitset;

/*
@@ -406,7 +406,7 @@ static bool is_dirty(struct cache *cache, dm_cblock_t b)
static void set_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
{
if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) {
- cache->nr_dirty = to_cblock(from_cblock(cache->nr_dirty) + 1);
+ atomic_inc(&cache->nr_dirty);
policy_set_dirty(cache->policy, oblock);
}
}
@@ -415,8 +415,7 @@ static void clear_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cbl
{
if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) {
policy_clear_dirty(cache->policy, oblock);
- cache->nr_dirty = to_cblock(from_cblock(cache->nr_dirty) - 1);
- if (!from_cblock(cache->nr_dirty))
+ if (atomic_dec_return(&cache->nr_dirty) == 0)
dm_table_event(cache->ti->table);
}
}
@@ -2003,7 +2002,7 @@ static int cache_create(struct cache_args *ca, struct cache **result)
atomic_set(&cache->quiescing_ack, 0);

r = -ENOMEM;
- cache->nr_dirty = 0;
+ atomic_set(&cache->nr_dirty, 0);
cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size));
if (!cache->dirty_bitset) {
*error = "could not allocate dirty bitset";
@@ -2475,7 +2474,7 @@ static void cache_status(struct dm_target *ti, status_type_t type,

residency = policy_residency(cache->policy);

- DMEMIT("%llu/%llu %u %u %u %u %u %u %llu %u ",
+ DMEMIT("%llu/%llu %u %u %u %u %u %u %llu %lu ",
(unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
(unsigned long long)nr_blocks_metadata,
(unsigned) atomic_read(&cache->stats.read_hit),
@@ -2485,7 +2484,7 @@ static void cache_status(struct dm_target *ti, status_type_t type,
(unsigned) atomic_read(&cache->stats.demotion),
(unsigned) atomic_read(&cache->stats.promotion),
(unsigned long long) from_cblock(residency),
- cache->nr_dirty);
+ (unsigned long) atomic_read(&cache->nr_dirty));

if (cache->features.write_through)
DMEMIT("1 writethrough ");
--
1.9.1

Luis Henriques

Aug 18, 2014, 5:50:03 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: "David S. Miller" <da...@davemloft.net>

commit 18f38132528c3e603c66ea464727b29e9bbcb91b upstream.

The assumption was that update_mmu_cache() (and the equivalent for PMDs) would
only be called when the PTE being installed will be accessible by the user.

This is not true for code paths originating from remove_migration_pte().

There are dire consequences for placing a non-valid PTE into the TSB. The TLB
miss framework assumes that when a TSB entry matches we can just load it into
the TLB and return from the TLB miss trap.

So if a non-valid PTE is in there, we will deadlock taking the TLB miss over
and over, never satisfying the miss.

Just exit early from update_mmu_cache() and friends in this situation.

Based upon a report and patch from Christopher Alexander Tobias Schulze.

Signed-off-by: David S. Miller <da...@davemloft.net>
[ luis: backported to 3.11: used davem's backport to 3.10 and 3.12 ]
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
arch/sparc/mm/init_64.c | 4 ++++
1 file changed, 4 insertions(+)

diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index ed82edad1a39..774ba41dba4d 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -350,6 +350,10 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *

mm = vma->vm_mm;

+ /* Don't insert a non-valid PTE into the TSB, we'll deadlock. */
+ if (!pte_accessible(mm, pte))
+ return;
+
spin_lock_irqsave(&mm->context.lock, flags);

#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE)
--
1.9.1

Luis Henriques

Aug 18, 2014, 5:50:03 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: "zhangwei(Jovi)" <jovi.z...@huawei.com>

commit 8abfb8727f4a724d31f9ccfd8013fbd16d539445 upstream.

Currently the stacktrace trace option is not applied to trace_printk()
calls with a constant string argument; the reason is that the
ftrace_trace_stack() call is missing from __trace_puts()/__trace_bputs().

In contrast, when trace_printk() is used with a non-constant string
argument (which calls into __trace_printk()/__trace_bprintk()), the
stacktrace option works as expected. This inconsistent behaviour
confuses users.
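
For context, a hedged sketch of how a trace_printk() call site picks its
helper (based on the trace_printk() macro in include/linux/kernel.h):

	trace_printk("hello\n");     /* constant string, no arguments:
				      * becomes trace_puts(), i.e.
				      * __trace_puts()/__trace_bputs(),
				      * where the stack trace was missing */

	trace_printk("x = %d\n", x); /* has format arguments: goes through
				      * __trace_printk()/__trace_bprintk(),
				      * where the stacktrace option works */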

Link: http://lkml.kernel.org/p/51E7A7C9...@huawei.com

Signed-off-by: zhangwei(Jovi) <jovi.z...@huawei.com>
Signed-off-by: Steven Rostedt <ros...@goodmis.org>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
kernel/trace/trace.c | 12 ++++++++++--
1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c8b8717afc23..c75803fba4b9 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -434,6 +434,9 @@ int __trace_puts(unsigned long ip, const char *str, int size)
struct print_entry *entry;
unsigned long irq_flags;
int alloc;
+ int pc;
+
+ pc = preempt_count();

if (unlikely(tracing_selftest_running || tracing_disabled))
return 0;
@@ -443,7 +446,7 @@ int __trace_puts(unsigned long ip, const char *str, int size)
local_save_flags(irq_flags);
buffer = global_trace.trace_buffer.buffer;
event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc,
- irq_flags, preempt_count());
+ irq_flags, pc);
if (!event)
return 0;

@@ -460,6 +463,7 @@ int __trace_puts(unsigned long ip, const char *str, int size)
entry->buf[size] = '\0';

__buffer_unlock_commit(buffer, event);
+ ftrace_trace_stack(buffer, irq_flags, 4, pc);

return size;
}
@@ -477,6 +481,9 @@ int __trace_bputs(unsigned long ip, const char *str)
struct bputs_entry *entry;
unsigned long irq_flags;
int size = sizeof(struct bputs_entry);
+ int pc;
+
+ pc = preempt_count();

if (unlikely(tracing_selftest_running || tracing_disabled))
return 0;
@@ -484,7 +491,7 @@ int __trace_bputs(unsigned long ip, const char *str)
local_save_flags(irq_flags);
buffer = global_trace.trace_buffer.buffer;
event = trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size,
- irq_flags, preempt_count());
+ irq_flags, pc);
if (!event)
return 0;

@@ -493,6 +500,7 @@ int __trace_bputs(unsigned long ip, const char *str)
entry->str = str;

__buffer_unlock_commit(buffer, event);
+ ftrace_trace_stack(buffer, irq_flags, 4, pc);

return 1;
}
--
1.9.1

Luis Henriques

Aug 18, 2014, 5:50:03 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Vlad Yasevich <vyas...@redhat.com>

commit 081e83a78db9b0ae1f5eabc2dedecc865f509b98 upstream.

Macvlan devices do not initialize vlan_features. As a result,
any vlan devices configured on top of macvlans perform very poorly.
Initialize vlan_features based on the vlan features of the lower-level
device.

Signed-off-by: Vlad Yasevich <vyas...@redhat.com>
Signed-off-by: David S. Miller <da...@davemloft.net>
[ luis: backported to 3.11: adjusted context ]
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
drivers/net/macvlan.c | 1 +
1 file changed, 1 insertion(+)

diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index 123c37f4f8d5..1a9fe5776596 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -503,6 +503,7 @@ static int macvlan_init(struct net_device *dev)
(lowerdev->state & MACVLAN_STATE_MASK);
dev->features = lowerdev->features & MACVLAN_FEATURES;
dev->features |= NETIF_F_LLTX;
+ dev->vlan_features = lowerdev->vlan_features & MACVLAN_FEATURES;
dev->gso_max_size = lowerdev->gso_max_size;
dev->iflink = lowerdev->ifindex;
dev->hard_header_len = lowerdev->hard_header_len;
--
1.9.1

Luis Henriques

Aug 18, 2014, 5:50:02 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: "David S. Miller" <da...@davemloft.net>

commit d037d16372bbe4d580342bebbb8826821ad9edf0 upstream.

If we have a 32-bit task we must chop off the top 32-bits of the
64-bit value just as the cpu would.

Signed-off-by: David S. Miller <da...@davemloft.net>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
arch/sparc/kernel/unaligned_64.c | 12 +++++++++---
1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/arch/sparc/kernel/unaligned_64.c b/arch/sparc/kernel/unaligned_64.c
index 8201c25e7669..4db8898199f7 100644
--- a/arch/sparc/kernel/unaligned_64.c
+++ b/arch/sparc/kernel/unaligned_64.c
@@ -163,17 +163,23 @@ static unsigned long *fetch_reg_addr(unsigned int reg, struct pt_regs *regs)
unsigned long compute_effective_address(struct pt_regs *regs,
unsigned int insn, unsigned int rd)
{
+ int from_kernel = (regs->tstate & TSTATE_PRIV) != 0;
unsigned int rs1 = (insn >> 14) & 0x1f;
unsigned int rs2 = insn & 0x1f;
- int from_kernel = (regs->tstate & TSTATE_PRIV) != 0;
+ unsigned long addr;

if (insn & 0x2000) {
maybe_flush_windows(rs1, 0, rd, from_kernel);
- return (fetch_reg(rs1, regs) + sign_extend_imm13(insn));
+ addr = (fetch_reg(rs1, regs) + sign_extend_imm13(insn));
} else {
maybe_flush_windows(rs1, rs2, rd, from_kernel);
- return (fetch_reg(rs1, regs) + fetch_reg(rs2, regs));
+ addr = (fetch_reg(rs1, regs) + fetch_reg(rs2, regs));
}
+
+ if (!from_kernel && test_thread_flag(TIF_32BIT))
+ addr &= 0xffffffff;
+
+ return addr;
}

/* This is just to make gcc think die_if_kernel does return... */
--
1.9.1

Luis Henriques

unread,
Aug 18, 2014, 5:50:03 AM8/18/14
to
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Mateusz Guzik <mgu...@redhat.com>

commit b0ab99e7736af88b8ac1b7ae50ea287fffa2badc upstream.

proc_sched_show_task() does:

if (nr_switches)
do_div(avg_atom, nr_switches);

nr_switches is unsigned long and do_div truncates it to 32 bits, which
means it can test non-zero on e.g. x86-64 and be truncated to zero for
division.

Fix the problem by using div64_ul() instead.

As a side effect calculations of avg_atom for big nr_switches are now correct.
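
For context, a minimal sketch (not from the patch) of the truncation;
do_div() only takes a 32-bit divisor:

	u64 avg_atom = 123456789ULL;
	unsigned long nr_switches = 1UL << 32;	/* 64-bit arch: non-zero,
						 * low 32 bits all zero */

	if (nr_switches)			/* passes: non-zero */
		do_div(avg_atom, nr_switches);	/* divisor truncated to
						 * u32, i.e. 0: divide
						 * by zero */

	avg_atom = div64_ul(avg_atom, nr_switches); /* full 64-bit divisor */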

Signed-off-by: Mateusz Guzik <mgu...@redhat.com>
Signed-off-by: Peter Zijlstra <pet...@infradead.org>
Cc: Linus Torvalds <torv...@linux-foundation.org>
Link: http://lkml.kernel.org/r/1402750809-31991-1-g...@redhat.com
Signed-off-by: Ingo Molnar <mi...@kernel.org>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
kernel/sched/debug.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 631daf2612c1..8304ee67c56b 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -554,7 +554,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)

avg_atom = p->se.sum_exec_runtime;
if (nr_switches)
- do_div(avg_atom, nr_switches);
+ avg_atom = div64_ul(avg_atom, nr_switches);
else
avg_atom = -1LL;

--
1.9.1

Luis Henriques

Aug 18, 2014, 5:50:02 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Alex Deucher <alexande...@amd.com>

commit 0ac66effe7fcdee55bda6d5d10d3372c95a41920 upstream.

In some cases we fetch the edid in the detect() callback
in order to determine what sort of monitor is connected.
If that happens, don't fetch the edid again in the get_modes()
callback or we will leak the edid.

Signed-off-by: Alex Deucher <alexande...@amd.com>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
drivers/gpu/drm/radeon/radeon_display.c | 5 +++++
1 file changed, 5 insertions(+)

diff --git a/drivers/gpu/drm/radeon/radeon_display.c b/drivers/gpu/drm/radeon/radeon_display.c
index 7b9b98112545..bc166f693263 100644
--- a/drivers/gpu/drm/radeon/radeon_display.c
+++ b/drivers/gpu/drm/radeon/radeon_display.c
@@ -707,6 +707,10 @@ int radeon_ddc_get_modes(struct radeon_connector *radeon_connector)
struct radeon_device *rdev = dev->dev_private;
int ret = 0;

+ /* don't leak the edid if we already fetched it in detect() */
+ if (radeon_connector->edid)
+ goto got_edid;
+
/* on hw with routers, select right port */
if (radeon_connector->router.ddc_valid)
radeon_router_select_ddc_port(radeon_connector);
@@ -746,6 +750,7 @@ int radeon_ddc_get_modes(struct radeon_connector *radeon_connector)
radeon_connector->edid = radeon_bios_get_hardcoded_edid(rdev);
}
if (radeon_connector->edid) {
+got_edid:
drm_mode_connector_update_edid_property(&radeon_connector->base, radeon_connector->edid);
ret = drm_add_edid_modes(&radeon_connector->base, radeon_connector->edid);
drm_edid_to_eld(&radeon_connector->base, radeon_connector->edid);
--
1.9.1

Luis Henriques

Aug 18, 2014, 5:50:04 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: John David Anglin <dave....@bell.net>

commit 20dbea494543aefaace874cc3ec93a39b94b1ec4 upstream.

The sa_restorer field in struct sigaction is obsolete and no longer present in
the parisc implementation. However, the core code assumes the field is
present if SA_RESTORER is defined. So, the define needs to be removed.

Signed-off-by: John David Anglin <dave....@bell.net>
Signed-off-by: Helge Deller <del...@gmx.de>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
arch/parisc/include/uapi/asm/signal.h | 2 --
1 file changed, 2 deletions(-)

diff --git a/arch/parisc/include/uapi/asm/signal.h b/arch/parisc/include/uapi/asm/signal.h
index a2fa297196bc..f5645d6a89f2 100644
--- a/arch/parisc/include/uapi/asm/signal.h
+++ b/arch/parisc/include/uapi/asm/signal.h
@@ -69,8 +69,6 @@
#define SA_NOMASK SA_NODEFER
#define SA_ONESHOT SA_RESETHAND

-#define SA_RESTORER 0x04000000 /* obsolete -- ignored */
-
#define MINSIGSTKSZ 2048
#define SIGSTKSZ 8192

--
1.9.1

Luis Henriques

Aug 18, 2014, 5:50:03 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Romain Degez <romain...@gmail.com>

commit b32bfc06aefab61acc872dec3222624e6cd867ed upstream.

Add support for the Promise FastTrak TX8660 SATA HBA in ahci mode by
registering the board in the ahci_pci_tbl[].

Note: this HBA also provides a hardware RAID mode when activated in
BIOS but specific drivers from the manufacturer are required in this
case.

Signed-off-by: Romain Degez <romain...@gmail.com>
Tested-by: Romain Degez <romain...@gmail.com>
Signed-off-by: Tejun Heo <t...@kernel.org>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
drivers/ata/ahci.c | 1 +
1 file changed, 1 insertion(+)

diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c
index 7ce164157768..7d8ef58b4421 100644
--- a/drivers/ata/ahci.c
+++ b/drivers/ata/ahci.c
@@ -455,6 +455,7 @@ static const struct pci_device_id ahci_pci_tbl[] = {

/* Promise */
{ PCI_VDEVICE(PROMISE, 0x3f20), board_ahci }, /* PDC42819 */
+ { PCI_VDEVICE(PROMISE, 0x3781), board_ahci }, /* FastTrak TX8660 ahci-mode */

/* Asmedia */
{ PCI_VDEVICE(ASMEDIA, 0x0601), board_ahci }, /* ASM1060 */
--
1.9.1

Luis Henriques

Aug 18, 2014, 5:50:03 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Martin Lau <ka...@fb.com>

commit 97b8ee845393701edc06e27ccec2876ff9596019 upstream.

ring_buffer_poll_wait() should always put the poll_table to its wait_queue
even if there is immediate data available. Otherwise, the following epoll and
read sequence will eventually hang forever:

1. Put some data to make the trace_pipe ring_buffer read ready first
2. epoll_ctl(efd, EPOLL_CTL_ADD, trace_pipe_fd, ee)
3. epoll_wait()
4. read(trace_pipe_fd) till EAGAIN
5. Add some more data to the trace_pipe ring_buffer
6. epoll_wait() -> this epoll_wait() will block forever

~ During the epoll_ctl(efd, EPOLL_CTL_ADD,...) call in step 2,
ring_buffer_poll_wait() returns immediately without adding poll_table,
which has poll_table->_qproc pointing to ep_poll_callback(), to its
wait_queue.
~ During the epoll_wait() call in step 3 and step 6,
ring_buffer_poll_wait() cannot add ep_poll_callback() to its wait_queue
because the poll_table->_qproc is NULL and it is how epoll works.
~ When there is new data available in step 6, ring_buffer does not know
it has to call ep_poll_callback() because it is not in its wait queue.
Hence, block forever.

Other poll implementations call poll_wait() unconditionally as the very
first thing they do; see for example tcp_poll() in tcp.c.
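
For context, a hedged sketch (names are illustrative) of the convention
this fix restores: register with the wait queue first, then report
readiness:

	static DECLARE_WAIT_QUEUE_HEAD(my_waitq);

	static unsigned int my_poll(struct file *filp, poll_table *wait)
	{
		unsigned int mask = 0;

		/* Always register first: with a NULL _qproc (a plain
		 * epoll_wait()) this is a no-op, but during
		 * EPOLL_CTL_ADD it hooks ep_poll_callback() into the
		 * wait queue so later wakeups are delivered. */
		poll_wait(filp, &my_waitq, wait);

		if (data_ready())	/* illustrative readiness test */
			mask |= POLLIN | POLLRDNORM;

		return mask;
	}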

Link: http://lkml.kernel.org/p/20140610060...@devbig242.prn2.facebook.com

Fixes: 2a2cc8f7c4d0 "ftrace: allow the event pipe to be polled"
Reviewed-by: Chris Mason <c...@fb.com>
Signed-off-by: Martin Lau <ka...@fb.com>
Signed-off-by: Steven Rostedt <ros...@goodmis.org>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
kernel/trace/ring_buffer.c | 4 ----
1 file changed, 4 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 15c4ae203885..a758ec217bc0 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -616,10 +616,6 @@ int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu,
struct ring_buffer_per_cpu *cpu_buffer;
struct rb_irq_work *work;

- if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) ||
- (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu)))
- return POLLIN | POLLRDNORM;
-
if (cpu == RING_BUFFER_ALL_CPUS)
work = &buffer->irq_work;
else {
--
1.9.1

Luis Henriques

Aug 18, 2014, 5:50:03 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: "David S. Miller" <da...@davemloft.net>

commit 4ca9a23765da3260058db3431faf5b4efd8cf926 upstream.

Based almost entirely upon a patch by Christopher Alexander Tobias
Schulze.

In commit db64fe02258f1507e13fe5212a989922323685ce ("mm: rewrite vmap
layer") lazy VMAP tlb flushing was added to the vmalloc layer. This
causes problems on sparc64.

Sparc64 has two VMAP mapped regions and they are not contiguous with
each other. First we have the module mapping area, then another
unrelated region, then the vmalloc region.

This "another unrelated region" is where the firmware is mapped.

If the lazy TLB flushing logic in the vmalloc code triggers after
we've had both a module unload and a vfree or similar, it will pass an
address range that goes from somewhere inside the module region to
somewhere inside the vmalloc region, and thus covering the
openfirmware area entirely.

The sparc64 kernel learns about openfirmware's dynamic mappings in
this region early in the boot, and then services TLB misses in this
area. But openfirmware has some locked TLB entries which are not
mentioned in those dynamic mappings and we should thus not disturb
them.

These huge lazy TLB flush ranges cause those openfirmware locked TLB
entries to be removed, resulting in all kinds of problems including
hard hangs and crashes during reboot/reset.

Besides causing problems like this, such huge TLB flush ranges are
also incredibly inefficient. A plea has been made with the author of
the VMAP lazy TLB flushing code, but for now we'll put a safety guard
into our flush_tlb_kernel_range() implementation.

Since the implementation has become non-trivial, stop defining it as a
macro and instead make it a function in a C source file.

Signed-off-by: David S. Miller <da...@davemloft.net>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
arch/sparc/include/asm/tlbflush_64.h | 12 ++----------
arch/sparc/mm/init_64.c | 23 +++++++++++++++++++++++
2 files changed, 25 insertions(+), 10 deletions(-)

diff --git a/arch/sparc/include/asm/tlbflush_64.h b/arch/sparc/include/asm/tlbflush_64.h
index f0d6a9700f4c..1a4bb971e06d 100644
--- a/arch/sparc/include/asm/tlbflush_64.h
+++ b/arch/sparc/include/asm/tlbflush_64.h
@@ -35,6 +35,8 @@ static inline void flush_tlb_range(struct vm_area_struct *vma,
{
}

+void flush_tlb_kernel_range(unsigned long start, unsigned long end);
+
#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE

extern void flush_tlb_pending(void);
@@ -49,11 +51,6 @@ extern void __flush_tlb_kernel_range(unsigned long start, unsigned long end);

#ifndef CONFIG_SMP

-#define flush_tlb_kernel_range(start,end) \
-do { flush_tsb_kernel_range(start,end); \
- __flush_tlb_kernel_range(start,end); \
-} while (0)
-
static inline void global_flush_tlb_page(struct mm_struct *mm, unsigned long vaddr)
{
__flush_tlb_page(CTX_HWBITS(mm->context), vaddr);
@@ -64,11 +61,6 @@ static inline void global_flush_tlb_page(struct mm_struct *mm, unsigned long vad
extern void smp_flush_tlb_kernel_range(unsigned long start, unsigned long end);
extern void smp_flush_tlb_page(struct mm_struct *mm, unsigned long vaddr);

-#define flush_tlb_kernel_range(start, end) \
-do { flush_tsb_kernel_range(start,end); \
- smp_flush_tlb_kernel_range(start, end); \
-} while (0)
-
#define global_flush_tlb_page(mm, vaddr) \
smp_flush_tlb_page(mm, vaddr)

diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 774ba41dba4d..b26015f49c0d 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -2750,3 +2750,26 @@ void hugetlb_setup(struct pt_regs *regs)
}
}
#endif
+
+#ifdef CONFIG_SMP
+#define do_flush_tlb_kernel_range smp_flush_tlb_kernel_range
+#else
+#define do_flush_tlb_kernel_range __flush_tlb_kernel_range
+#endif
+
+void flush_tlb_kernel_range(unsigned long start, unsigned long end)
+{
+ if (start < HI_OBP_ADDRESS && end > LOW_OBP_ADDRESS) {
+ if (start < LOW_OBP_ADDRESS) {
+ flush_tsb_kernel_range(start, LOW_OBP_ADDRESS);
+ do_flush_tlb_kernel_range(start, LOW_OBP_ADDRESS);
+ }
+ if (end > HI_OBP_ADDRESS) {
+ flush_tsb_kernel_range(end, HI_OBP_ADDRESS);
+ do_flush_tlb_kernel_range(end, HI_OBP_ADDRESS);
+ }
+ } else {
+ flush_tsb_kernel_range(start, end);
+ do_flush_tlb_kernel_range(start, end);
+ }
+}
--
1.9.1

Luis Henriques

Aug 18, 2014, 5:50:04 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: HATAYAMA Daisuke <d.hat...@jp.fujitsu.com>

commit b292d7a10487aee6e74b1c18b8d95b92f40d4a4f upstream.

Currently, any NMI is falsely handled by the NMI handler of the NMI
watchdog if the CondChgd bit in the MSR_CORE_PERF_GLOBAL_STATUS MSR is set.

For example, we use an external NMI to make the system panic and get a
crash dump, but in this case the external NMI is falsely handled due to the
issue.

This commit deals with the issue simply by ignoring CondChgd bit.

Here is explanation in detail.

On x86 NMI watchdog uses performance monitoring feature to
periodically signal NMI each time performance counter gets overflowed.

intel_pmu_handle_irq() is called as a NMI_LOCAL handler from a NMI
handler of NMI watchdog, perf_event_nmi_handler(). It identifies an
owner of a given NMI by looking at overflow status bits in
MSR_CORE_PERF_GLOBAL_STATUS MSR. If some of the bits are set, then it
handles the given NMI as its own NMI.

The problem is that the intel_pmu_handle_irq() doesn't distinguish
CondChgd bit from other bits. Unlike the other status bits, CondChgd
bit doesn't represent overflow status for performance counters. Thus,
CondChgd bit cannot be thought of as a mark indicating a given NMI is
NMI watchdog's.

As a result, if CondChgd bit is set, any NMI is falsely handled by the
NMI handler of NMI watchdog. Also, if type of the falsely handled NMI
is either NMI_UNKNOWN, NMI_SERR or NMI_IO_CHECK, the corresponding
action is never performed until CondChgd bit is cleared.

I noticed this behavior on systems with Ivy Bridge processors: Intel
Xeon CPU E5-2630 v2 and Intel Xeon CPU E7-8890 v2. On both systems,
CondChgd bit in MSR_CORE_PERF_GLOBAL_STATUS MSR has already been set
in the beginning at boot. Then the CondChgd bit is immediately cleared
by next wrmsr to MSR_CORE_PERF_GLOBAL_CTRL MSR and appears to remain
0.

On the other hand, on older processors such as Nehalem, Xeon E7540,
CondChgd bit is not set in the beginning at boot.

I'm not sure about exact behavior of CondChgd bit, in particular when
this bit is set. Although I read Intel System Programmer's Manual to
figure out that, the descriptions I found are:

In 18.9.1:

"The MSR_PERF_GLOBAL_STATUS MSR also provides a ¡sticky bit¢ to
indicate changes to the state of performancmonitoring hardware"

In Table 35-2 IA-32 Architectural MSRs

63 CondChg: status bits of this register have changed.

These are different from the behaviour I see on the actual system, as I
explained above.

At least, I think ignoring the CondChgd bit should be enough from the
NMI watchdog's perspective.
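
To make the fix easier to follow, here is a sketch of how the handler
decides ownership once bit 63 is masked out (bit numbers as in the hunk
below; the layout comments are a simplification, not the authoritative
SDM definition):

/* MSR_CORE_PERF_GLOBAL_STATUS, roughly:
 *   bit 63    CondChgd -- "sticky" state-change bit, not an overflow
 *   bit 62    PEBS buffer overflow
 *   low bits  per-counter overflow status
 */
static bool nmi_belongs_to_pmu(u64 status)
{
	/* Ignore CondChgd before deciding whether to claim the NMI. */
	status &= ~(1ULL << 63);
	return status != 0;
}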

Signed-off-by: HATAYAMA Daisuke <d.hat...@jp.fujitsu.com>
Acked-by: Don Zickus <dzi...@redhat.com>
Signed-off-by: Peter Zijlstra <pet...@infradead.org>
Cc: Arnaldo Carvalho de Melo <ac...@kernel.org>
Cc: Linus Torvalds <torv...@linux-foundation.org>
Cc: linux-...@vger.kernel.org
Link: http://lkml.kernel.org/r/20140625.103503.40...@jp.fujitsu.com
Signed-off-by: Ingo Molnar <mi...@kernel.org>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
arch/x86/kernel/cpu/perf_event_intel.c | 9 +++++++++
1 file changed, 9 insertions(+)

diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index a45d8d4ace10..6eb3e3e7fc4c 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1217,6 +1217,15 @@ again:
intel_pmu_lbr_read();

/*
+ * CondChgd bit 63 doesn't mean any overflow status. Ignore
+ * and clear the bit.
+ */
+ if (__test_and_clear_bit(63, (unsigned long *)&status)) {
+ if (!status)
+ goto done;
+ }
+
+ /*
* PEBS overflow sets bit 62 in the global status register
*/
if (__test_and_clear_bit(62, (unsigned long *)&status)) {
--
1.9.1

Luis Henriques

Aug 18, 2014, 5:50:03 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Hans Verkuil <hver...@xs4all.nl>

commit 3445857b22eafb70a6ac258979e955b116bfd2c6 upstream.

When the audio encoding is changed the driver calls hdpvr_set_audio
with the current opt->audio_input value. However, that should have
been opt->audio_input + 1. So changing the audio encoding inadvertently
changes the input as well. This bug has always been there.

The second bug was introduced in kernel 3.10 and that broke the
default_audio_input module option handling: the audio encoding was
never switched to AC3 if default_audio_input was set to 2 (SPDIF input).

In addition, since the audio encoding is always set at start-up
beginning with 3.10, the first bug now always happens when the driver
is loaded.
In the past this bug would only surface if the user would change the
audio encoding after the driver was loaded.

Also fixes a small trivial typo (bufffer -> buffer).

Signed-off-by: Hans Verkuil <hans.v...@cisco.com>
Reported-by: Scott Doty <sc...@corp.sonic.net>
Signed-off-by: Mauro Carvalho Chehab <m.ch...@samsung.com>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
drivers/media/usb/hdpvr/hdpvr-video.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/media/usb/hdpvr/hdpvr-video.c b/drivers/media/usb/hdpvr/hdpvr-video.c
index 4f8567aa99d8..068e9c5bfa7f 100644
--- a/drivers/media/usb/hdpvr/hdpvr-video.c
+++ b/drivers/media/usb/hdpvr/hdpvr-video.c
@@ -81,7 +81,7 @@ static void hdpvr_read_bulk_callback(struct urb *urb)
}

/*=========================================================================*/
-/* bufffer bits */
+/* buffer bits */

/* function expects dev->io_mutex to be hold by caller */
int hdpvr_cancel_queue(struct hdpvr_device *dev)
@@ -927,7 +927,7 @@ static int hdpvr_s_ctrl(struct v4l2_ctrl *ctrl)
case V4L2_CID_MPEG_AUDIO_ENCODING:
if (dev->flags & HDPVR_FLAG_AC3_CAP) {
opt->audio_codec = ctrl->val;
- return hdpvr_set_audio(dev, opt->audio_input,
+ return hdpvr_set_audio(dev, opt->audio_input + 1,
opt->audio_codec);
}
return 0;
@@ -1199,7 +1199,7 @@ int hdpvr_register_videodev(struct hdpvr_device *dev, struct device *parent,
v4l2_ctrl_new_std_menu(hdl, &hdpvr_ctrl_ops,
V4L2_CID_MPEG_AUDIO_ENCODING,
ac3 ? V4L2_MPEG_AUDIO_ENCODING_AC3 : V4L2_MPEG_AUDIO_ENCODING_AAC,
- 0x7, V4L2_MPEG_AUDIO_ENCODING_AAC);
+ 0x7, ac3 ? dev->options.audio_codec : V4L2_MPEG_AUDIO_ENCODING_AAC);
v4l2_ctrl_new_std_menu(hdl, &hdpvr_ctrl_ops,
V4L2_CID_MPEG_VIDEO_ENCODING,
V4L2_MPEG_VIDEO_ENCODING_MPEG_4_AVC, 0x3,
--
1.9.1

Luis Henriques

Aug 18, 2014, 5:50:04 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Hugh Dickins <hu...@google.com>

commit f00cdc6df7d7cfcabb5b740911e6788cb0802bdb upstream.

Trinity finds that mmap access to a hole while it's punched from shmem
can prevent the madvise(MADV_REMOVE) or fallocate(FALLOC_FL_PUNCH_HOLE)
from completing, until the reader chooses to stop; with the puncher's
hold on i_mutex locking out all other writers until it can complete.

It appears that the tmpfs fault path is too light in comparison with its
hole-punching path, lacking an i_data_sem to obstruct it; but we don't
want to slow down the common case.

Extend shmem_fallocate()'s existing range notification mechanism, so
shmem_fault() can refrain from faulting pages into the hole while it's
punched, waiting instead on i_mutex (when safe to sleep; or repeatedly
faulting when not).
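
A user-space sketch of the race Trinity exercised (hypothetical setup;
madvise(MADV_REMOVE) reaches the same hole-punch path):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdlib.h>

/* Thread A: punch a hole in a tmpfs file. With the fix, this now
 * completes even while thread B keeps faulting the same range. */
static void punch(int fd, off_t len)
{
	fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, len);
}

/* Thread B: repeatedly fault pages of an mmap() of the same range.
 * Before the fix this could stall the puncher indefinitely while it
 * held i_mutex, locking out all other writers. */
static void fault_range(volatile char *map, size_t len)
{
	for (size_t off = 0; off < len; off += 4096)
		(void)map[off];
}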

[ak...@linux-foundation.org: coding-style fixes]
Signed-off-by: Hugh Dickins <hu...@google.com>
Reported-by: Sasha Levin <sasha...@oracle.com>
Tested-by: Sasha Levin <sasha...@oracle.com>
Cc: Dave Jones <da...@redhat.com>
Signed-off-by: Andrew Morton <ak...@linux-foundation.org>
Signed-off-by: Linus Torvalds <torv...@linux-foundation.org>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
mm/shmem.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++----
1 file changed, 52 insertions(+), 4 deletions(-)

diff --git a/mm/shmem.c b/mm/shmem.c
index e43dc555069d..d530cde82494 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -80,11 +80,12 @@ static struct vfsmount *shm_mnt;
#define SHORT_SYMLINK_LEN 128

/*
- * shmem_fallocate and shmem_writepage communicate via inode->i_private
- * (with i_mutex making sure that it has only one user at a time):
- * we would prefer not to enlarge the shmem inode just for that.
+ * shmem_fallocate communicates with shmem_fault or shmem_writepage via
+ * inode->i_private (with i_mutex making sure that it has only one user at
+ * a time): we would prefer not to enlarge the shmem inode just for that.
*/
struct shmem_falloc {
+ int mode; /* FALLOC_FL mode currently operating */
pgoff_t start; /* start of range currently being fallocated */
pgoff_t next; /* the next page offset to be fallocated */
pgoff_t nr_falloced; /* how many new pages have been fallocated */
@@ -826,6 +827,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
spin_lock(&inode->i_lock);
shmem_falloc = inode->i_private;
if (shmem_falloc &&
+ !shmem_falloc->mode &&
index >= shmem_falloc->start &&
index < shmem_falloc->next)
shmem_falloc->nr_unswapped++;
@@ -1300,6 +1302,44 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
int error;
int ret = VM_FAULT_LOCKED;

+ /*
+ * Trinity finds that probing a hole which tmpfs is punching can
+ * prevent the hole-punch from ever completing: which in turn
+ * locks writers out with its hold on i_mutex. So refrain from
+ * faulting pages into the hole while it's being punched, and
+ * wait on i_mutex to be released if vmf->flags permits.
+ */
+ if (unlikely(inode->i_private)) {
+ struct shmem_falloc *shmem_falloc;
+
+ spin_lock(&inode->i_lock);
+ shmem_falloc = inode->i_private;
+ if (!shmem_falloc ||
+ shmem_falloc->mode != FALLOC_FL_PUNCH_HOLE ||
+ vmf->pgoff < shmem_falloc->start ||
+ vmf->pgoff >= shmem_falloc->next)
+ shmem_falloc = NULL;
+ spin_unlock(&inode->i_lock);
+ /*
+ * i_lock has protected us from taking shmem_falloc seriously
+ * once return from shmem_fallocate() went back up that stack.
+ * i_lock does not serialize with i_mutex at all, but it does
+ * not matter if sometimes we wait unnecessarily, or sometimes
+ * miss out on waiting: we just need to make those cases rare.
+ */
+ if (shmem_falloc) {
+ if ((vmf->flags & FAULT_FLAG_ALLOW_RETRY) &&
+ !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) {
+ up_read(&vma->vm_mm->mmap_sem);
+ mutex_lock(&inode->i_mutex);
+ mutex_unlock(&inode->i_mutex);
+ return VM_FAULT_RETRY;
+ }
+ /* cond_resched? Leave that to GUP or return to user */
+ return VM_FAULT_NOPAGE;
+ }
+ }
+
error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret);
if (error)
return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
@@ -1815,18 +1855,26 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,

mutex_lock(&inode->i_mutex);

+ shmem_falloc.mode = mode & ~FALLOC_FL_KEEP_SIZE;
+
if (mode & FALLOC_FL_PUNCH_HOLE) {
struct address_space *mapping = file->f_mapping;
loff_t unmap_start = round_up(offset, PAGE_SIZE);
loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1;

+ shmem_falloc.start = unmap_start >> PAGE_SHIFT;
+ shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT;
+ spin_lock(&inode->i_lock);
+ inode->i_private = &shmem_falloc;
+ spin_unlock(&inode->i_lock);
+
if ((u64)unmap_end > (u64)unmap_start)
unmap_mapping_range(mapping, unmap_start,
1 + unmap_end - unmap_start, 0);
shmem_truncate_range(inode, offset, offset + len - 1);
/* No need to unmap again: hole-punching leaves COWed pages */
error = 0;
- goto out;
+ goto undone;
}

/* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
--
1.9.1

Luis Henriques

Aug 18, 2014, 5:50:04 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: David Rientjes <rien...@google.com>

commit b104a35d32025ca740539db2808aa3385d0f30eb upstream.

The page allocator relies on __GFP_WAIT to determine if ALLOC_CPUSET
should be set in allocflags. ALLOC_CPUSET controls if a page allocation
should be restricted only to the set of allowed cpuset mems.

Transparent hugepages clears __GFP_WAIT when defrag is disabled to prevent
the fault path from using memory compaction or direct reclaim. Thus, it
is unfairly able to allocate outside of its cpuset mems restriction as a
side-effect.

This patch ensures that ALLOC_CPUSET is only cleared when the gfp mask is
truly GFP_ATOMIC by verifying it is also not a thp allocation.
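
In other words, the fixed test only treats a request as atomic when
neither flag is present; a sketch of the resulting classification (flag
names as in the hunk below):

/* GFP_ATOMIC:             no __GFP_WAIT, no __GFP_NO_KSWAPD
 *                         -> atomic, cpuset mems ignored
 * THP fault, defrag off:  __GFP_WAIT cleared but __GFP_NO_KSWAPD set
 *                         -> not atomic, cpuset mems enforced
 */
static bool gfp_is_truly_atomic(gfp_t gfp_mask)
{
	return !(gfp_mask & (__GFP_WAIT | __GFP_NO_KSWAPD));
}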

Signed-off-by: David Rientjes <rien...@google.com>
Reported-by: Alex Thorlton <atho...@sgi.com>
Tested-by: Alex Thorlton <atho...@sgi.com>
Cc: Bob Liu <lli...@gmail.com>
Cc: Dave Hansen <dave....@linux.intel.com>
Cc: Hedi Berriche <he...@sgi.com>
Cc: Hugh Dickins <hu...@google.com>
Cc: Johannes Weiner <han...@cmpxchg.org>
Cc: Kirill A. Shutemov <kirill....@linux.intel.com>
Cc: Mel Gorman <mgo...@suse.de>
Cc: Rik van Riel <ri...@redhat.com>
Cc: Srivatsa S. Bhat <srivat...@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <ak...@linux-foundation.org>
Signed-off-by: Linus Torvalds <torv...@linux-foundation.org>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
mm/page_alloc.c | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1b5e9fb49ed4..04dc7a35e299 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2350,7 +2350,7 @@ static inline int
gfp_to_alloc_flags(gfp_t gfp_mask)
{
int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
- const gfp_t wait = gfp_mask & __GFP_WAIT;
+ const bool atomic = !(gfp_mask & (__GFP_WAIT | __GFP_NO_KSWAPD));

/* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
@@ -2359,20 +2359,20 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
* The caller may dip into page reserves a bit more if the caller
* cannot run direct reclaim, or if the caller has realtime scheduling
* policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
- * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
+ * set both ALLOC_HARDER (atomic == true) and ALLOC_HIGH (__GFP_HIGH).
*/
alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);

- if (!wait) {
+ if (atomic) {
/*
- * Not worth trying to allocate harder for
- * __GFP_NOMEMALLOC even if it can't schedule.
+ * Not worth trying to allocate harder for __GFP_NOMEMALLOC even
+ * if it can't schedule.
*/
- if (!(gfp_mask & __GFP_NOMEMALLOC))
+ if (!(gfp_mask & __GFP_NOMEMALLOC))
alloc_flags |= ALLOC_HARDER;
/*
- * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
- * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
+ * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the
+ * comment for __cpuset_node_allowed_softwall().
*/
alloc_flags &= ~ALLOC_CPUSET;
} else if (unlikely(rt_task(current)) && !in_interrupt())
--
1.9.1

Luis Henriques

Aug 18, 2014, 5:50:04 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Andrey Utkin <andrey.kri...@gmail.com>

commit 093758e3daede29cb4ce6aedb111becf9d4bfc57 upstream.

This commit is guesswork, but it seems to make sense to drop this
break, as otherwise the following line is never executed and becomes
dead code. And that following line actually stores the result of the
local calculation through the pointer given as a function argument. So
the proposed change makes sense if this code as a whole makes sense
(but I am unable to analyze it as a whole).

Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=81641
Reported-by: David Binderman <dcb...@hotmail.com>
Signed-off-by: Andrey Utkin <andrey.kri...@gmail.com>
Signed-off-by: David S. Miller <da...@davemloft.net>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
arch/sparc/math-emu/math_32.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/sparc/math-emu/math_32.c b/arch/sparc/math-emu/math_32.c
index aa4d55b0bdf0..5ce8f2f64604 100644
--- a/arch/sparc/math-emu/math_32.c
+++ b/arch/sparc/math-emu/math_32.c
@@ -499,7 +499,7 @@ static int do_one_mathemu(u32 insn, unsigned long *pfsr, unsigned long *fregs)
case 0: fsr = *pfsr;
if (IR == -1) IR = 2;
/* fcc is always fcc0 */
- fsr &= ~0xc00; fsr |= (IR << 10); break;
+ fsr &= ~0xc00; fsr |= (IR << 10);
*pfsr = fsr;
break;
case 1: rd->s = IR; break;
--
1.9.1

Luis Henriques

Aug 18, 2014, 5:50:04 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Dmitry Kravkov <Dmitry....@qlogic.com>

commit fe26566d8a05151ba1dce75081f6270f73ec4ae1 upstream.

When a TSO packet is transmitted, an additional BD without a mapping
is used to describe the packet. This BD needs special handling in tx
completion.

kernel: Call Trace:
kernel: <IRQ> [<ffffffff815e19ba>] dump_stack+0x19/0x1b
kernel: [<ffffffff8105dee1>] warn_slowpath_common+0x61/0x80
kernel: [<ffffffff8105df5c>] warn_slowpath_fmt+0x5c/0x80
kernel: [<ffffffff814a8c0d>] ? find_iova+0x4d/0x90
kernel: [<ffffffff814ab0e2>] intel_unmap_page.part.36+0x142/0x160
kernel: [<ffffffff814ad0e6>] intel_unmap_page+0x26/0x30
kernel: [<ffffffffa01f55d7>] bnx2x_free_tx_pkt+0x157/0x2b0 [bnx2x]
kernel: [<ffffffffa01f8dac>] bnx2x_tx_int+0xac/0x220 [bnx2x]
kernel: [<ffffffff8101a0d9>] ? read_tsc+0x9/0x20
kernel: [<ffffffffa01f8fdb>] bnx2x_poll+0xbb/0x3c0 [bnx2x]
kernel: [<ffffffff814d041a>] net_rx_action+0x15a/0x250
kernel: [<ffffffff81067047>] __do_softirq+0xf7/0x290
kernel: [<ffffffff815f3a5c>] call_softirq+0x1c/0x30
kernel: [<ffffffff81014d25>] do_softirq+0x55/0x90
kernel: [<ffffffff810673e5>] irq_exit+0x115/0x120
kernel: [<ffffffff815f4358>] do_IRQ+0x58/0xf0
kernel: [<ffffffff815e94ad>] common_interrupt+0x6d/0x6d
kernel: <EOI> [<ffffffff810bbff7>] ? clockevents_notify+0x127/0x140
kernel: [<ffffffff814834df>] ? cpuidle_enter_state+0x4f/0xc0
kernel: [<ffffffff81483615>] cpuidle_idle_call+0xc5/0x200
kernel: [<ffffffff8101bc7e>] arch_cpu_idle+0xe/0x30
kernel: [<ffffffff810b4725>] cpu_startup_entry+0xf5/0x290
kernel: [<ffffffff815cfee1>] start_secondary+0x265/0x27b
kernel: ---[ end trace 11aa7726f18d7e80 ]---

Fixes: a848ade408b ("bnx2x: add CSUM and TSO support for encapsulation protocols")
Reported-by: Yulong Pei <yp...@redhat.com>
Cc: Michal Schmidt <msch...@redhat.com>
Signed-off-by: Dmitry Kravkov <Dmitry....@qlogic.com>
Signed-off-by: David S. Miller <da...@davemloft.net>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
drivers/net/ethernet/broadcom/bnx2x/bnx2x.h | 1 +
drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c | 9 +++++++++
2 files changed, 10 insertions(+)

diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
index 00b88cbfde25..7ae7bdba4183 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
@@ -308,6 +308,7 @@ struct sw_tx_bd {
u8 flags;
/* Set on the first BD descriptor when there is a split BD */
#define BNX2X_TSO_SPLIT_BD (1<<0)
+#define BNX2X_HAS_SECOND_PBD (1<<1)
};

struct sw_rx_page {
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
index 5fedcb6d7ec0..66b4e8be804b 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
@@ -186,6 +186,12 @@ static u16 bnx2x_free_tx_pkt(struct bnx2x *bp, struct bnx2x_fp_txdata *txdata,
--nbd;
bd_idx = TX_BD(NEXT_TX_IDX(bd_idx));

+ if (tx_buf->flags & BNX2X_HAS_SECOND_PBD) {
+ /* Skip second parse bd... */
+ --nbd;
+ bd_idx = TX_BD(NEXT_TX_IDX(bd_idx));
+ }
+
/* TSO headers+data bds share a common mapping. See bnx2x_tx_split() */
if (tx_buf->flags & BNX2X_TSO_SPLIT_BD) {
tx_data_bd = &txdata->tx_desc_ring[bd_idx].reg_bd;
@@ -3812,6 +3818,9 @@ netdev_tx_t bnx2x_start_xmit(struct sk_buff *skb, struct net_device *dev)
/* set encapsulation flag in start BD */
SET_FLAG(tx_start_bd->general_data,
ETH_TX_START_BD_TUNNEL_EXIST, 1);
+
+ tx_buf->flags |= BNX2X_HAS_SECOND_PBD;
+
nbd++;
} else if (xmit_type & XMIT_CSUM) {
/* Set PBD in checksum offload case w/o encapsulation */
--
1.9.1

Luis Henriques

Aug 18, 2014, 5:50:05 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Eric Dumazet <edum...@google.com>

commit 757efd32d5ce31f67193cc0e6a56e4dffcc42fb1 upstream.

Dave reported following splat, caused by improper use of
IP_INC_STATS_BH() in process context.

BUG: using __this_cpu_add() in preemptible [00000000] code: trinity-c117/14551
caller is __this_cpu_preempt_check+0x13/0x20
CPU: 3 PID: 14551 Comm: trinity-c117 Not tainted 3.16.0+ #33
ffffffff9ec898f0 0000000047ea7e23 ffff88022d32f7f0 ffffffff9e7ee207
0000000000000003 ffff88022d32f818 ffffffff9e397eaa ffff88023ee70b40
ffff88022d32f970 ffff8801c026d580 ffff88022d32f828 ffffffff9e397ee3
Call Trace:
[<ffffffff9e7ee207>] dump_stack+0x4e/0x7a
[<ffffffff9e397eaa>] check_preemption_disabled+0xfa/0x100
[<ffffffff9e397ee3>] __this_cpu_preempt_check+0x13/0x20
[<ffffffffc0839872>] sctp_packet_transmit+0x692/0x710 [sctp]
[<ffffffffc082a7f2>] sctp_outq_flush+0x2a2/0xc30 [sctp]
[<ffffffff9e0d985c>] ? mark_held_locks+0x7c/0xb0
[<ffffffff9e7f8c6d>] ? _raw_spin_unlock_irqrestore+0x5d/0x80
[<ffffffffc082b99a>] sctp_outq_uncork+0x1a/0x20 [sctp]
[<ffffffffc081e112>] sctp_cmd_interpreter.isra.23+0x1142/0x13f0 [sctp]
[<ffffffffc081c86b>] sctp_do_sm+0xdb/0x330 [sctp]
[<ffffffff9e0b8f1b>] ? preempt_count_sub+0xab/0x100
[<ffffffffc083b350>] ? sctp_cname+0x70/0x70 [sctp]
[<ffffffffc08389ca>] sctp_primitive_ASSOCIATE+0x3a/0x50 [sctp]
[<ffffffffc083358f>] sctp_sendmsg+0x88f/0xe30 [sctp]
[<ffffffff9e0d673a>] ? lock_release_holdtime.part.28+0x9a/0x160
[<ffffffff9e0d62ce>] ? put_lock_stats.isra.27+0xe/0x30
[<ffffffff9e73b624>] inet_sendmsg+0x104/0x220
[<ffffffff9e73b525>] ? inet_sendmsg+0x5/0x220
[<ffffffff9e68ac4e>] sock_sendmsg+0x9e/0xe0
[<ffffffff9e1c0c09>] ? might_fault+0xb9/0xc0
[<ffffffff9e1c0bae>] ? might_fault+0x5e/0xc0
[<ffffffff9e68b234>] SYSC_sendto+0x124/0x1c0
[<ffffffff9e0136b0>] ? syscall_trace_enter+0x250/0x330
[<ffffffff9e68c3ce>] SyS_sendto+0xe/0x10
[<ffffffff9e7f9be4>] tracesys+0xdd/0xe2

This is a followup of commits f1d8cba61c3c4b ("inet: fix possible
seqlock deadlocks") and 7f88c6b23afbd315 ("ipv6: fix possible seqlock
deadlock in ip6_finish_output2")
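
For context, the _BH statistics variants use the cheaper per-cpu
increment that is only safe where bottom halves are already disabled,
while the plain variants are safe from process context as well. A rough
sketch of the contract (not the real macro bodies):

/* Only from softirq/BH context (preemption already off): */
IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);

/* Safe from process context too, as in sctp_packet_transmit(): */
IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);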

Signed-off-by: Eric Dumazet <edum...@google.com>
Cc: Hannes Frederic Sowa <han...@stressinduktion.org>
Reported-by: Dave Jones <da...@redhat.com>
Acked-by: Neil Horman <nho...@tuxdriver.com>
Acked-by: Hannes Frederic Sowa <han...@stressinduktion.org>
Signed-off-by: David S. Miller <da...@davemloft.net>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
net/sctp/output.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/sctp/output.c b/net/sctp/output.c
index 013a07d9c454..543fd836f439 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -612,7 +612,7 @@ out:
return err;
no_route:
kfree_skb(nskb);
- IP_INC_STATS_BH(sock_net(asoc->base.sk), IPSTATS_MIB_OUTNOROUTES);
+ IP_INC_STATS(sock_net(asoc->base.sk), IPSTATS_MIB_OUTNOROUTES);

/* FIXME: Returning the 'err' will effect all the associations
* associated with a socket, although only one of the paths of the
--
1.9.1

Luis Henriques

Aug 18, 2014, 5:50:05 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: James Bottomley <JBott...@Parallels.com>

commit 89fb4cd1f717a871ef79fa7debbe840e3225cd54 upstream.

Flush commands don't transfer data and thus need to be special cased
in the I/O completion handler so that we can propagate errors to
the block layer and filesystem.
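
For context, the commands in question carry no data payload, so
blk_rq_bytes() is 0 and the usual "good_bytes != blk_rq_bytes(req)"
test can never signal failure. A sketch of an affected caller (the
error-handling name is hypothetical):

/* A filesystem flushing a device for integrity: */
int err = blkdev_issue_flush(bdev, GFP_KERNEL, NULL);
if (err)
	handle_flush_error(err);	/* reachable again with this fix */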

Signed-off-by: James Bottomley <JBott...@Parallels.com>
Reported-by: Steven Haber <ste...@qumulo.com>
Tested-by: Steven Haber <ste...@qumulo.com>
Reviewed-by: Martin K. Petersen <martin....@oracle.com>
Signed-off-by: Christoph Hellwig <h...@lst.de>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
drivers/scsi/scsi_lib.c | 8 ++++++++
1 file changed, 8 insertions(+)

diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 124392f3091e..96e537a8604d 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -831,6 +831,14 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
scsi_next_command(cmd);
return;
}
+ } else if (blk_rq_bytes(req) == 0 && result && !sense_deferred) {
+ /*
+ * Certain non BLOCK_PC requests are commands that don't
+ * actually transfer anything (FLUSH), so cannot use
+ * good_bytes != blk_rq_bytes(req) as the signal for an error.
+ * This sets the error explicitly for the problem case.
+ */
+ error = __scsi_error_from_host_byte(cmd, result);
}

/* no bidi support for !REQ_TYPE_BLOCK_PC yet */
--
1.9.1

Luis Henriques

Aug 18, 2014, 5:50:06 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Bernd Wachter <bernd....@jolla.com>

commit 8dcb4b1526747d8431f9895e153dd478c9d16186 upstream.

There's a new version of the Telewell 4G modem that works with, but is
not recognized by, this driver.

Signed-off-by: Bernd Wachter <bernd....@jolla.com>
Acked-by: Bjørn Mork <bj...@mork.no>
Signed-off-by: David S. Miller <da...@davemloft.net>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
drivers/net/usb/qmi_wwan.c | 1 +
1 file changed, 1 insertion(+)

diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c
index 6fb0082b3308..bf2e5c19b9be 100644
--- a/drivers/net/usb/qmi_wwan.c
+++ b/drivers/net/usb/qmi_wwan.c
@@ -721,6 +721,7 @@ static const struct usb_device_id products[] = {
{QMI_FIXED_INTF(0x19d2, 0x1424, 2)},
{QMI_FIXED_INTF(0x19d2, 0x1425, 2)},
{QMI_FIXED_INTF(0x19d2, 0x1426, 2)}, /* ZTE MF91 */
+ {QMI_FIXED_INTF(0x19d2, 0x1428, 2)}, /* Telewell TW-LTE 4G v2 */
{QMI_FIXED_INTF(0x19d2, 0x2002, 4)}, /* ZTE (Vodafone) K3765-Z */
{QMI_FIXED_INTF(0x0f3d, 0x68a2, 8)}, /* Sierra Wireless MC7700 */
{QMI_FIXED_INTF(0x114f, 0x68a2, 8)}, /* Sierra Wireless MC7750 */

Luis Henriques

Aug 18, 2014, 5:50:05 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Christopher Alexander Tobias Schulze <cat.s...@alice-dsl.net>

commit 5cdceab3d5e02eb69ea0f5d8fa9181800baf6f77 upstream.

Fix regression in bbc i2c temperature and fan control on some Sun systems
that causes the driver to refuse to load due to the bbc_i2c_bussel resource not
being present on the (second) i2c bus where the temperature sensors and fan
control are located. (The check for the number of resources was removed when
the driver was ported to a pure OF driver in mid 2008.)

Signed-off-by: Christopher Alexander Tobias Schulze <cat.s...@alice-dsl.net>
Signed-off-by: David S. Miller <da...@davemloft.net>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
drivers/sbus/char/bbc_envctrl.c | 6 ++++++
drivers/sbus/char/bbc_i2c.c | 11 ++++++++---
2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/drivers/sbus/char/bbc_envctrl.c b/drivers/sbus/char/bbc_envctrl.c
index 160e7510aca6..0787b9756165 100644
--- a/drivers/sbus/char/bbc_envctrl.c
+++ b/drivers/sbus/char/bbc_envctrl.c
@@ -452,6 +452,9 @@ static void attach_one_temp(struct bbc_i2c_bus *bp, struct platform_device *op,
if (!tp)
return;

+ INIT_LIST_HEAD(&tp->bp_list);
+ INIT_LIST_HEAD(&tp->glob_list);
+
tp->client = bbc_i2c_attach(bp, op);
if (!tp->client) {
kfree(tp);
@@ -497,6 +500,9 @@ static void attach_one_fan(struct bbc_i2c_bus *bp, struct platform_device *op,
if (!fp)
return;

+ INIT_LIST_HEAD(&fp->bp_list);
+ INIT_LIST_HEAD(&fp->glob_list);
+
fp->client = bbc_i2c_attach(bp, op);
if (!fp->client) {
kfree(fp);
diff --git a/drivers/sbus/char/bbc_i2c.c b/drivers/sbus/char/bbc_i2c.c
index c1441ed282eb..e0e6cd605cca 100644
--- a/drivers/sbus/char/bbc_i2c.c
+++ b/drivers/sbus/char/bbc_i2c.c
@@ -301,13 +301,18 @@ static struct bbc_i2c_bus * attach_one_i2c(struct platform_device *op, int index
if (!bp)
return NULL;

+ INIT_LIST_HEAD(&bp->temps);
+ INIT_LIST_HEAD(&bp->fans);
+
bp->i2c_control_regs = of_ioremap(&op->resource[0], 0, 0x2, "bbc_i2c_regs");
if (!bp->i2c_control_regs)
goto fail;

- bp->i2c_bussel_reg = of_ioremap(&op->resource[1], 0, 0x1, "bbc_i2c_bussel");
- if (!bp->i2c_bussel_reg)
- goto fail;
+ if (op->num_resources == 2) {
+ bp->i2c_bussel_reg = of_ioremap(&op->resource[1], 0, 0x1, "bbc_i2c_bussel");
+ if (!bp->i2c_bussel_reg)
+ goto fail;
+ }

bp->waiting = 0;
init_waitqueue_head(&bp->wq);

Luis Henriques

Aug 18, 2014, 5:50:06 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Tejun Heo <t...@kernel.org>

commit 1a112d10f03e83fb3a2fdc4c9165865dec8a3ca6 upstream.

1871ee134b73 ("libata: support the ata host which implements a queue
depth less than 32") directly used ata_port->scsi_host->can_queue from
ata_qc_new() to determine the number of tags supported by the host;
unfortunately, SAS controllers doing SATA don't initialize ->scsi_host
leading to the following oops.

BUG: unable to handle kernel NULL pointer dereference at 0000000000000058
IP: [<ffffffff814e0618>] ata_qc_new_init+0x188/0x1b0
PGD 0
Oops: 0002 [#1] SMP
Modules linked in: isci libsas scsi_transport_sas mgag200 drm_kms_helper ttm
CPU: 1 PID: 518 Comm: udevd Not tainted 3.16.0-rc6+ #62
Hardware name: Intel Corporation S2600CO/S2600CO, BIOS SE5C600.86B.02.02.0002.122320131210 12/23/2013
task: ffff880c1a00b280 ti: ffff88061a000000 task.ti: ffff88061a000000
RIP: 0010:[<ffffffff814e0618>] [<ffffffff814e0618>] ata_qc_new_init+0x188/0x1b0
RSP: 0018:ffff88061a003ae8 EFLAGS: 00010012
RAX: 0000000000000001 RBX: ffff88000241ca80 RCX: 00000000000000fa
RDX: 0000000000000020 RSI: 0000000000000020 RDI: ffff8806194aa298
RBP: ffff88061a003ae8 R08: ffff8806194a8000 R09: 0000000000000000
R10: 0000000000000000 R11: ffff88000241ca80 R12: ffff88061ad58200
R13: ffff8806194aa298 R14: ffffffff814e67a0 R15: ffff8806194a8000
FS: 00007f3ad7fe3840(0000) GS:ffff880627620000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000058 CR3: 000000061a118000 CR4: 00000000001407e0
Stack:
ffff88061a003b20 ffffffff814e96e1 ffff88000241ca80 ffff88061ad58200
ffff8800b6bf6000 ffff880c1c988000 ffff880619903850 ffff88061a003b68
ffffffffa0056ce1 ffff88061a003b48 0000000013d6e6f8 ffff88000241ca80
Call Trace:
[<ffffffff814e96e1>] ata_sas_queuecmd+0xa1/0x430
[<ffffffffa0056ce1>] sas_queuecommand+0x191/0x220 [libsas]
[<ffffffff8149afee>] scsi_dispatch_cmd+0x10e/0x300
[<ffffffff814a3bc5>] scsi_request_fn+0x2f5/0x550
[<ffffffff81317613>] __blk_run_queue+0x33/0x40
[<ffffffff8131781a>] queue_unplugged+0x2a/0x90
[<ffffffff8131ceb4>] blk_flush_plug_list+0x1b4/0x210
[<ffffffff8131d274>] blk_finish_plug+0x14/0x50
[<ffffffff8117eaa8>] __do_page_cache_readahead+0x198/0x1f0
[<ffffffff8117ee21>] force_page_cache_readahead+0x31/0x50
[<ffffffff8117ee7e>] page_cache_sync_readahead+0x3e/0x50
[<ffffffff81172ac6>] generic_file_read_iter+0x496/0x5a0
[<ffffffff81219897>] blkdev_read_iter+0x37/0x40
[<ffffffff811e307e>] new_sync_read+0x7e/0xb0
[<ffffffff811e3734>] vfs_read+0x94/0x170
[<ffffffff811e43c6>] SyS_read+0x46/0xb0
[<ffffffff811e33d1>] ? SyS_lseek+0x91/0xb0
[<ffffffff8171ee29>] system_call_fastpath+0x16/0x1b
Code: 00 00 00 88 50 29 83 7f 08 01 19 d2 83 e2 f0 83 ea 50 88 50 34 c6 81 1d 02 00 00 40 c6 81 17 02 00 00 00 5d c3 66 0f 1f 44 00 00 <89> 14 25 58 00 00 00

Fix it by introducing ata_host->n_tags which is initialized to
ATA_MAX_QUEUE - 1 in ata_host_init() for SAS controllers and set to
scsi_host_template->can_queue in ata_host_register() for !SAS ones.
As SAS hosts are never registered, this will give them the same
ATA_MAX_QUEUE - 1 as before. Note that we can't use
scsi_host->can_queue directly for SAS hosts anyway as they can go
higher than the libata maximum.

Signed-off-by: Tejun Heo <t...@kernel.org>
Reported-by: Mike Qiu <qiu...@linux.vnet.ibm.com>
Reported-by: Jesse Brandeburg <jesse.br...@gmail.com>
Reported-by: Peter Hurley <pe...@hurleysoftware.com>
Reported-by: Peter Zijlstra <pet...@infradead.org>
Tested-by: Alexey Kardashevskiy <a...@ozlabs.ru>
Fixes: 1871ee134b73 ("libata: support the ata host which implements a queue depth less than 32")
Cc: Kevin Hao <haok...@gmail.com>
Cc: Dan Williams <dan.j.w...@intel.com>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
drivers/ata/libata-core.c | 16 ++++------------
include/linux/libata.h | 1 +
2 files changed, 5 insertions(+), 12 deletions(-)

diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index b53d81fc9dba..6a97fc151b8c 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -4769,9 +4769,8 @@ void swap_buf_le16(u16 *buf, unsigned int buf_words)
static struct ata_queued_cmd *ata_qc_new(struct ata_port *ap)
{
struct ata_queued_cmd *qc = NULL;
- unsigned int i, tag, max_queue;
-
- max_queue = ap->scsi_host->can_queue;
+ unsigned int max_queue = ap->host->n_tags;
+ unsigned int i, tag;

/* no command while frozen */
if (unlikely(ap->pflags & ATA_PFLAG_FROZEN))
@@ -6080,6 +6079,7 @@ void ata_host_init(struct ata_host *host, struct device *dev,
{
spin_lock_init(&host->lock);
mutex_init(&host->eh_mutex);
+ host->n_tags = ATA_MAX_QUEUE - 1;
host->dev = dev;
host->ops = ops;
}
@@ -6161,15 +6161,7 @@ int ata_host_register(struct ata_host *host, struct scsi_host_template *sht)
{
int i, rc;

- /*
- * The max queue supported by hardware must not be greater than
- * ATA_MAX_QUEUE.
- */
- if (sht->can_queue > ATA_MAX_QUEUE) {
- dev_err(host->dev, "BUG: the hardware max queue is too large\n");
- WARN_ON(1);
- return -EINVAL;
- }
+ host->n_tags = clamp(sht->can_queue, 1, ATA_MAX_QUEUE - 1);

/* host must have been started */
if (!(host->flags & ATA_HOST_STARTED)) {
diff --git a/include/linux/libata.h b/include/linux/libata.h
index d0c64ab7375b..9740884fca1b 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -547,6 +547,7 @@ struct ata_host {
struct device *dev;
void __iomem * const *iomap;
unsigned int n_ports;
+ unsigned int n_tags; /* nr of NCQ tags */
void *private_data;
struct ata_port_operations *ops;
unsigned long flags;

Luis Henriques

Aug 18, 2014, 5:50:06 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Konstantin Khlebnikov <k.khle...@samsung.com>

commit 811a2407a3cf7bbd027fbe92d73416f17485a3d8 upstream.

On LPAE, each level 1 (pgd) page table entry maps 1GiB, and the level 2
(pmd) entries map 2MiB.

When the identity mapping is created on LPAE, the pgd pointers are copied
from the swapper_pg_dir. If we find that we need to modify the contents
of a pmd, we allocate a new empty pmd table and insert it into the
appropriate 1GB slot, before then filling it with the identity mapping.

However, if the 1GB slot covers the kernel lowmem mappings, we obliterate
those mappings.

When replacing a PMD, first copy the old PMD contents to the new PMD, so
that we preserve the existing mappings, particularly the mappings of the
kernel itself.

[rewrote commit message and added code comment -- rmk]

Fixes: ae2de101739c ("ARM: LPAE: Add identity mapping support for the 3-level page table format")
Signed-off-by: Konstantin Khlebnikov <k.khle...@samsung.com>
Signed-off-by: Russell King <rmk+k...@arm.linux.org.uk>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
arch/arm/mm/idmap.c | 7 +++++++
1 file changed, 7 insertions(+)

diff --git a/arch/arm/mm/idmap.c b/arch/arm/mm/idmap.c
index 83cb3ac27095..c61d2373408c 100644
--- a/arch/arm/mm/idmap.c
+++ b/arch/arm/mm/idmap.c
@@ -24,6 +24,13 @@ static void idmap_add_pmd(pud_t *pud, unsigned long addr, unsigned long end,
pr_warning("Failed to allocate identity pmd.\n");
return;
}
+ /*
+ * Copy the original PMD to ensure that the PMD entries for
+ * the kernel image are preserved.
+ */
+ if (!pud_none(*pud))
+ memcpy(pmd, pmd_offset(pud, 0),
+ PTRS_PER_PMD * sizeof(pmd_t));
pud_populate(&init_mm, pud, pmd);
pmd += pmd_index(addr);
} else

Luis Henriques

Aug 18, 2014, 5:50:06 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Kirill Tkhai <tk...@yandex.ru>

commit 49b6c01f4c1de3b5e5427ac5aba80f9f6d27837a upstream.

One more place where we must not be able
to be preempted or to be interrupted in RT.

Always actually disable interrupts during
synchronization cycle.
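
For context: on PREEMPT_RT a plain spinlock_t becomes a sleeping lock
that leaves interrupts enabled, so a section that genuinely needs
interrupts off must use the raw variant. A sketch of the pattern being
applied (hypothetical names):

static DEFINE_RAW_SPINLOCK(sync_lock);

static void sync_section(void)
{
	unsigned long flags;

	/* Truly disables interrupts, even on PREEMPT_RT where
	 * spin_lock_irqsave() may sleep instead. */
	raw_spin_lock_irqsave(&sync_lock, flags);
	/* ... tick synchronization cycle runs atomically ... */
	raw_spin_unlock_irqrestore(&sync_lock, flags);
}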

Signed-off-by: Kirill Tkhai <tk...@yandex.ru>
Signed-off-by: David S. Miller <da...@davemloft.net>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
arch/sparc/kernel/smp_64.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c
index e142545244f2..643bf38ed619 100644
--- a/arch/sparc/kernel/smp_64.c
+++ b/arch/sparc/kernel/smp_64.c
@@ -150,7 +150,7 @@ void cpu_panic(void)
#define NUM_ROUNDS 64 /* magic value */
#define NUM_ITERS 5 /* likewise */

-static DEFINE_SPINLOCK(itc_sync_lock);
+static DEFINE_RAW_SPINLOCK(itc_sync_lock);
static unsigned long go[SLAVE + 1];

#define DEBUG_TICK_SYNC 0
@@ -258,7 +258,7 @@ static void smp_synchronize_one_tick(int cpu)
go[MASTER] = 0;
membar_safe("#StoreLoad");

- spin_lock_irqsave(&itc_sync_lock, flags);
+ raw_spin_lock_irqsave(&itc_sync_lock, flags);
{
for (i = 0; i < NUM_ROUNDS*NUM_ITERS; i++) {
while (!go[MASTER])
@@ -269,7 +269,7 @@ static void smp_synchronize_one_tick(int cpu)
membar_safe("#StoreLoad");
}
}
- spin_unlock_irqrestore(&itc_sync_lock, flags);
+ raw_spin_unlock_irqrestore(&itc_sync_lock, flags);
}

#if defined(CONFIG_SUN_LDOMS) && defined(CONFIG_HOTPLUG_CPU)

Luis Henriques

Aug 18, 2014, 5:50:06 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Christopher Alexander Tobias Schulze <cat.s...@alice-dsl.net>

commit fe418231b195c205701c0cc550a03f6c9758fd9e upstream.

Fix detection of BREAK on sunsab serial console: BREAK detection was only
performed when there were also serial characters received simultaneously.
To handle all BREAKs correctly, the check for BREAK and the corresponding
call to uart_handle_break() must also be done if count == 0, therefore
duplicate this code fragment and pull it out of the loop over the received
characters.

Patch applies to 3.16-rc6.

Signed-off-by: Christopher Alexander Tobias Schulze <cat.s...@alice-dsl.net>
Signed-off-by: David S. Miller <da...@davemloft.net>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
drivers/tty/serial/sunsab.c | 9 +++++++++
1 file changed, 9 insertions(+)

diff --git a/drivers/tty/serial/sunsab.c b/drivers/tty/serial/sunsab.c
index 5d6136b2a04a..edcb84980d60 100644
--- a/drivers/tty/serial/sunsab.c
+++ b/drivers/tty/serial/sunsab.c
@@ -157,6 +157,15 @@ receive_chars(struct uart_sunsab_port *up,
(up->port.line == up->port.cons->index))
saw_console_brk = 1;

+ if (count == 0) {
+ if (unlikely(stat->sreg.isr1 & SAB82532_ISR1_BRK)) {
+ stat->sreg.isr0 &= ~(SAB82532_ISR0_PERR |
+ SAB82532_ISR0_FERR);
+ up->port.icount.brk++;
+ uart_handle_break(&up->port);
+ }
+ }
+
for (i = 0; i < count; i++) {
unsigned char ch = buf[i], flag;

Luis Henriques

Aug 18, 2014, 5:50:04 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Li RongQing <roy.q...@gmail.com>

commit 916c1689a09bc1ca81f2d7a34876f8d35aadd11b upstream.

skb_cow(), as called from vlan_reorder_header(), does not free the skb
when it fails, and vlan_reorder_header() returns NULL to reset the
original skb when it is called from vlan_untag(); this leads to a
memory leak.
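
The ownership rule being restored: a helper that can fail while holding
the skb must free it before returning NULL, so the caller treats NULL
as "skb consumed" and never leaks or double-frees. A sketch of that
convention (hypothetical helper name):

static struct sk_buff *helper_owning_skb(struct sk_buff *skb)
{
	if (skb_cow(skb, skb_headroom(skb)) < 0) {
		kfree_skb(skb);	/* we own it: free on failure */
		return NULL;	/* caller must not touch skb again */
	}
	return skb;
}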

Signed-off-by: Li RongQing <roy.q...@gmail.com>
Acked-by: Eric Dumazet <edum...@google.com>
Signed-off-by: David S. Miller <da...@davemloft.net>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
net/8021q/vlan_core.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c
index 6ee48aac776f..7e57135c7cc4 100644
--- a/net/8021q/vlan_core.c
+++ b/net/8021q/vlan_core.c
@@ -108,8 +108,11 @@ EXPORT_SYMBOL(vlan_dev_vlan_id);

static struct sk_buff *vlan_reorder_header(struct sk_buff *skb)
{
- if (skb_cow(skb, skb_headroom(skb)) < 0)
+ if (skb_cow(skb, skb_headroom(skb)) < 0) {
+ kfree_skb(skb);
return NULL;
+ }
+
memmove(skb->data - ETH_HLEN, skb->data - VLAN_ETH_HLEN, 2 * ETH_ALEN);
skb->mac_header += VLAN_HLEN;
return skb;

Luis Henriques

Aug 18, 2014, 5:50:04 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Minfei Huang <huang...@ucloud.cn>

commit c75b53af2f0043aff500af0a6f878497bef41bca upstream.

I use btree from 3.14-rc2 in my own module. When the btree module is
removed, a warning arises:

kmem_cache_destroy btree_node: Slab cache still has objects
CPU: 13 PID: 9150 Comm: rmmod Tainted: GF O 3.14.0-rc2 #1
Hardware name: Inspur NF5270M3/NF5270M3, BIOS CHEETAH_2.1.3 09/10/2013
Call Trace:
dump_stack+0x49/0x5d
kmem_cache_destroy+0xcf/0xe0
btree_module_exit+0x10/0x12 [btree]
SyS_delete_module+0x198/0x1f0
system_call_fastpath+0x16/0x1b

The cause is that it doesn't release the last btree node, when height = 1
and fill = 1.
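
A sketch of the module usage that exposes the leak (demo names are
hypothetical; API per lib/btree.h):

#include <linux/btree.h>
#include <linux/module.h>

static struct btree_head head;
static int value;

static int __init demo_init(void)
{
	unsigned long key = 1;

	if (btree_init(&head))
		return -ENOMEM;
	/* After one insert, head.node holds the root node. */
	return btree_insert(&head, &btree_geo64, &key, &value, GFP_KERNEL);
}

static void __exit demo_exit(void)
{
	/* Without the fix, this freed the mempool but not the root
	 * still held in head.node, so the slab cache complained on
	 * module unload. */
	btree_destroy(&head);
}

module_init(demo_init);
module_exit(demo_exit);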

[ak...@linux-foundation.org: remove unneeded test of NULL]
Signed-off-by: Minfei Huang <huang...@ucloud.cn>
Cc: Joern Engel <jo...@logfs.org>
Cc: Johannes Berg <joha...@sipsolutions.net>
Signed-off-by: Andrew Morton <ak...@linux-foundation.org>
Signed-off-by: Linus Torvalds <torv...@linux-foundation.org>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
lib/btree.c | 1 +
1 file changed, 1 insertion(+)

diff --git a/lib/btree.c b/lib/btree.c
index f9a484676cb6..4264871ea1a0 100644
--- a/lib/btree.c
+++ b/lib/btree.c
@@ -198,6 +198,7 @@ EXPORT_SYMBOL_GPL(btree_init);

void btree_destroy(struct btree_head *head)
{
+ mempool_free(head->node, head->mempool);
mempool_destroy(head->mempool);
head->mempool = NULL;

Luis Henriques

Aug 18, 2014, 5:50:04 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: "H. Peter Anvin" <h...@zytor.com>

commit 34273f41d57ee8d854dcd2a1d754cbb546cb548f upstream.

Embedded systems, which may be very memory-size-sensitive, are
extremely unlikely to ever encounter any 16-bit software, so make it
a CONFIG_EXPERT option to turn off support for any 16-bit software
whatsoever.
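
With X86_16BIT disabled, the write_ldt() hunk below makes the kernel
reject 16-bit descriptors outright. A user-space probe might look like
this (sketch; modify_ldt has no glibc wrapper):

#include <asm/ldt.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static long try_16bit_segment(void)
{
	struct user_desc desc;

	memset(&desc, 0, sizeof(desc));
	desc.entry_number = 0;
	desc.limit = 0xffff;
	desc.seg_32bit = 0;	/* request a 16-bit segment */

	/* Fails with -EINVAL on kernels built without X86_16BIT. */
	return syscall(SYS_modify_ldt, 1, &desc, sizeof(desc));
}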

Signed-off-by: H. Peter Anvin <h...@zytor.com>
Link: http://lkml.kernel.org/r/1398816946-3351-1-...@linux.intel.com
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
arch/x86/Kconfig | 23 ++++++++++++++++++-----
arch/x86/kernel/entry_32.S | 12 ++++++++++++
arch/x86/kernel/entry_64.S | 8 ++++++++
arch/x86/kernel/ldt.c | 5 +++++
4 files changed, 43 insertions(+), 5 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index ba5fdd09c61e..69d336e6964a 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -962,14 +962,27 @@ config VM86
default y
depends on X86_32
---help---
- This option is required by programs like DOSEMU to run 16-bit legacy
- code on X86 processors. It also may be needed by software like
- XFree86 to initialize some video cards via BIOS. Disabling this
- option saves about 6k.
+ This option is required by programs like DOSEMU to run
+ 16-bit real mode legacy code on x86 processors. It also may
+ be needed by software like XFree86 to initialize some video
+ cards via BIOS. Disabling this option saves about 6K.
+
+config X86_16BIT
+ bool "Enable support for 16-bit segments" if EXPERT
+ default y
+ ---help---
+ This option is required by programs like Wine to run 16-bit
+ protected mode legacy code on x86 processors. Disabling
+ this option saves about 300 bytes on i386, or around 6K text
+ plus 16K runtime memory on x86-64,
+
+config X86_ESPFIX32
+ def_bool y
+ depends on X86_16BIT && X86_32

config X86_ESPFIX64
def_bool y
- depends on X86_64
+ depends on X86_16BIT && X86_64

config TOSHIBA
tristate "Toshiba Laptop support"
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 7b5e08fc505e..8d37c0cf5f1a 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -532,6 +532,7 @@ syscall_exit:
restore_all:
TRACE_IRQS_IRET
restore_all_notrace:
+#ifdef CONFIG_X86_ESPFIX32
movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
# Warning: PT_OLDSS(%esp) contains the wrong/random values if we
# are returning to the kernel.
@@ -542,6 +543,7 @@ restore_all_notrace:
cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
CFI_REMEMBER_STATE
je ldt_ss # returning to user-space with LDT SS
+#endif
restore_nocheck:
RESTORE_REGS 4 # skip orig_eax/error_code
irq_return:
@@ -554,6 +556,7 @@ ENTRY(iret_exc)
.previous
_ASM_EXTABLE(irq_return,iret_exc)

+#ifdef CONFIG_X86_ESPFIX32
CFI_RESTORE_STATE
ldt_ss:
#ifdef CONFIG_PARAVIRT
@@ -597,6 +600,7 @@ ldt_ss:
lss (%esp), %esp /* switch to espfix segment */
CFI_ADJUST_CFA_OFFSET -8
jmp restore_nocheck
+#endif
CFI_ENDPROC
ENDPROC(system_call)

@@ -709,6 +713,7 @@ END(syscall_badsys)
* the high word of the segment base from the GDT and swiches to the
* normal stack and adjusts ESP with the matching offset.
*/
+#ifdef CONFIG_X86_ESPFIX32
/* fixup the stack */
mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */
mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */
@@ -718,8 +723,10 @@ END(syscall_badsys)
pushl_cfi %eax
lss (%esp), %esp /* switch to the normal stack segment */
CFI_ADJUST_CFA_OFFSET -8
+#endif
.endm
.macro UNWIND_ESPFIX_STACK
+#ifdef CONFIG_X86_ESPFIX32
movl %ss, %eax
/* see if on espfix stack */
cmpw $__ESPFIX_SS, %ax
@@ -730,6 +737,7 @@ END(syscall_badsys)
/* switch to normal stack */
FIXUP_ESPFIX_STACK
27:
+#endif
.endm

/*
@@ -1347,11 +1355,13 @@ END(debug)
ENTRY(nmi)
RING0_INT_FRAME
ASM_CLAC
+#ifdef CONFIG_X86_ESPFIX32
pushl_cfi %eax
movl %ss, %eax
cmpw $__ESPFIX_SS, %ax
popl_cfi %eax
je nmi_espfix_stack
+#endif
cmpl $ia32_sysenter_target,(%esp)
je nmi_stack_fixup
pushl_cfi %eax
@@ -1391,6 +1401,7 @@ nmi_debug_stack_check:
FIX_STACK 24, nmi_stack_correct, 1
jmp nmi_stack_correct

+#ifdef CONFIG_X86_ESPFIX32
nmi_espfix_stack:
/* We have a RING0_INT_FRAME here.
*
@@ -1412,6 +1423,7 @@ nmi_espfix_stack:
lss 12+4(%esp), %esp # back to espfix stack
CFI_ADJUST_CFA_OFFSET -24
jmp irq_return
+#endif
CFI_ENDPROC
END(nmi)

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index b7b03a91d713..01cbb4eb9179 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1060,8 +1060,10 @@ irq_return:
* Are we returning to a stack segment from the LDT? Note: in
* 64-bit mode SS:RSP on the exception stack is always valid.
*/
+#ifdef CONFIG_X86_ESPFIX64
testb $4,(SS-RIP)(%rsp)
jnz irq_return_ldt
+#endif

irq_return_iret:
INTERRUPT_RETURN
@@ -1073,6 +1075,7 @@ ENTRY(native_iret)
_ASM_EXTABLE(native_iret, bad_iret)
#endif

+#ifdef CONFIG_X86_ESPFIX64
irq_return_ldt:
pushq_cfi %rax
pushq_cfi %rdi
@@ -1096,6 +1099,7 @@ irq_return_ldt:
movq %rax,%rsp
popq_cfi %rax
jmp irq_return_iret
+#endif

.section .fixup,"ax"
bad_iret:
@@ -1169,6 +1173,7 @@ END(common_interrupt)
* modify the stack to make it look like we just entered
* the #GP handler from user space, similar to bad_iret.
*/
+#ifdef CONFIG_X86_ESPFIX64
ALIGN
__do_double_fault:
XCPT_FRAME 1 RDI+8
@@ -1194,6 +1199,9 @@ __do_double_fault:
retq
CFI_ENDPROC
END(__do_double_fault)
+#else
+# define __do_double_fault do_double_fault
+#endif

/*
* End of kprobes section
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index ebc987398923..c37886d759cc 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -229,6 +229,11 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
}
}

+ if (!IS_ENABLED(CONFIG_X86_16BIT) && !ldt_info.seg_32bit) {
+ error = -EINVAL;
+ goto out_unlock;
+ }
+
fill_ldt(&ldt, &ldt_info);
if (oldmode)
ldt.avl = 0;

Luis Henriques

Aug 18, 2014, 5:50:05 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Christoph Paasch <christop...@uclouvain.be>

commit 45a07695bc64b3ab5d6d2215f9677e5b8c05a7d0 upstream.

In veno we do a multiplication of the cwnd and the rtt. This
may overflow and thus their result is stored in a u64. However, we first
need to cast the cwnd so that actually 64-bit arithmetic is done.
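
The overflow is easy to see with concrete numbers (sketch):

u32 snd_cwnd = 100000;
u32 basertt  = 50000;

u64 wrong = snd_cwnd * basertt;       /* 32-bit multiply wraps: 705032704 */
u64 right = (u64)snd_cwnd * basertt;  /* widened first, exact: 5000000000 */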

A first attempt at fixing 76f1017757aa0 ([TCP]: TCP Veno congestion
control) was made by 159131149c2 (tcp: Overflow bug in Vegas), but it
failed to add the required cast in tcp_veno_cong_avoid().

Fixes: 76f1017757aa0 ([TCP]: TCP Veno congestion control)
Signed-off-by: Christoph Paasch <christop...@uclouvain.be>
Signed-off-by: David S. Miller <da...@davemloft.net>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
net/ipv4/tcp_veno.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
index ac43cd747bce..b4d1858be550 100644
--- a/net/ipv4/tcp_veno.c
+++ b/net/ipv4/tcp_veno.c
@@ -144,7 +144,7 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)

rtt = veno->minrtt;

- target_cwnd = (tp->snd_cwnd * veno->basertt);
+ target_cwnd = (u64)tp->snd_cwnd * veno->basertt;
target_cwnd <<= V_PARAM_SHIFT;
do_div(target_cwnd, rtt);

Luis Henriques

Aug 18, 2014, 5:50:04 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Sasha Levin <sasha...@oracle.com>

commit 3cf521f7dc87c031617fd47e4b7aa2593c2f3daf upstream.

The l2tp [get|set]sockopt() code has fallen back to the UDP functions
for socket option levels != SOL_PPPOL2TP since day one, but that has
never actually worked, since the l2tp socket isn't an inet socket.

As David Miller points out:

"If we wanted this to work, it'd have to look up the tunnel and then
use tunnel->sk, but I wonder how useful that would be"

Since this can never have worked, nobody could possibly have depended
on that functionality; just remove the broken code and return -EINVAL.
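
Seen from user space, the change looks like this (sketch; fd is assumed
to be a connected PPPoL2TP socket, and SOL_SOCKET options are handled
by the socket core before reaching these handlers):

#include <netinet/in.h>
#include <stdio.h>
#include <sys/socket.h>

int ttl = 64;

/* A non-SOL_PPPOL2TP level that reaches the pppol2tp handlers now
 * fails cleanly instead of falling into UDP code that never worked: */
if (setsockopt(fd, SOL_IP, IP_TTL, &ttl, sizeof(ttl)) < 0)
	perror("setsockopt");	/* errno == EINVAL */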

Reported-by: Sasha Levin <sasha...@oracle.com>
Acked-by: James Chapman <jcha...@katalix.com>
Acked-by: David Miller <da...@davemloft.net>
Cc: Phil Turnbull <phil.t...@oracle.com>
Cc: Vegard Nossum <vegard...@oracle.com>
Cc: Willy Tarreau <w...@1wt.eu>
Signed-off-by: Linus Torvalds <torv...@linux-foundation.org>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
net/l2tp/l2tp_ppp.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
index 9a0e5874e73e..164fa9dcd97d 100644
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -1365,7 +1365,7 @@ static int pppol2tp_setsockopt(struct socket *sock, int level, int optname,
int err;

if (level != SOL_PPPOL2TP)
- return udp_prot.setsockopt(sk, level, optname, optval, optlen);
+ return -EINVAL;

if (optlen < sizeof(int))
return -EINVAL;
@@ -1491,7 +1491,7 @@ static int pppol2tp_getsockopt(struct socket *sock, int level, int optname,
struct pppol2tp_session *ps;

if (level != SOL_PPPOL2TP)
- return udp_prot.getsockopt(sk, level, optname, optval, optlen);
+ return -EINVAL;

if (get_user(len, optlen))
return -EFAULT;

Luis Henriques

Aug 18, 2014, 5:50:05 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: willy tarreau <w...@1wt.eu>

commit 290213667ab53a95456397763205e4b1e30f46b5 upstream.

If a queue timeout is reported, we can oops because of some
schedules while the caller is atomic, as shown below:

mvneta d0070000.ethernet eth0: tx timeout
BUG: scheduling while atomic: bash/1528/0x00000100
Modules linked in: slhttp_ethdiv(C) [last unloaded: slhttp_ethdiv]
CPU: 2 PID: 1528 Comm: bash Tainted: G WC 3.13.0-rc4-mvebu-nf #180
[<c0011bd9>] (unwind_backtrace+0x1/0x98) from [<c000f1ab>] (show_stack+0xb/0xc)
[<c000f1ab>] (show_stack+0xb/0xc) from [<c02ad323>] (dump_stack+0x4f/0x64)
[<c02ad323>] (dump_stack+0x4f/0x64) from [<c02abe67>] (__schedule_bug+0x37/0x4c)
[<c02abe67>] (__schedule_bug+0x37/0x4c) from [<c02ae261>] (__schedule+0x325/0x3ec)
[<c02ae261>] (__schedule+0x325/0x3ec) from [<c02adb97>] (schedule_timeout+0xb7/0x118)
[<c02adb97>] (schedule_timeout+0xb7/0x118) from [<c0020a67>] (msleep+0xf/0x14)
[<c0020a67>] (msleep+0xf/0x14) from [<c01dcbe5>] (mvneta_stop_dev+0x21/0x194)
[<c01dcbe5>] (mvneta_stop_dev+0x21/0x194) from [<c01dcfe9>] (mvneta_tx_timeout+0x19/0x24)
[<c01dcfe9>] (mvneta_tx_timeout+0x19/0x24) from [<c024afc7>] (dev_watchdog+0x18b/0x1c4)
[<c024afc7>] (dev_watchdog+0x18b/0x1c4) from [<c0020b53>] (call_timer_fn.isra.27+0x17/0x5c)
[<c0020b53>] (call_timer_fn.isra.27+0x17/0x5c) from [<c0020cad>] (run_timer_softirq+0x115/0x170)
[<c0020cad>] (run_timer_softirq+0x115/0x170) from [<c001ccb9>] (__do_softirq+0xbd/0x1a8)
[<c001ccb9>] (__do_softirq+0xbd/0x1a8) from [<c001cfad>] (irq_exit+0x61/0x98)
[<c001cfad>] (irq_exit+0x61/0x98) from [<c000d4bf>] (handle_IRQ+0x27/0x60)
[<c000d4bf>] (handle_IRQ+0x27/0x60) from [<c000843b>] (armada_370_xp_handle_irq+0x33/0xc8)
[<c000843b>] (armada_370_xp_handle_irq+0x33/0xc8) from [<c000fba9>] (__irq_usr+0x49/0x60)

Ben Hutchings attempted to propose a better fix consisting in using a
scheduled work for this, but while it fixed this panic, it caused other
random freezes and panics proving that the reset sequence in the driver
is unreliable and that additional fixes should be investigated.

When sending multiple streams over a link limited to 100 Mbps, Tx timeouts
happen from time to time, and the driver correctly recovers only when the
function is disabled.

Cc: Thomas Petazzoni <thomas.p...@free-electrons.com>
Cc: Gregory CLEMENT <gregory...@free-electrons.com>
Cc: Ben Hutchings <b...@decadent.org.uk>
Tested-by: Arnaud Ebalard <ar...@natisbad.org>
Signed-off-by: Willy Tarreau <w...@1wt.eu>
Signed-off-by: David S. Miller <da...@davemloft.net>
Signed-off-by: Willy Tarreau <w...@1wt.eu>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
drivers/net/ethernet/marvell/mvneta.c | 11 -----------
1 file changed, 11 deletions(-)

diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c
index 18cee4f1590e..f72112294750 100644
--- a/drivers/net/ethernet/marvell/mvneta.c
+++ b/drivers/net/ethernet/marvell/mvneta.c
@@ -2211,16 +2211,6 @@ static void mvneta_stop_dev(struct mvneta_port *pp)
mvneta_rx_reset(pp);
}

-/* tx timeout callback - display a message and stop/start the network device */
-static void mvneta_tx_timeout(struct net_device *dev)
-{
- struct mvneta_port *pp = netdev_priv(dev);
-
- netdev_info(dev, "tx timeout\n");
- mvneta_stop_dev(pp);
- mvneta_start_dev(pp);
-}
-
/* Return positive if MTU is valid */
static int mvneta_check_mtu_valid(struct net_device *dev, int mtu)
{
@@ -2586,7 +2576,6 @@ static const struct net_device_ops mvneta_netdev_ops = {
.ndo_set_rx_mode = mvneta_set_rx_mode,
.ndo_set_mac_address = mvneta_set_mac_addr,
.ndo_change_mtu = mvneta_change_mtu,
- .ndo_tx_timeout = mvneta_tx_timeout,
.ndo_get_stats64 = mvneta_get_stats64,

Luis Henriques

Aug 18, 2014, 5:50:05 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Mikulas Patocka <mpat...@redhat.com>

commit 3b3a1814d1703027f9867d0f5cbbfaf6c7482474 upstream.

This patch provides the compat BLKZEROOUT ioctl. The argument is a pointer
to two uint64_t values, so there is no need to translate it.

Signed-off-by: Mikulas Patocka <mpat...@redhat.com>
Acked-by: Martin K. Petersen <martin....@oracle.com>
Signed-off-by: Jens Axboe <ax...@fb.com>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
block/compat_ioctl.c | 1 +
1 file changed, 1 insertion(+)

diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c
index 7e5d474dc6ba..351ba4f9d4eb 100644
--- a/block/compat_ioctl.c
+++ b/block/compat_ioctl.c
@@ -690,6 +690,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
case BLKROSET:
case BLKDISCARD:
case BLKSECDISCARD:
+ case BLKZEROOUT:
/*
* the ones below are implemented in blkdev_locked_ioctl,
* but we call blkdev_ioctl, which gets the lock for us

Luis Henriques

Aug 18, 2014, 5:50:06 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: "David S. Miller" <da...@davemloft.net>

commit b18eb2d779240631a098626cb6841ee2dd34fda0 upstream.

Access to the TSB hash tables during TLB misses requires that there be
an atomic 128-bit quad load available so that we fetch a matching TAG
and DATA field at the same time.

On cpus prior to UltraSPARC-III only virtual address based quad loads
are available. UltraSPARC-III and later provide physical address
based variants which are easier to use.

When we only have virtual address based quad loads available this
means that we have to lock the TSB into the TLB at a fixed virtual
address on each cpu when it runs that process. We can't just access
the PAGE_OFFSET based aliased mapping of these TSBs because we cannot
take a recursive TLB miss inside of the TLB miss handler without
risking running out of hardware trap levels (some trap combinations
can be deep, such as those generated by register window spill and fill
traps).

Without huge pages it's working perfectly fine, but when the huge TSB
got added another chunk of fixed virtual address space was not
allocated for this second TSB mapping.

So we were mapping both the 8K and 4MB TSBs to the same exact virtual
address, causing multiple TLB matches which gives undefined behavior.

Signed-off-by: David S. Miller <da...@davemloft.net>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
arch/sparc/include/asm/pgtable_64.h | 6 ++++--
arch/sparc/mm/tsb.c | 14 +++++++++++++-
2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
index 90f289f0ec8e..32bb9c4b2182 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -24,7 +24,8 @@

/* The kernel image occupies 0x4000000 to 0x6000000 (4MB --> 96MB).
* The page copy blockops can use 0x6000000 to 0x8000000.
- * The TSB is mapped in the 0x8000000 to 0xa000000 range.
+ * The 8K TSB is mapped in the 0x8000000 to 0x8400000 range.
+ * The 4M TSB is mapped in the 0x8400000 to 0x8800000 range.
* The PROM resides in an area spanning 0xf0000000 to 0x100000000.
* The vmalloc area spans 0x100000000 to 0x200000000.
* Since modules need to be in the lowest 32-bits of the address space,
@@ -33,7 +34,8 @@
* 0x400000000.
*/
#define TLBTEMP_BASE _AC(0x0000000006000000,UL)
-#define TSBMAP_BASE _AC(0x0000000008000000,UL)
+#define TSBMAP_8K_BASE _AC(0x0000000008000000,UL)
+#define TSBMAP_4M_BASE _AC(0x0000000008400000,UL)
#define MODULES_VADDR _AC(0x0000000010000000,UL)
#define MODULES_LEN _AC(0x00000000e0000000,UL)
#define MODULES_END _AC(0x00000000f0000000,UL)
diff --git a/arch/sparc/mm/tsb.c b/arch/sparc/mm/tsb.c
index 2cc3bce5ee91..71d99a6c75a7 100644
--- a/arch/sparc/mm/tsb.c
+++ b/arch/sparc/mm/tsb.c
@@ -133,7 +133,19 @@ static void setup_tsb_params(struct mm_struct *mm, unsigned long tsb_idx, unsign
mm->context.tsb_block[tsb_idx].tsb_nentries =
tsb_bytes / sizeof(struct tsb);

- base = TSBMAP_BASE;
+ switch (tsb_idx) {
+ case MM_TSB_BASE:
+ base = TSBMAP_8K_BASE;
+ break;
+#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE)
+ case MM_TSB_HUGE:
+ base = TSBMAP_4M_BASE;
+ break;
+#endif
+ default:
+ BUG();
+ }
+
tte = pgprot_val(PAGE_KERNEL_LOCKED);
tsb_paddr = __pa(mm->context.tsb_block[tsb_idx].tsb);
BUG_ON(tsb_paddr & (tsb_bytes - 1UL));

Luis Henriques

Aug 18, 2014, 5:50:06 AM
to
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Sasha Levin <sasha...@oracle.com>

commit 06ebb06d49486676272a3c030bfeef4bd969a8e6 upstream.

Check for cases when the caller requests 0 bytes instead of running off
and dereferencing potentially invalid iovecs.
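
For illustration, a userspace sketch of the fixed logic (a mock, not the
kernel function itself; the kernel variant copies from user space rather
than using memcpy). The early return is the point: with len == 0 the
iovecs may already be fully consumed, and the skip loop would otherwise
walk past the end of the array:

#include <stdio.h>
#include <string.h>
#include <sys/uio.h>

static int copy_fromiovecend(unsigned char *kdata, const struct iovec *iov,
			     size_t offset, size_t len)
{
	/* No data? Done! Without this check, offset == total iovec
	 * length (legal when len == 0) makes the loop below read one
	 * element past the end of the array. */
	if (len == 0)
		return 0;

	/* Skip over the finished iovecs. */
	while (offset >= iov->iov_len) {
		offset -= iov->iov_len;
		iov++;
	}

	while (len > 0) {
		size_t copy = iov->iov_len - offset;

		if (copy > len)
			copy = len;
		memcpy(kdata, (char *)iov->iov_base + offset, copy);
		offset = 0;
		kdata += copy;
		len -= copy;
		iov++;
	}
	return 0;
}

int main(void)
{
	char src[4] = "abc";
	unsigned char dst[4] = { 0 };
	struct iovec iov[1] = { { src, sizeof(src) } };

	/* len == 0 with fully consumed iovecs: previously undefined. */
	copy_fromiovecend(dst, iov, sizeof(src), 0);

	copy_fromiovecend(dst, iov, 0, sizeof(src));
	printf("%s\n", (char *)dst);
	return 0;
}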

Signed-off-by: Sasha Levin <sasha...@oracle.com>
Signed-off-by: David S. Miller <da...@davemloft.net>
[ luis: backported to 3.11: file rename: lib/iovec.c -> net/core/iovec.c ]
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
net/core/iovec.c | 4 ++++
1 file changed, 4 insertions(+)

diff --git a/net/core/iovec.c b/net/core/iovec.c
index 2145b7150beb..1117a26a8548 100644
--- a/net/core/iovec.c
+++ b/net/core/iovec.c
@@ -107,6 +107,10 @@ EXPORT_SYMBOL(memcpy_toiovecend);
int memcpy_fromiovecend(unsigned char *kdata, const struct iovec *iov,
int offset, int len)
{
+ /* No data? Done! */
+ if (len == 0)
+ return 0;
+
/* Skip over the finished iovecs */
while (offset >= iov->iov_len) {
offset -= iov->iov_len;

Luis Henriques

Aug 18, 2014, 5:50:05 AM
to
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Eric Dumazet <edum...@google.com>

commit 04ca6973f7c1a0d8537f2d9906a0cf8e69886d75 upstream.

In "Counting Packets Sent Between Arbitrary Internet Hosts", Jeffrey and
Jedidiah describe ways exploiting linux IP identifier generation to
infer whether two machines are exchanging packets.

With commit 73f156a6e8c1 ("inetpeer: get rid of ip_id_count"), we
changed IP id generation, but this does not really prevent this
side-channel technique.

This patch adds a random amount of perturbation so that IP identifiers
for a given destination [1] are no longer monotonically increasing after
an idle period.

Note that prandom_u32_max(1) returns 0, so if the generator is used at
most once per jiffy, this patch inserts no hole in the ID sequence and
does not increase collision probability.

This is jiffies based, so in the worst case (HZ=1000), the id can
rollover after ~65 seconds of idle time, which should be fine.
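
(Worked example, not part of the original changelog: the IP ID field is
16 bits, so a bucket wraps after 65536 increments. The random delta is
bounded by the number of jiffies elapsed since the bucket was last
used, so an idle generator advances by at most one ID per jiffy; at
HZ=1000 that is at most 1000 IDs per second, hence 65536 / 1000 jiffies/s,
i.e. ~65.5 seconds for a full wrap.)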

We also change the hash used in __ip_select_ident() to not only hash
on daddr, but also saddr and protocol, so that ICMP probes can not be
used to infer information for other protocols.

For IPv6, this adds saddr into the hash as well, but not nexthdr.

If I ping the patched target, we can see IDs are now hard to predict.

21:57:11.008086 IP (...)
A > target: ICMP echo request, seq 1, length 64
21:57:11.010752 IP (... id 2081 ...)
target > A: ICMP echo reply, seq 1, length 64

21:57:12.013133 IP (...)
A > target: ICMP echo request, seq 2, length 64
21:57:12.015737 IP (... id 3039 ...)
target > A: ICMP echo reply, seq 2, length 64

21:57:13.016580 IP (...)
A > target: ICMP echo request, seq 3, length 64
21:57:13.019251 IP (... id 3437 ...)
target > A: ICMP echo reply, seq 3, length 64

[1] TCP sessions use a per-flow ID generator that is not changed by this patch.

Signed-off-by: Eric Dumazet <edum...@google.com>
Reported-by: Jeffrey Knockel <je...@cs.unm.edu>
Reported-by: Jedidiah R. Crandall <cran...@cs.unm.edu>
Cc: Willy Tarreau <w...@1wt.eu>
Cc: Hannes Frederic Sowa <han...@redhat.com>
Signed-off-by: David S. Miller <da...@davemloft.net>
[ luis: backported to 3.11: used davem's backport to 3.10 and 3.12 ]
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
include/net/ip.h | 11 +----------
net/ipv4/route.c | 36 +++++++++++++++++++++++++++++++++---
net/ipv6/ip6_output.c | 2 ++
3 files changed, 36 insertions(+), 13 deletions(-)

diff --git a/include/net/ip.h b/include/net/ip.h
index dd72c8f93797..8695359982d1 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -252,16 +252,7 @@ int ip_dont_fragment(struct sock *sk, struct dst_entry *dst)
!(dst_metric_locked(dst, RTAX_MTU)));
}

-#define IP_IDENTS_SZ 2048u
-extern atomic_t *ip_idents;
-
-static inline u32 ip_idents_reserve(u32 hash, int segs)
-{
- atomic_t *id_ptr = ip_idents + hash % IP_IDENTS_SZ;
-
- return atomic_add_return(segs, id_ptr) - segs;
-}
-
+u32 ip_idents_reserve(u32 hash, int segs);
void __ip_select_ident(struct iphdr *iph, int segs);

static inline void ip_select_ident_segs(struct sk_buff *skb, struct sock *sk, int segs)
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 69a503c41144..c16926d44938 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -465,8 +465,35 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
return neigh_create(&arp_tbl, pkey, dev);
}

-atomic_t *ip_idents __read_mostly;
-EXPORT_SYMBOL(ip_idents);
+#define IP_IDENTS_SZ 2048u
+struct ip_ident_bucket {
+ atomic_t id;
+ u32 stamp32;
+};
+
+static struct ip_ident_bucket *ip_idents __read_mostly;
+
+/* In order to protect privacy, we add a perturbation to identifiers
+ * if one generator is seldom used. This makes hard for an attacker
+ * to infer how many packets were sent between two points in time.
+ */
+u32 ip_idents_reserve(u32 hash, int segs)
+{
+ struct ip_ident_bucket *bucket = ip_idents + hash % IP_IDENTS_SZ;
+ u32 old = ACCESS_ONCE(bucket->stamp32);
+ u32 now = (u32)jiffies;
+ u32 delta = 0;
+
+ if (old != now && cmpxchg(&bucket->stamp32, old, now) == old) {
+ u64 x = prandom_u32();
+
+ x *= (now - old);
+ delta = (u32)(x >> 32);
+ }
+
+ return atomic_add_return(segs + delta, &bucket->id) - segs;
+}
+EXPORT_SYMBOL(ip_idents_reserve);

void __ip_select_ident(struct iphdr *iph, int segs)
{
@@ -479,7 +506,10 @@ void __ip_select_ident(struct iphdr *iph, int segs)
get_random_bytes(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
}

- hash = jhash_1word((__force u32)iph->daddr, ip_idents_hashrnd);
+ hash = jhash_3words((__force u32)iph->daddr,
+ (__force u32)iph->saddr,
+ iph->protocol,
+ ip_idents_hashrnd);
id = ip_idents_reserve(hash, segs);
iph->id = htons(id);
}
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index ebd8e1eeaeec..325fd287ae2a 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -552,6 +552,8 @@ static void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
get_random_bytes(&ip6_idents_hashrnd, sizeof(ip6_idents_hashrnd));
}
hash = __ipv6_addr_jhash(&rt->rt6i_dst.addr, ip6_idents_hashrnd);
+ hash = __ipv6_addr_jhash(&rt->rt6i_src.addr, hash);
+
id = ip_idents_reserve(hash, 1);
fhdr->identification = htonl(id);

Luis Henriques

Aug 18, 2014, 5:50:06 AM
to
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Sowmini Varadhan <sowmini....@oracle.com>

commit 4ec1b01029b4facb651b8ef70bc20a4be4cebc63 upstream.

The LDC handshake could have been asynchronously triggered
after ldc_bind() enables the ldc_rx() receive interrupt-handler
(and thus intercepts incoming control packets)
and before vio_port_up() calls ldc_connect(). If that is the case,
ldc_connect() should return 0 and let the state-machine
progress.
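
Spelled out (a sketch of the intended semantics only, using the names
from the hunk below):

	/* If the receive interrupt already drove the handshake past
	 * LDC_HS_OPEN, connecting is a no-op success rather than an
	 * invalid request. */
	if (lp->hs_state > LDC_HS_OPEN)
		err = 0;	/* handshake raced ahead of ldc_connect() */
	else
		err = -EINVAL;	/* channel genuinely not ready */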

Signed-off-by: Sowmini Varadhan <sowmini....@oracle.com>
Acked-by: Karl Volz <karl...@oracle.com>
Signed-off-by: David S. Miller <da...@davemloft.net>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
arch/sparc/kernel/ldc.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/sparc/kernel/ldc.c b/arch/sparc/kernel/ldc.c
index 54df554b82d9..fa4c900a0d1f 100644
--- a/arch/sparc/kernel/ldc.c
+++ b/arch/sparc/kernel/ldc.c
@@ -1336,7 +1336,7 @@ int ldc_connect(struct ldc_channel *lp)
if (!(lp->flags & LDC_FLAG_ALLOCED_QUEUES) ||
!(lp->flags & LDC_FLAG_REGISTERED_QUEUES) ||
lp->hs_state != LDC_HS_OPEN)
- err = -EINVAL;
+ err = ((lp->hs_state > LDC_HS_OPEN) ? 0 : -EINVAL);
else
err = start_handshake(lp);

Luis Henriques

Aug 18, 2014, 5:50:06 AM
to
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Gao feng <gao...@cn.fujitsu.com>

commit 33d99113b1102c2d2f8603b9ba72d89d915c13f5 upstream.

commit 25fb6ca4ed9cad72f14f61629b68dc03c0d9713f
"net IPv6 : Fix broken IPv6 routing table after loopback down-up"
allocates addrconf router for ipv6 address when lo device up.
but commit a881ae1f625c599b460cc8f8a7fcb1c438f699ad
"ipv6:don't call addrconf_dst_alloc again when enable lo" breaks
this behavior.

Since the addrconf router is moved to the garbage list when the lo
device goes down, we should release this router and reallocate a new
one for the ipv6 address when the lo device comes back up.

This patch solves bug 67951 on bugzilla
https://bugzilla.kernel.org/show_bug.cgi?id=67951

Changes from v1:
use ip6_rt_put to replace ip6_del_rt, thanks Hannes!
change code style, suggested by Sergei.

CC: Sabrina Dubroca <s...@queasysnail.net>
CC: Hannes Frederic Sowa <han...@stressinduktion.org>
Reported-by: Weilong Chen <chenw...@huawei.com>
Signed-off-by: Weilong Chen <chenw...@huawei.com>
Signed-off-by: Gao feng <gao...@cn.fujitsu.com>
Acked-by: Hannes Frederic Sowa <han...@stressinduktion.org>
Signed-off-by: David S. Miller <da...@davemloft.net>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
net/ipv6/addrconf.c | 14 ++++++++++++--
1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index a6ddced9d535..e90ae34ae7ff 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -2685,8 +2685,18 @@ static void init_loopback(struct net_device *dev)
if (sp_ifa->flags & (IFA_F_DADFAILED | IFA_F_TENTATIVE))
continue;

- if (sp_ifa->rt)
- continue;
+ if (sp_ifa->rt) {
+ /* This dst has been added to garbage list when
+ * lo device down, release this obsolete dst and
+ * reallocate a new router for ifa.
+ */
+ if (sp_ifa->rt->dst.obsolete > 0) {
+ ip6_rt_put(sp_ifa->rt);
+ sp_ifa->rt = NULL;
+ } else {
+ continue;
+ }
+ }

sp_rt = addrconf_dst_alloc(idev, &sp_ifa->addr, 0);

Luis Henriques

Aug 18, 2014, 5:50:04 AM
to
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Eric Dumazet <edum...@google.com>

commit 73f156a6e8c1074ac6327e0abd1169e95eb66463 upstream.

Ideally, we would need to generate IP ID using a per destination IP
generator.

Linux kernels used the inet_peer cache for this purpose, but this had a
huge cost on servers that disable MTU discovery.

1) each inet_peer struct consumes 192 bytes

2) inetpeer cache uses a binary tree of inet_peer structs,
with a nominal size of ~66000 elements under load.

3) lookups in this tree are hitting a lot of cache lines, as tree depth
is about 20.

4) If the server deals with many TCP flows, we have a high probability
of not finding the inet_peer, allocating a fresh one, and inserting it
in the tree with the same initial ip_id_count (cf. secure_ip_id())

5) We garbage collect inet_peer aggressively.

IP ID generation does not have to be 'perfect'.

The goal is to avoid duplicates over a short period of time, so that
reassembly units have a chance to complete reassembly of fragments
belonging to one message before receiving other fragments with a
recycled ID.

We simply use an array of generators, and a Jenkins hash using the dst IP
as a key.

ipv6_select_ident() is put back into net/ipv6/ip6_output.c where it
belongs (it is only used from this file)

secure_ip_id() and secure_ipv6_id() no longer are needed.

Rename ip_select_ident_more() to ip_select_ident_segs() to avoid
unnecessary decrement/increment of the number of segments.

Signed-off-by: Eric Dumazet <edum...@google.com>
Signed-off-by: David S. Miller <da...@davemloft.net>
[ luis: backported to 3.11: used davem's backport to 3.10 and 3.12 ]
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
drivers/net/ppp/pptp.c | 2 +-
include/net/inetpeer.h | 16 +++-----------
include/net/ip.h | 40 ++++++++++++++++++++---------------
include/net/ipv6.h | 11 ++++++----
include/net/secure_seq.h | 2 --
net/core/secure_seq.c | 25 ----------------------
net/ipv4/igmp.c | 4 ++--
net/ipv4/inetpeer.c | 18 ----------------
net/ipv4/ip_output.c | 7 +++---
net/ipv4/ip_tunnel_core.c | 2 +-
net/ipv4/ipmr.c | 2 +-
net/ipv4/raw.c | 2 +-
net/ipv4/route.c | 47 +++++++++++++++++------------------------
net/ipv4/xfrm4_mode_tunnel.c | 2 +-
net/ipv6/ip6_output.c | 15 +++++++++++++
net/ipv6/output_core.c | 24 ---------------------
net/netfilter/ipvs/ip_vs_xmit.c | 2 +-
17 files changed, 78 insertions(+), 143 deletions(-)

diff --git a/drivers/net/ppp/pptp.c b/drivers/net/ppp/pptp.c
index 7f10588fe668..8161c3f066a3 100644
--- a/drivers/net/ppp/pptp.c
+++ b/drivers/net/ppp/pptp.c
@@ -281,7 +281,7 @@ static int pptp_xmit(struct ppp_channel *chan, struct sk_buff *skb)
nf_reset(skb);

skb->ip_summed = CHECKSUM_NONE;
- ip_select_ident(skb, &rt->dst, NULL);
+ ip_select_ident(skb, NULL);
ip_send_check(iph);

ip_local_out(skb);
diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h
index 6ca347a0717e..bb06fd26a7bd 100644
--- a/include/net/inetpeer.h
+++ b/include/net/inetpeer.h
@@ -41,14 +41,13 @@ struct inet_peer {
struct rcu_head gc_rcu;
};
/*
- * Once inet_peer is queued for deletion (refcnt == -1), following fields
- * are not available: rid, ip_id_count
+ * Once inet_peer is queued for deletion (refcnt == -1), following field
+ * is not available: rid
* We can share memory with rcu_head to help keep inet_peer small.
*/
union {
struct {
atomic_t rid; /* Frag reception counter */
- atomic_t ip_id_count; /* IP ID for the next packet */
};
struct rcu_head rcu;
struct inet_peer *gc_next;
@@ -166,7 +165,7 @@ extern void inetpeer_invalidate_tree(struct inet_peer_base *);
extern void inetpeer_invalidate_family(int family);

/*
- * temporary check to make sure we dont access rid, ip_id_count, tcp_ts,
+ * temporary check to make sure we dont access rid, tcp_ts,
* tcp_ts_stamp if no refcount is taken on inet_peer
*/
static inline void inet_peer_refcheck(const struct inet_peer *p)
@@ -174,13 +173,4 @@ static inline void inet_peer_refcheck(const struct inet_peer *p)
WARN_ON_ONCE(atomic_read(&p->refcnt) <= 0);
}

-
-/* can be called with or without local BH being disabled */
-static inline int inet_getid(struct inet_peer *p, int more)
-{
- more++;
- inet_peer_refcheck(p);
- return atomic_add_return(more, &p->ip_id_count) - more;
-}
-
#endif /* _NET_INETPEER_H */
diff --git a/include/net/ip.h b/include/net/ip.h
index 788f1d8a796f..dd72c8f93797 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -252,9 +252,19 @@ int ip_dont_fragment(struct sock *sk, struct dst_entry *dst)
!(dst_metric_locked(dst, RTAX_MTU)));
}

-extern void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more);
+#define IP_IDENTS_SZ 2048u
+extern atomic_t *ip_idents;

-static inline void ip_select_ident(struct sk_buff *skb, struct dst_entry *dst, struct sock *sk)
+static inline u32 ip_idents_reserve(u32 hash, int segs)
+{
+ atomic_t *id_ptr = ip_idents + hash % IP_IDENTS_SZ;
+
+ return atomic_add_return(segs, id_ptr) - segs;
+}
+
+void __ip_select_ident(struct iphdr *iph, int segs);
+
+static inline void ip_select_ident_segs(struct sk_buff *skb, struct sock *sk, int segs)
{
struct iphdr *iph = ip_hdr(skb);

@@ -264,24 +274,20 @@ static inline void ip_select_ident(struct sk_buff *skb, struct dst_entry *dst, s
* does not change, they drop every other packet in
* a TCP stream using header compression.
*/
- iph->id = (sk && inet_sk(sk)->inet_daddr) ?
- htons(inet_sk(sk)->inet_id++) : 0;
- } else
- __ip_select_ident(iph, dst, 0);
-}
-
-static inline void ip_select_ident_more(struct sk_buff *skb, struct dst_entry *dst, struct sock *sk, int more)
-{
- struct iphdr *iph = ip_hdr(skb);
-
- if ((iph->frag_off & htons(IP_DF)) && !skb->local_df) {
if (sk && inet_sk(sk)->inet_daddr) {
iph->id = htons(inet_sk(sk)->inet_id);
- inet_sk(sk)->inet_id += 1 + more;
- } else
+ inet_sk(sk)->inet_id += segs;
+ } else {
iph->id = 0;
- } else
- __ip_select_ident(iph, dst, more);
+ }
+ } else {
+ __ip_select_ident(iph, segs);
+ }
+}
+
+static inline void ip_select_ident(struct sk_buff *skb, struct sock *sk)
+{
+ ip_select_ident_segs(skb, sk, 1);
}

/*
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 62ccfc040f1a..444ad5d45375 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -536,14 +536,19 @@ static inline u32 ipv6_addr_hash(const struct in6_addr *a)
}

/* more secured version of ipv6_addr_hash() */
-static inline u32 ipv6_addr_jhash(const struct in6_addr *a)
+static inline u32 __ipv6_addr_jhash(const struct in6_addr *a, const u32 initval)
{
u32 v = (__force u32)a->s6_addr32[0] ^ (__force u32)a->s6_addr32[1];

return jhash_3words(v,
(__force u32)a->s6_addr32[2],
(__force u32)a->s6_addr32[3],
- ipv6_hash_secret);
+ initval);
+}
+
+static inline u32 ipv6_addr_jhash(const struct in6_addr *a)
+{
+ return __ipv6_addr_jhash(a, ipv6_hash_secret);
}

static inline bool ipv6_addr_loopback(const struct in6_addr *a)
@@ -655,8 +660,6 @@ static inline int ipv6_addr_diff(const struct in6_addr *a1, const struct in6_add
return __ipv6_addr_diff(a1, a2, sizeof(struct in6_addr));
}

-extern void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt);
-
/*
* Header manipulation
*/
diff --git a/include/net/secure_seq.h b/include/net/secure_seq.h
index c2e542b27a5a..b1c3d1c63c4e 100644
--- a/include/net/secure_seq.h
+++ b/include/net/secure_seq.h
@@ -3,8 +3,6 @@

#include <linux/types.h>

-extern __u32 secure_ip_id(__be32 daddr);
-extern __u32 secure_ipv6_id(const __be32 daddr[4]);
extern u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport);
extern u32 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr,
__be16 dport);
diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c
index 8d9d05edd2eb..d0afc322b961 100644
--- a/net/core/secure_seq.c
+++ b/net/core/secure_seq.c
@@ -95,31 +95,6 @@ EXPORT_SYMBOL(secure_ipv6_port_ephemeral);
#endif

#ifdef CONFIG_INET
-__u32 secure_ip_id(__be32 daddr)
-{
- u32 hash[MD5_DIGEST_WORDS];
-
- net_secret_init();
- hash[0] = (__force __u32) daddr;
- hash[1] = net_secret[13];
- hash[2] = net_secret[14];
- hash[3] = net_secret[15];
-
- md5_transform(hash, net_secret);
-
- return hash[0];
-}
-
-__u32 secure_ipv6_id(const __be32 daddr[4])
-{
- __u32 hash[4];
-
- net_secret_init();
- memcpy(hash, daddr, 16);
- md5_transform(hash, net_secret);
-
- return hash[0];
-}

__u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr,
__be16 sport, __be16 dport)
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index bad83d812afe..0162ba45015e 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -343,7 +343,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
pip->saddr = fl4.saddr;
pip->protocol = IPPROTO_IGMP;
pip->tot_len = 0; /* filled in later */
- ip_select_ident(skb, &rt->dst, NULL);
+ ip_select_ident(skb, NULL);
((u8 *)&pip[1])[0] = IPOPT_RA;
((u8 *)&pip[1])[1] = 4;
((u8 *)&pip[1])[2] = 0;
@@ -687,7 +687,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
iph->daddr = dst;
iph->saddr = fl4.saddr;
iph->protocol = IPPROTO_IGMP;
- ip_select_ident(skb, &rt->dst, NULL);
+ ip_select_ident(skb, NULL);
((u8 *)&iph[1])[0] = IPOPT_RA;
((u8 *)&iph[1])[1] = 4;
((u8 *)&iph[1])[2] = 0;
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index 33d5537881ed..67140efc15fd 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -26,20 +26,7 @@
* Theory of operations.
* We keep one entry for each peer IP address. The nodes contains long-living
* information about the peer which doesn't depend on routes.
- * At this moment this information consists only of ID field for the next
- * outgoing IP packet. This field is incremented with each packet as encoded
- * in inet_getid() function (include/net/inetpeer.h).
- * At the moment of writing this notes identifier of IP packets is generated
- * to be unpredictable using this code only for packets subjected
- * (actually or potentially) to defragmentation. I.e. DF packets less than
- * PMTU in size when local fragmentation is disabled use a constant ID and do
- * not use this code (see ip_select_ident() in include/net/ip.h).
*
- * Route cache entries hold references to our nodes.
- * New cache entries get references via lookup by destination IP address in
- * the avl tree. The reference is grabbed only when it's needed i.e. only
- * when we try to output IP packet which needs an unpredictable ID (see
- * __ip_select_ident() in net/ipv4/route.c).
* Nodes are removed only when reference counter goes to 0.
* When it's happened the node may be removed when a sufficient amount of
* time has been passed since its last use. The less-recently-used entry can
@@ -62,7 +49,6 @@
* refcnt: atomically against modifications on other CPU;
* usually under some other lock to prevent node disappearing
* daddr: unchangeable
- * ip_id_count: atomic value (no lock needed)
*/

static struct kmem_cache *peer_cachep __read_mostly;
@@ -504,10 +490,6 @@ relookup:
p->daddr = *daddr;
atomic_set(&p->refcnt, 1);
atomic_set(&p->rid, 0);
- atomic_set(&p->ip_id_count,
- (daddr->family == AF_INET) ?
- secure_ip_id(daddr->addr.a4) :
- secure_ipv6_id(daddr->addr.a6));
p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW;
p->rate_tokens = 0;
/* 60*HZ is arbitrary, but chosen enough high so that the first
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 7f4ab5d31c16..e3f65a647e83 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -148,7 +148,7 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
iph->saddr = saddr;
iph->protocol = sk->sk_protocol;
- ip_select_ident(skb, &rt->dst, sk);
+ ip_select_ident(skb, sk);

if (opt && opt->opt.optlen) {
iph->ihl += opt->opt.optlen>>2;
@@ -386,8 +386,7 @@ packet_routed:
ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
}

- ip_select_ident_more(skb, &rt->dst, sk,
- (skb_shinfo(skb)->gso_segs ?: 1) - 1);
+ ip_select_ident_segs(skb, sk, skb_shinfo(skb)->gso_segs ?: 1);

skb->priority = sk->sk_priority;
skb->mark = sk->sk_mark;
@@ -1324,7 +1323,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
iph->ttl = ttl;
iph->protocol = sk->sk_protocol;
ip_copy_addrs(iph, fl4);
- ip_select_ident(skb, &rt->dst, sk);
+ ip_select_ident(skb, sk);

if (opt) {
iph->ihl += opt->optlen>>2;
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 3fbba1753f2c..f386697cb798 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -76,7 +76,7 @@ int iptunnel_xmit(struct net *net, struct rtable *rt,
iph->daddr = dst;
iph->saddr = src;
iph->ttl = ttl;
- __ip_select_ident(iph, &rt->dst, (skb_shinfo(skb)->gso_segs ?: 1) - 1);
+ __ip_select_ident(iph, skb_shinfo(skb)->gso_segs ?: 1);

err = ip_local_out(skb);
if (unlikely(net_xmit_eval(err)))
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index aaa1be56c6ca..c6ebd6df7ef6 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -1661,7 +1661,7 @@ static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
iph->protocol = IPPROTO_IPIP;
iph->ihl = 5;
iph->tot_len = htons(skb->len);
- ip_select_ident(skb, skb_dst(skb), NULL);
+ ip_select_ident(skb, NULL);
ip_send_check(iph);

memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 402870fdfa0e..b4a1c42a627f 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -387,7 +387,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
iph->check = 0;
iph->tot_len = htons(length);
if (!iph->id)
- ip_select_ident(skb, &rt->dst, NULL);
+ ip_select_ident(skb, NULL);

iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 3f7344fca8f0..69a503c41144 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -89,6 +89,7 @@
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
+#include <linux/jhash.h>
#include <net/dst.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
@@ -464,39 +465,23 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
return neigh_create(&arp_tbl, pkey, dev);
}

-/*
- * Peer allocation may fail only in serious out-of-memory conditions. However
- * we still can generate some output.
- * Random ID selection looks a bit dangerous because we have no chances to
- * select ID being unique in a reasonable period of time.
- * But broken packet identifier may be better than no packet at all.
- */
-static void ip_select_fb_ident(struct iphdr *iph)
-{
- static DEFINE_SPINLOCK(ip_fb_id_lock);
- static u32 ip_fallback_id;
- u32 salt;
+atomic_t *ip_idents __read_mostly;
+EXPORT_SYMBOL(ip_idents);

- spin_lock_bh(&ip_fb_id_lock);
- salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
- iph->id = htons(salt & 0xFFFF);
- ip_fallback_id = salt;
- spin_unlock_bh(&ip_fb_id_lock);
-}
-
-void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
+void __ip_select_ident(struct iphdr *iph, int segs)
{
- struct net *net = dev_net(dst->dev);
- struct inet_peer *peer;
+ static u32 ip_idents_hashrnd __read_mostly;
+ static bool hashrnd_initialized = false;
+ u32 hash, id;

- peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
- if (peer) {
- iph->id = htons(inet_getid(peer, more));
- inet_putpeer(peer);
- return;
+ if (unlikely(!hashrnd_initialized)) {
+ hashrnd_initialized = true;
+ get_random_bytes(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
}

- ip_select_fb_ident(iph);
+ hash = jhash_1word((__force u32)iph->daddr, ip_idents_hashrnd);
+ id = ip_idents_reserve(hash, segs);
+ iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);

@@ -2714,6 +2699,12 @@ int __init ip_rt_init(void)
{
int rc = 0;

+ ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
+ if (!ip_idents)
+ panic("IP: failed to allocate ip_idents\n");
+
+ prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
+
#ifdef CONFIG_IP_ROUTE_CLASSID
ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
if (!ip_rt_acct)
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
index b5663c37f089..e3f64831bc36 100644
--- a/net/ipv4/xfrm4_mode_tunnel.c
+++ b/net/ipv4/xfrm4_mode_tunnel.c
@@ -117,12 +117,12 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)

top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ?
0 : (XFRM_MODE_SKB_CB(skb)->frag_off & htons(IP_DF));
- ip_select_ident(skb, dst->child, NULL);

top_iph->ttl = ip4_dst_hoplimit(dst->child);

top_iph->saddr = x->props.saddr.a4;
top_iph->daddr = x->id.daddr.a4;
+ ip_select_ident(skb, NULL);

return 0;
}
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index fe0bc8e0e866..ebd8e1eeaeec 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -541,6 +541,21 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
skb_copy_secmark(to, from);
}

+static void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
+{
+ static u32 ip6_idents_hashrnd __read_mostly;
+ static bool hashrnd_initialized = false;
+ u32 hash, id;
+
+ if (unlikely(!hashrnd_initialized)) {
+ hashrnd_initialized = true;
+ get_random_bytes(&ip6_idents_hashrnd, sizeof(ip6_idents_hashrnd));
+ }
+ hash = __ipv6_addr_jhash(&rt->rt6i_dst.addr, ip6_idents_hashrnd);
+ id = ip_idents_reserve(hash, 1);
+ fhdr->identification = htonl(id);
+}
+
int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
struct sk_buff *frag;
diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c
index 39f6ad1629ff..a515ec882ff5 100644
--- a/net/ipv6/output_core.c
+++ b/net/ipv6/output_core.c
@@ -6,30 +6,6 @@
#include <net/ipv6.h>
#include <net/ip6_fib.h>

-void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
-{
- static atomic_t ipv6_fragmentation_id;
- int ident;
-
-#if IS_ENABLED(CONFIG_IPV6)
- if (rt && !(rt->dst.flags & DST_NOPEER)) {
- struct inet_peer *peer;
- struct net *net;
-
- net = dev_net(rt->dst.dev);
- peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);
- if (peer) {
- fhdr->identification = htonl(inet_getid(peer, 0));
- inet_putpeer(peer);
- return;
- }
- }
-#endif
- ident = atomic_inc_return(&ipv6_fragmentation_id);
- fhdr->identification = htonl(ident);
-}
-EXPORT_SYMBOL(ipv6_select_ident);
-
int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
u16 offset = sizeof(struct ipv6hdr);
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index c47444e4cf8c..7f0e1cf2d7e8 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -883,7 +883,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
iph->daddr = cp->daddr.ip;
iph->saddr = saddr;
iph->ttl = old_iph->ttl;
- ip_select_ident(skb, &rt->dst, NULL);
+ ip_select_ident(skb, NULL);

/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;

Luis Henriques

Aug 18, 2014, 6:00:02 AM
to
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: "H. Peter Anvin" <h...@zytor.com>

commit 7ed6fb9b5a5510e4ef78ab27419184741169978a upstream.

This reverts commit fa81511bb0bbb2b1aace3695ce869da9762624ff in
preparation of merging in the proper fix (espfix64).

Signed-off-by: H. Peter Anvin <h...@zytor.com>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
arch/x86/kernel/ldt.c | 4 +---
arch/x86/vdso/vdso32-setup.c | 8 --------
2 files changed, 1 insertion(+), 11 deletions(-)

diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index dcbbaa165bde..af1d14a9ebda 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -20,8 +20,6 @@
#include <asm/mmu_context.h>
#include <asm/syscalls.h>

-int sysctl_ldt16 = 0;
-
#ifdef CONFIG_SMP
static void flush_ldt(void *current_mm)
{
@@ -236,7 +234,7 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
* IRET leaking the high bits of the kernel stack address.
*/
#ifdef CONFIG_X86_64
- if (!ldt_info.seg_32bit && !sysctl_ldt16) {
+ if (!ldt_info.seg_32bit) {
error = -EINVAL;
goto out_unlock;
}
diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
index f1d633a43f8e..d6bfb876cfb0 100644
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -41,7 +41,6 @@ enum {
#ifdef CONFIG_X86_64
#define vdso_enabled sysctl_vsyscall32
#define arch_setup_additional_pages syscall32_setup_pages
-extern int sysctl_ldt16;
#endif

/*
@@ -381,13 +380,6 @@ static struct ctl_table abi_table2[] = {
.mode = 0644,
.proc_handler = proc_dointvec
},
- {
- .procname = "ldt16",
- .data = &sysctl_ldt16,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
{}

Luis Henriques

Aug 18, 2014, 6:00:02 AM
to
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Jan Kara <ja...@suse.cz>

commit 504d58745c9ca28d33572e2d8a9990b43e06075d upstream.

clockevents_increase_min_delta() calls printk() from under
hrtimer_bases.lock. That causes lock inversion on scheduler locks because
printk() can call into the scheduler. Lockdep puts it as:

======================================================
[ INFO: possible circular locking dependency detected ]
3.15.0-rc8-06195-g939f04b #2 Not tainted
-------------------------------------------------------
trinity-main/74 is trying to acquire lock:
(&port_lock_key){-.....}, at: [<811c60be>] serial8250_console_write+0x8c/0x10c

but task is already holding lock:
(hrtimer_bases.lock){-.-...}, at: [<8103caeb>] hrtimer_try_to_cancel+0x13/0x66

which lock already depends on the new lock.

the existing dependency chain (in reverse order) is:

-> #5 (hrtimer_bases.lock){-.-...}:
[<8104a942>] lock_acquire+0x92/0x101
[<8142f11d>] _raw_spin_lock_irqsave+0x2e/0x3e
[<8103c918>] __hrtimer_start_range_ns+0x1c/0x197
[<8107ec20>] perf_swevent_start_hrtimer.part.41+0x7a/0x85
[<81080792>] task_clock_event_start+0x3a/0x3f
[<810807a4>] task_clock_event_add+0xd/0x14
[<8108259a>] event_sched_in+0xb6/0x17a
[<810826a2>] group_sched_in+0x44/0x122
[<81082885>] ctx_sched_in.isra.67+0x105/0x11f
[<810828e6>] perf_event_sched_in.isra.70+0x47/0x4b
[<81082bf6>] __perf_install_in_context+0x8b/0xa3
[<8107eb8e>] remote_function+0x12/0x2a
[<8105f5af>] smp_call_function_single+0x2d/0x53
[<8107e17d>] task_function_call+0x30/0x36
[<8107fb82>] perf_install_in_context+0x87/0xbb
[<810852c9>] SYSC_perf_event_open+0x5c6/0x701
[<810856f9>] SyS_perf_event_open+0x17/0x19
[<8142f8ee>] syscall_call+0x7/0xb

-> #4 (&ctx->lock){......}:
[<8104a942>] lock_acquire+0x92/0x101
[<8142f04c>] _raw_spin_lock+0x21/0x30
[<81081df3>] __perf_event_task_sched_out+0x1dc/0x34f
[<8142cacc>] __schedule+0x4c6/0x4cb
[<8142cae0>] schedule+0xf/0x11
[<8142f9a6>] work_resched+0x5/0x30

-> #3 (&rq->lock){-.-.-.}:
[<8104a942>] lock_acquire+0x92/0x101
[<8142f04c>] _raw_spin_lock+0x21/0x30
[<81040873>] __task_rq_lock+0x33/0x3a
[<8104184c>] wake_up_new_task+0x25/0xc2
[<8102474b>] do_fork+0x15c/0x2a0
[<810248a9>] kernel_thread+0x1a/0x1f
[<814232a2>] rest_init+0x1a/0x10e
[<817af949>] start_kernel+0x303/0x308
[<817af2ab>] i386_start_kernel+0x79/0x7d

-> #2 (&p->pi_lock){-.-...}:
[<8104a942>] lock_acquire+0x92/0x101
[<8142f11d>] _raw_spin_lock_irqsave+0x2e/0x3e
[<810413dd>] try_to_wake_up+0x1d/0xd6
[<810414cd>] default_wake_function+0xb/0xd
[<810461f3>] __wake_up_common+0x39/0x59
[<81046346>] __wake_up+0x29/0x3b
[<811b8733>] tty_wakeup+0x49/0x51
[<811c3568>] uart_write_wakeup+0x17/0x19
[<811c5dc1>] serial8250_tx_chars+0xbc/0xfb
[<811c5f28>] serial8250_handle_irq+0x54/0x6a
[<811c5f57>] serial8250_default_handle_irq+0x19/0x1c
[<811c56d8>] serial8250_interrupt+0x38/0x9e
[<810510e7>] handle_irq_event_percpu+0x5f/0x1e2
[<81051296>] handle_irq_event+0x2c/0x43
[<81052cee>] handle_level_irq+0x57/0x80
[<81002a72>] handle_irq+0x46/0x5c
[<810027df>] do_IRQ+0x32/0x89
[<8143036e>] common_interrupt+0x2e/0x33
[<8142f23c>] _raw_spin_unlock_irqrestore+0x3f/0x49
[<811c25a4>] uart_start+0x2d/0x32
[<811c2c04>] uart_write+0xc7/0xd6
[<811bc6f6>] n_tty_write+0xb8/0x35e
[<811b9beb>] tty_write+0x163/0x1e4
[<811b9cd9>] redirected_tty_write+0x6d/0x75
[<810b6ed6>] vfs_write+0x75/0xb0
[<810b7265>] SyS_write+0x44/0x77
[<8142f8ee>] syscall_call+0x7/0xb

-> #1 (&tty->write_wait){-.....}:
[<8104a942>] lock_acquire+0x92/0x101
[<8142f11d>] _raw_spin_lock_irqsave+0x2e/0x3e
[<81046332>] __wake_up+0x15/0x3b
[<811b8733>] tty_wakeup+0x49/0x51
[<811c3568>] uart_write_wakeup+0x17/0x19
[<811c5dc1>] serial8250_tx_chars+0xbc/0xfb
[<811c5f28>] serial8250_handle_irq+0x54/0x6a
[<811c5f57>] serial8250_default_handle_irq+0x19/0x1c
[<811c56d8>] serial8250_interrupt+0x38/0x9e
[<810510e7>] handle_irq_event_percpu+0x5f/0x1e2
[<81051296>] handle_irq_event+0x2c/0x43
[<81052cee>] handle_level_irq+0x57/0x80
[<81002a72>] handle_irq+0x46/0x5c
[<810027df>] do_IRQ+0x32/0x89
[<8143036e>] common_interrupt+0x2e/0x33
[<8142f23c>] _raw_spin_unlock_irqrestore+0x3f/0x49
[<811c25a4>] uart_start+0x2d/0x32
[<811c2c04>] uart_write+0xc7/0xd6
[<811bc6f6>] n_tty_write+0xb8/0x35e
[<811b9beb>] tty_write+0x163/0x1e4
[<811b9cd9>] redirected_tty_write+0x6d/0x75
[<810b6ed6>] vfs_write+0x75/0xb0
[<810b7265>] SyS_write+0x44/0x77
[<8142f8ee>] syscall_call+0x7/0xb

-> #0 (&port_lock_key){-.....}:
[<8104a62d>] __lock_acquire+0x9ea/0xc6d
[<8104a942>] lock_acquire+0x92/0x101
[<8142f11d>] _raw_spin_lock_irqsave+0x2e/0x3e
[<811c60be>] serial8250_console_write+0x8c/0x10c
[<8104e402>] call_console_drivers.constprop.31+0x87/0x118
[<8104f5d5>] console_unlock+0x1d7/0x398
[<8104fb70>] vprintk_emit+0x3da/0x3e4
[<81425f76>] printk+0x17/0x19
[<8105bfa0>] clockevents_program_min_delta+0x104/0x116
[<8105c548>] clockevents_program_event+0xe7/0xf3
[<8105cc1c>] tick_program_event+0x1e/0x23
[<8103c43c>] hrtimer_force_reprogram+0x88/0x8f
[<8103c49e>] __remove_hrtimer+0x5b/0x79
[<8103cb21>] hrtimer_try_to_cancel+0x49/0x66
[<8103cb4b>] hrtimer_cancel+0xd/0x18
[<8107f102>] perf_swevent_cancel_hrtimer.part.60+0x2b/0x30
[<81080705>] task_clock_event_stop+0x20/0x64
[<81080756>] task_clock_event_del+0xd/0xf
[<81081350>] event_sched_out+0xab/0x11e
[<810813e0>] group_sched_out+0x1d/0x66
[<81081682>] ctx_sched_out+0xaf/0xbf
[<81081e04>] __perf_event_task_sched_out+0x1ed/0x34f
[<8142cacc>] __schedule+0x4c6/0x4cb
[<8142cae0>] schedule+0xf/0x11
[<8142f9a6>] work_resched+0x5/0x30

other info that might help us debug this:

Chain exists of:
&port_lock_key --> &ctx->lock --> hrtimer_bases.lock

Possible unsafe locking scenario:

CPU0 CPU1
---- ----
lock(hrtimer_bases.lock);
lock(&ctx->lock);
lock(hrtimer_bases.lock);
lock(&port_lock_key);

*** DEADLOCK ***

4 locks held by trinity-main/74:
#0: (&rq->lock){-.-.-.}, at: [<8142c6f3>] __schedule+0xed/0x4cb
#1: (&ctx->lock){......}, at: [<81081df3>] __perf_event_task_sched_out+0x1dc/0x34f
#2: (hrtimer_bases.lock){-.-...}, at: [<8103caeb>] hrtimer_try_to_cancel+0x13/0x66
#3: (console_lock){+.+...}, at: [<8104fb5d>] vprintk_emit+0x3c7/0x3e4

stack backtrace:
CPU: 0 PID: 74 Comm: trinity-main Not tainted 3.15.0-rc8-06195-g939f04b #2
00000000 81c3a310 8b995c14 81426f69 8b995c44 81425a99 8161f671 8161f570
8161f538 8161f559 8161f538 8b995c78 8b142bb0 00000004 8b142fdc 8b142bb0
8b995ca8 8104a62d 8b142fac 000016f2 81c3a310 00000001 00000001 00000003
Call Trace:
[<81426f69>] dump_stack+0x16/0x18
[<81425a99>] print_circular_bug+0x18f/0x19c
[<8104a62d>] __lock_acquire+0x9ea/0xc6d
[<8104a942>] lock_acquire+0x92/0x101
[<811c60be>] ? serial8250_console_write+0x8c/0x10c
[<811c6032>] ? wait_for_xmitr+0x76/0x76
[<8142f11d>] _raw_spin_lock_irqsave+0x2e/0x3e
[<811c60be>] ? serial8250_console_write+0x8c/0x10c
[<811c60be>] serial8250_console_write+0x8c/0x10c
[<8104af87>] ? lock_release+0x191/0x223
[<811c6032>] ? wait_for_xmitr+0x76/0x76
[<8104e402>] call_console_drivers.constprop.31+0x87/0x118
[<8104f5d5>] console_unlock+0x1d7/0x398
[<8104fb70>] vprintk_emit+0x3da/0x3e4
[<81425f76>] printk+0x17/0x19
[<8105bfa0>] clockevents_program_min_delta+0x104/0x116
[<8105cc1c>] tick_program_event+0x1e/0x23
[<8103c43c>] hrtimer_force_reprogram+0x88/0x8f
[<8103c49e>] __remove_hrtimer+0x5b/0x79
[<8103cb21>] hrtimer_try_to_cancel+0x49/0x66
[<8103cb4b>] hrtimer_cancel+0xd/0x18
[<8107f102>] perf_swevent_cancel_hrtimer.part.60+0x2b/0x30
[<81080705>] task_clock_event_stop+0x20/0x64
[<81080756>] task_clock_event_del+0xd/0xf
[<81081350>] event_sched_out+0xab/0x11e
[<810813e0>] group_sched_out+0x1d/0x66
[<81081682>] ctx_sched_out+0xaf/0xbf
[<81081e04>] __perf_event_task_sched_out+0x1ed/0x34f
[<8104416d>] ? __dequeue_entity+0x23/0x27
[<81044505>] ? pick_next_task_fair+0xb1/0x120
[<8142cacc>] __schedule+0x4c6/0x4cb
[<81047574>] ? trace_hardirqs_off_caller+0xd7/0x108
[<810475b0>] ? trace_hardirqs_off+0xb/0xd
[<81056346>] ? rcu_irq_exit+0x64/0x77

Fix the problem by using printk_deferred() which does not call into the
scheduler.
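
The resulting pattern looks like this (an illustrative fragment, not
from this patch; the lock and condition names are hypothetical). The
message is queued and the console work happens later, in a context that
no longer holds the sensitive lock:

	raw_spin_lock_irqsave(&example_timer_lock, flags);	/* hypothetical lock */
	if (reprogram_failed)					/* hypothetical condition */
		printk_deferred(KERN_WARNING "CE: reprogram failed\n");
	raw_spin_unlock_irqrestore(&example_timer_lock, flags);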

Reported-by: Fengguang Wu <fenggu...@intel.com>
Signed-off-by: Jan Kara <ja...@suse.cz>
Signed-off-by: Thomas Gleixner <tg...@linutronix.de>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
kernel/time/clockevents.c | 10 ++++++----
1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 662c5798a685..c2eb27b6017b 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -146,7 +146,8 @@ static int clockevents_increase_min_delta(struct clock_event_device *dev)
{
/* Nothing to do if we already reached the limit */
if (dev->min_delta_ns >= MIN_DELTA_LIMIT) {
- printk(KERN_WARNING "CE: Reprogramming failure. Giving up\n");
+ printk_deferred(KERN_WARNING
+ "CE: Reprogramming failure. Giving up\n");
dev->next_event.tv64 = KTIME_MAX;
return -ETIME;
}
@@ -159,9 +160,10 @@ static int clockevents_increase_min_delta(struct clock_event_device *dev)
if (dev->min_delta_ns > MIN_DELTA_LIMIT)
dev->min_delta_ns = MIN_DELTA_LIMIT;

- printk(KERN_WARNING "CE: %s increased min_delta_ns to %llu nsec\n",
- dev->name ? dev->name : "?",
- (unsigned long long) dev->min_delta_ns);
+ printk_deferred(KERN_WARNING
+ "CE: %s increased min_delta_ns to %llu nsec\n",
+ dev->name ? dev->name : "?",
+ (unsigned long long) dev->min_delta_ns);
return 0;

Luis Henriques

Aug 18, 2014, 6:00:02 AM
to
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Jeff Layton <jla...@redhat.com>

commit 8033426e6bdb2690d302872ac1e1fadaec1a5581 upstream.

Christopher reported a regression where he was unable to unmount an NFS
filesystem whose root had gone stale. The problem is that
d_revalidate handles the root of the filesystem differently from other
dentries, but d_weak_revalidate does not. We could simply fix this by
making d_weak_revalidate return success on IS_ROOT dentries, but there
are cases where we do want to revalidate the root of the fs.

A umount is really a special case. We generally aren't interested in
anything but the dentry and vfsmount that's attached at that point. If
the inode turns out to be stale we just don't care since the intent is
to stop using it anyway.

Try to handle this situation better by treating umount as a special
case in the lookup code. Have it resolve the parent using normal
means, and then do a lookup of the final dentry without revalidating
it. In most cases, the final lookup will come out of the dcache, but
the case where there's a trailing symlink or !LAST_NORM entry on the
end complicates things a bit.

Cc: Neil Brown <ne...@suse.de>
Reported-by: Christopher T Vogan <cvo...@us.ibm.com>
Signed-off-by: Jeff Layton <jla...@redhat.com>
Signed-off-by: Al Viro <vi...@zeniv.linux.org.uk>
Cc: Chris Dunlop <ch...@onthe.net.au>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
fs/namei.c | 182 ++++++++++++++++++++++++++++++++++++++++++++++++++
fs/namespace.c | 2 +-
include/linux/namei.h | 1 +
3 files changed, 184 insertions(+), 1 deletion(-)

diff --git a/fs/namei.c b/fs/namei.c
index 2a2d0236f82a..a10bd2f8b66b 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2185,6 +2185,188 @@ user_path_parent(int dfd, const char __user *path, struct nameidata *nd,
return s;
}

+/**
+ * umount_lookup_last - look up last component for umount
+ * @nd: pathwalk nameidata - currently pointing at parent directory of "last"
+ * @path: pointer to container for result
+ *
+ * This is a special lookup_last function just for umount. In this case, we
+ * need to resolve the path without doing any revalidation.
+ *
+ * The nameidata should be the result of doing a LOOKUP_PARENT pathwalk. Since
+ * mountpoints are always pinned in the dcache, their ancestors are too. Thus,
+ * in almost all cases, this lookup will be served out of the dcache. The only
+ * cases where it won't are if nd->last refers to a symlink or the path is
+ * bogus and it doesn't exist.
+ *
+ * Returns:
+ * -error: if there was an error during lookup. This includes -ENOENT if the
+ * lookup found a negative dentry. The nd->path reference will also be
+ * put in this case.
+ *
+ * 0: if we successfully resolved nd->path and found it to not to be a
+ * symlink that needs to be followed. "path" will also be populated.
+ * The nd->path reference will also be put.
+ *
+ * 1: if we successfully resolved nd->last and found it to be a symlink
+ * that needs to be followed. "path" will be populated with the path
+ * to the link, and nd->path will *not* be put.
+ */
+static int
+umount_lookup_last(struct nameidata *nd, struct path *path)
+{
+ int error = 0;
+ struct dentry *dentry;
+ struct dentry *dir = nd->path.dentry;
+
+ if (unlikely(nd->flags & LOOKUP_RCU)) {
+ WARN_ON_ONCE(1);
+ error = -ECHILD;
+ goto error_check;
+ }
+
+ nd->flags &= ~LOOKUP_PARENT;
+
+ if (unlikely(nd->last_type != LAST_NORM)) {
+ error = handle_dots(nd, nd->last_type);
+ if (!error)
+ dentry = dget(nd->path.dentry);
+ goto error_check;
+ }
+
+ mutex_lock(&dir->d_inode->i_mutex);
+ dentry = d_lookup(dir, &nd->last);
+ if (!dentry) {
+ /*
+ * No cached dentry. Mounted dentries are pinned in the cache,
+ * so that means that this dentry is probably a symlink or the
+ * path doesn't actually point to a mounted dentry.
+ */
+ dentry = d_alloc(dir, &nd->last);
+ if (!dentry) {
+ error = -ENOMEM;
+ } else {
+ dentry = lookup_real(dir->d_inode, dentry, nd->flags);
+ if (IS_ERR(dentry))
+ error = PTR_ERR(dentry);
+ }
+ }
+ mutex_unlock(&dir->d_inode->i_mutex);
+
+error_check:
+ if (!error) {
+ if (!dentry->d_inode) {
+ error = -ENOENT;
+ dput(dentry);
+ } else {
+ path->dentry = dentry;
+ path->mnt = mntget(nd->path.mnt);
+ if (should_follow_link(dentry->d_inode,
+ nd->flags & LOOKUP_FOLLOW))
+ return 1;
+ follow_mount(path);
+ }
+ }
+ terminate_walk(nd);
+ return error;
+}
+
+/**
+ * path_umountat - look up a path to be umounted
+ * @dfd: directory file descriptor to start walk from
+ * @name: full pathname to walk
+ * @flags: lookup flags
+ * @nd: pathwalk nameidata
+ *
+ * Look up the given name, but don't attempt to revalidate the last component.
+ * Returns 0 and "path" will be valid on success; Retuns error otherwise.
+ */
+static int
+path_umountat(int dfd, const char *name, struct path *path, unsigned int flags)
+{
+ struct file *base = NULL;
+ struct nameidata nd;
+ int err;
+
+ err = path_init(dfd, name, flags | LOOKUP_PARENT, &nd, &base);
+ if (unlikely(err))
+ return err;
+
+ current->total_link_count = 0;
+ err = link_path_walk(name, &nd);
+ if (err)
+ goto out;
+
+ /* If we're in rcuwalk, drop out of it to handle last component */
+ if (nd.flags & LOOKUP_RCU) {
+ err = unlazy_walk(&nd, NULL);
+ if (err) {
+ terminate_walk(&nd);
+ goto out;
+ }
+ }
+
+ err = umount_lookup_last(&nd, path);
+ while (err > 0) {
+ void *cookie;
+ struct path link = *path;
+ err = may_follow_link(&link, &nd);
+ if (unlikely(err))
+ break;
+ nd.flags |= LOOKUP_PARENT;
+ err = follow_link(&link, &nd, &cookie);
+ if (err)
+ break;
+ err = umount_lookup_last(&nd, path);
+ put_link(&nd, &link, cookie);
+ }
+out:
+ if (base)
+ fput(base);
+
+ if (nd.root.mnt && !(nd.flags & LOOKUP_ROOT))
+ path_put(&nd.root);
+
+ return err;
+}
+
+/**
+ * user_path_umountat - lookup a path from userland in order to umount it
+ * @dfd: directory file descriptor
+ * @name: pathname from userland
+ * @flags: lookup flags
+ * @path: pointer to container to hold result
+ *
+ * A umount is a special case for path walking. We're not actually interested
+ * in the inode in this situation, and ESTALE errors can be a problem. We
+ * simply want track down the dentry and vfsmount attached at the mountpoint
+ * and avoid revalidating the last component.
+ *
+ * Returns 0 and populates "path" on success.
+ */
+int
+user_path_umountat(int dfd, const char __user *name, unsigned int flags,
+ struct path *path)
+{
+ struct filename *s = getname(name);
+ int error;
+
+ if (IS_ERR(s))
+ return PTR_ERR(s);
+
+ error = path_umountat(dfd, s->name, path, flags | LOOKUP_RCU);
+ if (unlikely(error == -ECHILD))
+ error = path_umountat(dfd, s->name, path, flags);
+ if (unlikely(error == -ESTALE))
+ error = path_umountat(dfd, s->name, path, flags | LOOKUP_REVAL);
+
+ if (likely(!error))
+ audit_inode(s, path->dentry, 0);
+
+ putname(s);
+ return error;
+}
+
/*
* It's inline, so penalty for filesystems that don't use sticky bit is
* minimal.
diff --git a/fs/namespace.c b/fs/namespace.c
index a45ba4f267fe..ad8ea9bc2518 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1318,7 +1318,7 @@ SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
if (!(flags & UMOUNT_NOFOLLOW))
lookup_flags |= LOOKUP_FOLLOW;

- retval = user_path_at(AT_FDCWD, name, lookup_flags, &path);
+ retval = user_path_umountat(AT_FDCWD, name, lookup_flags, &path);
if (retval)
goto out;
mnt = real_mount(path.mnt);
diff --git a/include/linux/namei.h b/include/linux/namei.h
index 5a5ff57ceed4..cd09751c71a0 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -58,6 +58,7 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND};

extern int user_path_at(int, const char __user *, unsigned, struct path *);
extern int user_path_at_empty(int, const char __user *, unsigned, struct path *, int *empty);
+extern int user_path_umountat(int, const char __user *, unsigned int, struct path *);

#define user_path(name, path) user_path_at(AT_FDCWD, name, LOOKUP_FOLLOW, path)
#define user_lpath(name, path) user_path_at(AT_FDCWD, name, 0, path)

Luis Henriques

Aug 18, 2014, 6:00:02 AM
to
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: "H. Peter Anvin" <h...@linux.intel.com>

commit 20b68535cd27183ebd3651ff313afb2b97dac941 upstream.

Header guard is #ifndef, not #ifdef...
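
For reference, the guard pattern this restores (a generic illustration,
not the kernel header). With #ifdef, the body is only compiled when the
macro is already defined, which never happens on first inclusion, so
the header's declarations silently vanish:

	#ifndef EXAMPLE_H	/* correct: take the body on first include */
	#define EXAMPLE_H

	int example(void);	/* guarded declarations */

	#endif /* EXAMPLE_H */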

Reported-by: Fengguang Wu <fenggu...@intel.com>
Signed-off-by: H. Peter Anvin <h...@linux.intel.com>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
arch/x86/include/asm/espfix.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/espfix.h b/arch/x86/include/asm/espfix.h
index 729051c82b02..99efebb2f69d 100644
--- a/arch/x86/include/asm/espfix.h
+++ b/arch/x86/include/asm/espfix.h
@@ -1,4 +1,4 @@
-#ifdef _ASM_X86_ESPFIX_H
+#ifndef _ASM_X86_ESPFIX_H
#define _ASM_X86_ESPFIX_H

#ifdef CONFIG_X86_64

Luis Henriques

Aug 18, 2014, 6:00:02 AM
to
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: willy tarreau <w...@1wt.eu>

commit 71f6d1b31fb1f278a345a30a2180515adc7d80ae upstream.

Right now the mvneta driver doesn't handle Tx IRQ, and relies on two
mechanisms to flush Tx descriptors: a flush at the end of mvneta_tx()
and a timer. If a burst of packets is emitted faster than the device
can send them, then the queue is stopped until next wake-up of the
timer 10ms later. This causes jerky output traffic with bursts and
pauses, making it difficult to reach line rate with very few streams.

A test on UDP traffic shows that it's not possible to go beyond 134
Mbps / 12 kpps of outgoing traffic with 1500-byte IP packets. Routed
traffic tends to observe pauses as well if the traffic is bursty,
making it even burstier after the wake-up.

It seems that this feature was inherited from the original driver but
nothing there mentions any reason for not using the interrupt instead,
which the chip supports.

Thus, this patch enables Tx interrupts and removes the timer. It does
both at once because it's not really possible to make the two
mechanisms coexist, so a split patch doesn't make sense.

First tests performed on a Mirabox (Armada 370) show that less CPU
seems to be used when sending traffic. One reason might be that we now
call mvneta_tx_done_gbe() with a mask indicating which queues have
been done instead of looping over all of them.

The same UDP test above now happily reaches 987 Mbps / 87.7 kpps.
Single-stream TCP traffic can now more easily reach line rate. HTTP
transfers of 1 MB objects over a single connection went from 730 to
840 Mbps. It is even possible to go significantly higher (>900 Mbps)
by tweaking tcp_tso_win_divisor.

Cc: Thomas Petazzoni <thomas.p...@free-electrons.com>
Cc: Gregory CLEMENT <gregory...@free-electrons.com>
Cc: Arnaud Ebalard <ar...@natisbad.org>
Cc: Eric Dumazet <eric.d...@gmail.com>
Tested-by: Arnaud Ebalard <ar...@natisbad.org>
Signed-off-by: Willy Tarreau <w...@1wt.eu>
Signed-off-by: David S. Miller <da...@davemloft.net>
Signed-off-by: Willy Tarreau <w...@1wt.eu>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
drivers/net/ethernet/marvell/mvneta.c | 71 ++++++-----------------------------
1 file changed, 12 insertions(+), 59 deletions(-)

diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c
index a10f156430ca..ab630ca3c757 100644
--- a/drivers/net/ethernet/marvell/mvneta.c
+++ b/drivers/net/ethernet/marvell/mvneta.c
@@ -216,9 +216,6 @@
#define MVNETA_RX_COAL_PKTS 32
#define MVNETA_RX_COAL_USEC 100

-/* Timer */
-#define MVNETA_TX_DONE_TIMER_PERIOD 10
-
/* Napi polling weight */
#define MVNETA_RX_POLL_WEIGHT 64

@@ -274,16 +271,11 @@ struct mvneta_port {
void __iomem *base;
struct mvneta_rx_queue *rxqs;
struct mvneta_tx_queue *txqs;
- struct timer_list tx_done_timer;
struct net_device *dev;

u32 cause_rx_tx;
struct napi_struct napi;

- /* Flags */
- unsigned long flags;
-#define MVNETA_F_TX_DONE_TIMER_BIT 0
-
/* Napi weight */
int weight;

@@ -1116,17 +1108,6 @@ static void mvneta_tx_done_pkts_coal_set(struct mvneta_port *pp,
txq->done_pkts_coal = value;
}

-/* Trigger tx done timer in MVNETA_TX_DONE_TIMER_PERIOD msecs */
-static void mvneta_add_tx_done_timer(struct mvneta_port *pp)
-{
- if (test_and_set_bit(MVNETA_F_TX_DONE_TIMER_BIT, &pp->flags) == 0) {
- pp->tx_done_timer.expires = jiffies +
- msecs_to_jiffies(MVNETA_TX_DONE_TIMER_PERIOD);
- add_timer(&pp->tx_done_timer);
- }
-}
-
-
/* Handle rx descriptor fill by setting buf_cookie and buf_phys_addr */
static void mvneta_rx_desc_fill(struct mvneta_rx_desc *rx_desc,
u32 phys_addr, u32 cookie)
@@ -1618,15 +1599,6 @@ out:
dev_kfree_skb_any(skb);
}

- if (txq->count >= MVNETA_TXDONE_COAL_PKTS)
- mvneta_txq_done(pp, txq);
-
- /* If after calling mvneta_txq_done, count equals
- * frags, we need to set the timer
- */
- if (txq->count == frags && frags > 0)
- mvneta_add_tx_done_timer(pp);
-
return NETDEV_TX_OK;
}

@@ -1902,14 +1874,22 @@ static int mvneta_poll(struct napi_struct *napi, int budget)

/* Read cause register */
cause_rx_tx = mvreg_read(pp, MVNETA_INTR_NEW_CAUSE) &
- MVNETA_RX_INTR_MASK(rxq_number);
+ (MVNETA_RX_INTR_MASK(rxq_number) | MVNETA_TX_INTR_MASK(txq_number));
+
+ /* Release Tx descriptors */
+ if (cause_rx_tx & MVNETA_TX_INTR_MASK_ALL) {
+ int tx_todo = 0;
+
+ mvneta_tx_done_gbe(pp, (cause_rx_tx & MVNETA_TX_INTR_MASK_ALL), &tx_todo);
+ cause_rx_tx &= ~MVNETA_TX_INTR_MASK_ALL;
+ }

/* For the case where the last mvneta_poll did not process all
* RX packets
*/
cause_rx_tx |= pp->cause_rx_tx;
if (rxq_number > 1) {
- while ((cause_rx_tx != 0) && (budget > 0)) {
+ while ((cause_rx_tx & MVNETA_RX_INTR_MASK_ALL) && (budget > 0)) {
int count;
struct mvneta_rx_queue *rxq;
/* get rx queue number from cause_rx_tx */
@@ -1941,7 +1921,7 @@ static int mvneta_poll(struct napi_struct *napi, int budget)
napi_complete(napi);
local_irq_save(flags);
mvreg_write(pp, MVNETA_INTR_NEW_MASK,
- MVNETA_RX_INTR_MASK(rxq_number));
+ MVNETA_RX_INTR_MASK(rxq_number) | MVNETA_TX_INTR_MASK(txq_number));
local_irq_restore(flags);
}

@@ -1949,26 +1929,6 @@ static int mvneta_poll(struct napi_struct *napi, int budget)
return rx_done;
}

-/* tx done timer callback */
-static void mvneta_tx_done_timer_callback(unsigned long data)
-{
- struct net_device *dev = (struct net_device *)data;
- struct mvneta_port *pp = netdev_priv(dev);
- int tx_done = 0, tx_todo = 0;
-
- if (!netif_running(dev))
- return ;
-
- clear_bit(MVNETA_F_TX_DONE_TIMER_BIT, &pp->flags);
-
- tx_done = mvneta_tx_done_gbe(pp,
- (((1 << txq_number) - 1) &
- MVNETA_CAUSE_TXQ_SENT_DESC_ALL_MASK),
- &tx_todo);
- if (tx_todo > 0)
- mvneta_add_tx_done_timer(pp);
-}
-
/* Handle rxq fill: allocates rxq skbs; called when initializing a port */
static int mvneta_rxq_fill(struct mvneta_port *pp, struct mvneta_rx_queue *rxq,
int num)
@@ -2218,7 +2178,7 @@ static void mvneta_start_dev(struct mvneta_port *pp)

/* Unmask interrupts */
mvreg_write(pp, MVNETA_INTR_NEW_MASK,
- MVNETA_RX_INTR_MASK(rxq_number));
+ MVNETA_RX_INTR_MASK(rxq_number) | MVNETA_TX_INTR_MASK(txq_number));

phy_start(pp->phy_dev);
netif_tx_start_all_queues(pp->dev);
@@ -2494,8 +2454,6 @@ static int mvneta_stop(struct net_device *dev)
free_irq(dev->irq, pp);
mvneta_cleanup_rxqs(pp);
mvneta_cleanup_txqs(pp);
- del_timer(&pp->tx_done_timer);
- clear_bit(MVNETA_F_TX_DONE_TIMER_BIT, &pp->flags);

return 0;
}
@@ -2831,11 +2789,6 @@ static int mvneta_probe(struct platform_device *pdev)
}
}

- pp->tx_done_timer.data = (unsigned long)dev;
- pp->tx_done_timer.function = mvneta_tx_done_timer_callback;
- init_timer(&pp->tx_done_timer);
- clear_bit(MVNETA_F_TX_DONE_TIMER_BIT, &pp->flags);
-
pp->tx_ring_size = MVNETA_MAX_TXD;
pp->rx_ring_size = MVNETA_MAX_RXD;

Luis Henriques

Aug 18, 2014, 6:00:03 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Daniel Borkmann <dbor...@redhat.com>

commit 1be9a950c646c9092fb3618197f7b6bfb50e82aa upstream.

Jason reported an oops caused by SCTP on his ARM machine with
SCTP authentication enabled:

Internal error: Oops: 17 [#1] ARM
CPU: 0 PID: 104 Comm: sctp-test Not tainted 3.13.0-68744-g3632f30c9b20-dirty #1
task: c6eefa40 ti: c6f52000 task.ti: c6f52000
PC is at sctp_auth_calculate_hmac+0xc4/0x10c
LR is at sg_init_table+0x20/0x38
pc : [<c024bb80>] lr : [<c00f32dc>] psr: 40000013
sp : c6f538e8 ip : 00000000 fp : c6f53924
r10: c6f50d80 r9 : 00000000 r8 : 00010000
r7 : 00000000 r6 : c7be4000 r5 : 00000000 r4 : c6f56254
r3 : c00c8170 r2 : 00000001 r1 : 00000008 r0 : c6f1e660
Flags: nZcv IRQs on FIQs on Mode SVC_32 ISA ARM Segment user
Control: 0005397f Table: 06f28000 DAC: 00000015
Process sctp-test (pid: 104, stack limit = 0xc6f521c0)
Stack: (0xc6f538e8 to 0xc6f54000)
[...]
Backtrace:
[<c024babc>] (sctp_auth_calculate_hmac+0x0/0x10c) from [<c0249af8>] (sctp_packet_transmit+0x33c/0x5c8)
[<c02497bc>] (sctp_packet_transmit+0x0/0x5c8) from [<c023e96c>] (sctp_outq_flush+0x7fc/0x844)
[<c023e170>] (sctp_outq_flush+0x0/0x844) from [<c023ef78>] (sctp_outq_uncork+0x24/0x28)
[<c023ef54>] (sctp_outq_uncork+0x0/0x28) from [<c0234364>] (sctp_side_effects+0x1134/0x1220)
[<c0233230>] (sctp_side_effects+0x0/0x1220) from [<c02330b0>] (sctp_do_sm+0xac/0xd4)
[<c0233004>] (sctp_do_sm+0x0/0xd4) from [<c023675c>] (sctp_assoc_bh_rcv+0x118/0x160)
[<c0236644>] (sctp_assoc_bh_rcv+0x0/0x160) from [<c023d5bc>] (sctp_inq_push+0x6c/0x74)
[<c023d550>] (sctp_inq_push+0x0/0x74) from [<c024a6b0>] (sctp_rcv+0x7d8/0x888)

While we already had various kind of bugs in that area
ec0223ec48a9 ("net: sctp: fix sctp_sf_do_5_1D_ce to verify if
we/peer is AUTH capable") and b14878ccb7fa ("net: sctp: cache
auth_enable per endpoint"), this one is a bit of a different
kind.

Giving a bit more background on why SCTP authentication is
needed can be found in RFC4895:

SCTP uses 32-bit verification tags to protect itself against
blind attackers. These values are not changed during the
lifetime of an SCTP association.

Looking at new SCTP extensions, there is the need to have a
method of proving that an SCTP chunk(s) was really sent by
the original peer that started the association and not by a
malicious attacker.

To cause this bug, we're triggering an INIT collision between
peers; a normal SCTP handshake where both sides intend to
authenticate packets contains RANDOM; CHUNKS; HMAC-ALGO
parameters that are being negotiated among peers:

---------- INIT[RANDOM; CHUNKS; HMAC-ALGO] ---------->
<------- INIT-ACK[RANDOM; CHUNKS; HMAC-ALGO] ---------
-------------------- COOKIE-ECHO -------------------->
<-------------------- COOKIE-ACK ---------------------

RFC4895 says that each endpoint therefore knows its own random
number and the peer's random number *after* the association
has been established. The local and peer's random number along
with the shared key are then part of the secret used for
calculating the HMAC in the AUTH chunk.

Now, in our scenario, we have 2 threads with 1 non-blocking
SEQ_PACKET socket each, setting up common shared SCTP_AUTH_KEY
and SCTP_AUTH_ACTIVE_KEY properly, and each of them calling
sctp_bindx(3), listen(2) and connect(2) against each other,
thus the handshake looks similar to this, e.g.:

---------- INIT[RANDOM; CHUNKS; HMAC-ALGO] ---------->
<------- INIT-ACK[RANDOM; CHUNKS; HMAC-ALGO] ---------
<--------- INIT[RANDOM; CHUNKS; HMAC-ALGO] -----------
-------- INIT-ACK[RANDOM; CHUNKS; HMAC-ALGO] -------->
...

Since such collisions can also happen with verification tags,
the RFC4895 for AUTH rather vaguely says under section 6.1:

In case of INIT collision, the rules governing the handling
of this Random Number follow the same pattern as those for
the Verification Tag, as explained in Section 5.2.4 of
RFC 2960 [5]. Therefore, each endpoint knows its own Random
Number and the peer's Random Number after the association
has been established.

In RFC2960, section 5.2.4, we're eventually hitting Action B:

B) In this case, both sides may be attempting to start an
association at about the same time but the peer endpoint
started its INIT after responding to the local endpoint's
INIT. Thus it may have picked a new Verification Tag not
being aware of the previous Tag it had sent this endpoint.
The endpoint should stay in or enter the ESTABLISHED
state but it MUST update its peer's Verification Tag from
the State Cookie, stop any init or cookie timers that may be
running and send a COOKIE ACK.

In other words, the handling of the Random parameter is the
same as behavior for the Verification Tag as described in
Action B of section 5.2.4.

Looking at the code, we exactly hit the sctp_sf_do_dupcook_b()
case which triggers an SCTP_CMD_UPDATE_ASSOC command to the
side effect interpreter, and in fact it properly copies over
peer_{random, hmacs, chunks} parameters from the newly created
association to update the existing one.

Also, the old asoc_shared_key is being released and based on
the new params, sctp_auth_asoc_init_active_key() updated.
However, the issue observed in this case is that the previous
asoc->peer.auth_capable was 0, and has *not* been updated, so
that instead of creating a new secret, we're doing an early
return from the function sctp_auth_asoc_init_active_key()
leaving asoc->asoc_shared_key as NULL. However, we now have to
authenticate chunks from the updated chunk list (e.g. COOKIE-ACK).
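
To make the failure mode concrete, the early return being described
has roughly this shape (a simplified sketch of
sctp_auth_asoc_init_active_key(), not the verbatim kernel function):

int sctp_auth_asoc_init_active_key(struct sctp_association *asoc, gfp_t gfp)
{
	/* Peer never marked AUTH-capable: no secret is (re)computed and
	 * asoc->asoc_shared_key stays NULL -- exactly the state left
	 * behind by the collision update described above.
	 */
	if (!asoc->peer.auth_capable)
		return 0;

	/* ... otherwise derive the secret from the local and peer RANDOM
	 * parameters plus the shared key and store it in
	 * asoc->asoc_shared_key (details elided) ...
	 */
	return 0;
}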

That in fact causes the server side when responding with ...

<------------------ AUTH; COOKIE-ACK -----------------

... to trigger a NULL pointer dereference, since in
sctp_packet_transmit(), it discovers that an AUTH chunk is
being queued for xmit, and thus it calls sctp_auth_calculate_hmac().

Since the asoc->active_key_id is still inherited from the
endpoint, and the same as encoded into the chunk, it uses
asoc->asoc_shared_key, which is still NULL, as an asoc_key
and dereferences it in ...

crypto_hash_setkey(desc.tfm, &asoc_key->data[0], asoc_key->len)

... causing an oops. All this happens because sctp_make_cookie_ack()
called with the *new* association has the peer.auth_capable=1
and therefore marks the chunk with auth=1 after checking
sctp_auth_send_cid(), but it is *actually* sent later on over
the then *updated* association's transport that didn't initialize
its shared key due to peer.auth_capable=0. Since control chunks
in that case are not sent by the temporary associations, which
are scheduled for deletion, they are issued for xmit via
SCTP_CMD_REPLY in the interpreter with the context of the
*updated* association. peer.auth_capable was 0 in the updated
association (which went from COOKIE_WAIT into ESTABLISHED state),
since all previous processing that performed sctp_process_init()
was being done on temporary associations, which we eventually
throw away each time.

The correct fix is to update to the new peer.auth_capable
value as well in the collision case via sctp_assoc_update(),
so that in case the collision migrated from 0 -> 1,
sctp_auth_asoc_init_active_key() can properly recalculate
the secret. This therefore fixes the observed server panic.

Fixes: 730fc3d05cd4 ("[SCTP]: Implete SCTP-AUTH parameter processing")
Reported-by: Jason Gunthorpe <jgunt...@obsidianresearch.com>
Signed-off-by: Daniel Borkmann <dbor...@redhat.com>
Tested-by: Jason Gunthorpe <jgunt...@obsidianresearch.com>
Cc: Vlad Yasevich <vyas...@gmail.com>
Acked-by: Vlad Yasevich <vyas...@gmail.com>
Signed-off-by: David S. Miller <da...@davemloft.net>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
net/sctp/associola.c | 1 +
1 file changed, 1 insertion(+)

diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 619c781e6691..2400f9585ef8 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -1204,6 +1204,7 @@ void sctp_assoc_update(struct sctp_association *asoc,
asoc->c = new->c;
asoc->peer.rwnd = new->peer.rwnd;
asoc->peer.sack_needed = new->peer.sack_needed;
+ asoc->peer.auth_capable = new->peer.auth_capable;
asoc->peer.i = new->peer.i;
sctp_tsnmap_init(&asoc->peer.tsn_map, SCTP_TSN_MAP_INITIAL,
asoc->peer.i.initial_tsn, GFP_ATOMIC);

Luis Henriques

Aug 18, 2014, 6:00:03 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Guenter Roeck <li...@roeck-us.net>

commit 043572d5444116b9d9ad8ae763cf069e7accbc30 upstream.

Temperature limit clamps are applied after converting the temperature
from milli-degrees C to degrees C, so either the clamp limit needs
to be specified in degrees C, not milli-degrees C, or clamping must
happen before converting to degrees C. Use the latter method to avoid
overflows.

vrm is a u8, so the written value needs to be limited to [0, 255].
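
As a worked illustration (hypothetical input, not taken from a real
sensor), consider writing a temperature limit of 1000000, i.e. 1000
degrees C expressed in milli-degrees:

  old order:  SCALE(1000000, 1, 1000) = 1000
              clamp_val(1000, -128000, 127000) = 1000
              (s8)1000 = -24          <- bogus negative limit stored

  new order:  clamp_val(1000000, -128000, 127000) = 127000
              SCALE(127000, 1, 1000) = 127   <- saturates sanely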

Cc: Axel Lin <axel...@ingics.com>
Signed-off-by: Guenter Roeck <li...@roeck-us.net>
Reviewed-by: Jean Delvare <jdel...@suse.de>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
drivers/hwmon/smsc47m192.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/hwmon/smsc47m192.c b/drivers/hwmon/smsc47m192.c
index efee4c59239f..34b9a601ad07 100644
--- a/drivers/hwmon/smsc47m192.c
+++ b/drivers/hwmon/smsc47m192.c
@@ -86,7 +86,7 @@ static inline u8 IN_TO_REG(unsigned long val, int n)
*/
static inline s8 TEMP_TO_REG(int val)
{
- return clamp_val(SCALE(val, 1, 1000), -128000, 127000);
+ return SCALE(clamp_val(val, -128000, 127000), 1, 1000);
}

static inline int TEMP_FROM_REG(s8 val)
@@ -384,6 +384,8 @@ static ssize_t set_vrm(struct device *dev, struct device_attribute *attr,
err = kstrtoul(buf, 10, &val);
if (err)
return err;
+ if (val > 255)
+ return -EINVAL;

data->vrm = val;
return count;

Luis Henriques

Aug 18, 2014, 6:00:03 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Yuchung Cheng <ych...@google.com>

commit 6e08d5e3c8236e7484229e46fdf92006e1dd4c49 upstream.

The undo code assumes that, upon entering loss recovery, TCP
1) always retransmit something
2) the retransmission never fails locally (e.g., qdisc drop)

so undo_marker is set in tcp_enter_recovery() and undo_retrans is
incremented only when tcp_retransmit_skb() is successful.

When the assumption is broken because TCP's cwnd is too small to
retransmit or the retransmit fails locally, the next (DUP)ACK
would incorrectly revert the cwnd and the congestion state in
tcp_try_undo_dsack() or tcp_may_undo(). Subsequent (DUP)ACKs
may then re-enter the recovery state. The sender repeatedly enters
and (incorrectly) exits recovery states if the retransmits continue
to fail locally while receiving (DUP)ACKs.

The fix is to initialize undo_retrans to -1 and start counting on
the first retransmission. Always increment undo_retrans even if the
retransmissions fail locally because they couldn't cause DSACKs to
undo the cwnd reduction.
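
One construct in the diff below deserves a note: the re-initialization
uses GCC's elided-middle-operand conditional, which expands as follows
(annotated copy of the assignment, not new logic):

/* GCC extension: "a ? : b" is shorthand for "a ? a : b" */
tp->undo_retrans = tp->retrans_out ? : -1;
	/* count from retrans_out when packets are in flight, otherwise
	 * park the counter at -1 ("nothing retransmitted yet") */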

Signed-off-by: Yuchung Cheng <ych...@google.com>
Signed-off-by: Neal Cardwell <ncar...@google.com>
Signed-off-by: David S. Miller <da...@davemloft.net>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
net/ipv4/tcp_input.c | 8 ++++----
net/ipv4/tcp_output.c | 6 ++++--
2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index ebafc8703c7a..7e150e67129e 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1064,7 +1064,7 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
}

/* D-SACK for already forgotten data... Do dumb counting. */
- if (dup_sack && tp->undo_marker && tp->undo_retrans &&
+ if (dup_sack && tp->undo_marker && tp->undo_retrans > 0 &&
!after(end_seq_0, prior_snd_una) &&
after(end_seq_0, tp->undo_marker))
tp->undo_retrans--;
@@ -1143,7 +1143,7 @@ static u8 tcp_sacktag_one(struct sock *sk,

/* Account D-SACK for retransmitted packet. */
if (dup_sack && (sacked & TCPCB_RETRANS)) {
- if (tp->undo_marker && tp->undo_retrans &&
+ if (tp->undo_marker && tp->undo_retrans > 0 &&
after(end_seq, tp->undo_marker))
tp->undo_retrans--;
if (sacked & TCPCB_SACKED_ACKED)
@@ -1837,7 +1837,7 @@ static void tcp_clear_retrans_partial(struct tcp_sock *tp)
tp->lost_out = 0;

tp->undo_marker = 0;
- tp->undo_retrans = 0;
+ tp->undo_retrans = -1;
}

void tcp_clear_retrans(struct tcp_sock *tp)
@@ -2602,7 +2602,7 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack)

tp->prior_ssthresh = 0;
tp->undo_marker = tp->snd_una;
- tp->undo_retrans = tp->retrans_out;
+ tp->undo_retrans = tp->retrans_out ? : -1;

if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
if (!ece_ack)
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index acc001eb3a55..6703a74ed3a0 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2433,8 +2433,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
if (!tp->retrans_stamp)
tp->retrans_stamp = TCP_SKB_CB(skb)->when;

- tp->undo_retrans += tcp_skb_pcount(skb);
-
/* snd_nxt is stored to detect loss of retransmitted segment,
* see tcp_input.c tcp_sacktag_write_queue().
*/
@@ -2442,6 +2440,10 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
} else {
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
}
+
+ if (tp->undo_retrans < 0)
+ tp->undo_retrans = 0;
+ tp->undo_retrans += tcp_skb_pcount(skb);
return err;

Luis Henriques

Aug 18, 2014, 6:00:03 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: "David S. Miller" <da...@davemloft.net>

commit aa3449ee9c87d9b7660dd1493248abcc57769e31 upstream.

Only the second argument, 'op', is signed.

Signed-off-by: David S. Miller <da...@davemloft.net>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
arch/sparc/kernel/sys32.S | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/sparc/kernel/sys32.S b/arch/sparc/kernel/sys32.S
index f7c72b6efc27..d066eb18650c 100644
--- a/arch/sparc/kernel/sys32.S
+++ b/arch/sparc/kernel/sys32.S
@@ -44,7 +44,7 @@ SIGN1(sys32_timer_settime, compat_sys_timer_settime, %o1)
SIGN1(sys32_io_submit, compat_sys_io_submit, %o1)
SIGN1(sys32_mq_open, compat_sys_mq_open, %o1)
SIGN1(sys32_select, compat_sys_select, %o0)
-SIGN3(sys32_futex, compat_sys_futex, %o1, %o2, %o5)
+SIGN1(sys32_futex, compat_sys_futex, %o1)
SIGN1(sys32_recvfrom, compat_sys_recvfrom, %o0)
SIGN1(sys32_recvmsg, compat_sys_recvmsg, %o0)
SIGN1(sys32_sendmsg, compat_sys_sendmsg, %o0)

Luis Henriques

Aug 18, 2014, 6:00:03 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Ben Hutchings <b...@decadent.org.uk>

commit 640d7efe4c08f06c4ae5d31b79bd8740e7f6790a upstream.

*_result[len] is parsed as *(_result[len]) which is not at all what we
want to touch here.
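
In other words, it is the usual precedence of [] binding tighter than
unary * (illustrative snippet, with a hypothetical len):

char **_result;
size_t len;

*_result[len];   /* == *(_result[len]): treats _result as an array of
                  * pointers and dereferences the len-th one */
(*_result)[len]; /* dereferences first, then indexes the buffer --
                  * the byte we actually want to terminate */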

Signed-off-by: Ben Hutchings <b...@decadent.org.uk>
Fixes: 84a7c0b1db1c ("dns_resolver: assure that dns_query() result is null-terminated")
Signed-off-by: David S. Miller <da...@davemloft.net>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
net/dns_resolver/dns_query.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/dns_resolver/dns_query.c b/net/dns_resolver/dns_query.c
index ede0e2d7412e..2022b46ab38f 100644
--- a/net/dns_resolver/dns_query.c
+++ b/net/dns_resolver/dns_query.c
@@ -151,7 +151,7 @@ int dns_query(const char *type, const char *name, size_t namelen,
goto put;

memcpy(*_result, upayload->data, len);
- *_result[len] = '\0';
+ (*_result)[len] = '\0';

if (_expiry)
*_expiry = rkey->expiry;

Luis Henriques

Aug 18, 2014, 6:00:03 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Malcolm Priestley <tvbo...@gmail.com>

commit 6cff1f6ad4c615319c1a146b2aa0af1043c5e9f5 upstream.

WARNING: CPU: 0 PID: 929 at /home/apw/COD/linux/kernel/irq/handle.c:147 handle_irq_event_percpu+0x1d1/0x1e0()
irq 17 handler device_intr+0x0/0xa80 [vt6655_stage] enabled interrupts

Using spin_lock_irqsave appears to fix this.
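
The underlying rule (generic sketch, not code from this driver):

spin_lock_irq(&lock);
/* ... critical section ... */
spin_unlock_irq(&lock);               /* unconditionally re-enables IRQs,
                                       * even when they were already off on
                                       * entry -- wrong in a hard-IRQ path,
                                       * hence the warning above */

unsigned long flags;
spin_lock_irqsave(&lock, flags);      /* remembers the current IRQ state */
/* ... critical section ... */
spin_unlock_irqrestore(&lock, flags); /* restores exactly that state */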

Signed-off-by: Malcolm Priestley <tvbo...@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gre...@linuxfoundation.org>
[ luis: backported to 3.11: adjusted context ]
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
drivers/staging/vt6655/device_main.c | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/staging/vt6655/device_main.c b/drivers/staging/vt6655/device_main.c
index 08b250f01dae..d170b6f9db7c 100644
--- a/drivers/staging/vt6655/device_main.c
+++ b/drivers/staging/vt6655/device_main.c
@@ -2434,6 +2434,7 @@ static irqreturn_t device_intr(int irq, void *dev_instance) {
int handled = 0;
unsigned char byData = 0;
int ii = 0;
+ unsigned long flags;
// unsigned char byRSSI;

MACvReadISR(pDevice->PortOffset, &pDevice->dwIsr);
@@ -2459,7 +2460,8 @@ static irqreturn_t device_intr(int irq, void *dev_instance) {

handled = 1;
MACvIntDisable(pDevice->PortOffset);
- spin_lock_irq(&pDevice->lock);
+
+ spin_lock_irqsave(&pDevice->lock, flags);

//Make sure current page is 0
VNSvInPortB(pDevice->PortOffset + MAC_REG_PAGE1SEL, &byOrgPageSel);
@@ -2700,7 +2702,8 @@ static irqreturn_t device_intr(int irq, void *dev_instance) {
MACvSelectPage1(pDevice->PortOffset);
}

- spin_unlock_irq(&pDevice->lock);
+ spin_unlock_irqrestore(&pDevice->lock, flags);
+
MACvIntEnable(pDevice->PortOffset, IMR_MASK_VALUE);

return IRQ_RETVAL(handled);

Luis Henriques

Aug 18, 2014, 6:00:03 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Hans de Goede <hdeg...@redhat.com>

commit 242841d3d71191348f98310e2d2001e1001d8630 upstream.

Tested-and-reported-by: yullaw <yul...@mageia.cz>

Signed-off-by: Hans de Goede <hdeg...@redhat.com>
Signed-off-by: Mauro Carvalho Chehab <m.ch...@samsung.com>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
drivers/media/usb/gspca/pac7302.c | 1 +
1 file changed, 1 insertion(+)

diff --git a/drivers/media/usb/gspca/pac7302.c b/drivers/media/usb/gspca/pac7302.c
index a91509643563..0d4be1d840ab 100644
--- a/drivers/media/usb/gspca/pac7302.c
+++ b/drivers/media/usb/gspca/pac7302.c
@@ -928,6 +928,7 @@ static const struct usb_device_id device_table[] = {
{USB_DEVICE(0x093a, 0x2620)},
{USB_DEVICE(0x093a, 0x2621)},
{USB_DEVICE(0x093a, 0x2622), .driver_info = FL_VFLIP},
+ {USB_DEVICE(0x093a, 0x2623), .driver_info = FL_VFLIP},
{USB_DEVICE(0x093a, 0x2624), .driver_info = FL_VFLIP},
{USB_DEVICE(0x093a, 0x2625)},
{USB_DEVICE(0x093a, 0x2626)},

Luis Henriques

Aug 18, 2014, 6:00:03 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Christoph Schulz <dev...@kristov.de>

commit a8a3e41c67d24eb12f9ab9680cbb85e24fcd9711 upstream.

The PPP channel MTU is used with Multilink PPP when ppp_mp_explode() (see
ppp_generic module) tries to determine how big a fragment might be. According
to RFC 1661, the MTU excludes the 2-byte PPP protocol field, see the
corresponding comment and code in ppp_mp_explode():

/*
* hdrlen includes the 2-byte PPP protocol field, but the
* MTU counts only the payload excluding the protocol field.
* (RFC1661 Section 2)
*/
mtu = pch->chan->mtu - (hdrlen - 2);

However, the pppoe module *does* include the PPP protocol field in the channel
MTU, which is wrong as it causes the PPP payload to be 1-2 bytes too big under
certain circumstances (one byte if PPP protocol compression is used, two
otherwise), causing the generated Ethernet packets to be dropped. So the pppoe
module has to subtract two bytes from the channel MTU. This error only
manifests itself when using Multilink PPP, as otherwise the channel MTU is not
used anywhere.

In the following, I will describe how to reproduce this bug. We configure two
pppd instances for multilink PPP over two PPPoE links, say eth2 and eth3, with
an MTU of 1492 bytes for each link and an MRRU of 2976 bytes. (This MRRU is
computed by adding the two link MTUs and subtracting the MP header twice, which
is 4 bytes long.) The necessary pppd statements on both sides are "multilink
mtu 1492 mru 1492 mrru 2976". On the client side, we additionally need "plugin
rp-pppoe.so eth2" and "plugin rp-pppoe.so eth3", respectively; on the server
side, we additionally need to start two pppoe-server instances to be able to
establish two PPPoE sessions, one over eth2 and one over eth3. We set the MTU
of the PPP network interface to the MRRU (2976) on both sides of the connection
in order to make use of the higher bandwidth. (If we didn't do that, IP
fragmentation would kick in, which we want to avoid.)

Now we send a ICMPv4 echo request with a payload of 2948 bytes from client to
server over the PPP link. This results in the following network packet:

2948 (echo payload)
+ 8 (ICMPv4 header)
+ 20 (IPv4 header)
---------------------
2976 (PPP payload)

These 2976 bytes do not exceed the MTU of the PPP network interface, so the
IP packet is not fragmented. Now the multilink PPP code in ppp_mp_explode()
prepends one protocol byte (0x21 for IPv4), making the packet one byte bigger
than the negotiated MRRU. So this packet would have to be divided into three
fragments. But this does not happen as each link MTU is assumed to be two bytes
larger. So this packet is divided into two fragments only, one of size 1489 and
one of size 1488. Now we have for that bigger fragment:

1489 (PPP payload)
+ 4 (MP header)
+ 2 (PPP protocol field for the MP payload (0x3d))
+ 6 (PPPoE header)
--------------------------
1501 (Ethernet payload)

This packet exceeds the link MTU and is discarded.
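
Summarizing the arithmetic (illustrative numbers, assuming the usual
Ethernet dev->mtu of 1500 and an hdrlen of 6 in ppp_mp_explode()):

buggy channel MTU:  1500 - 6 (PPPoE header)       = 1494
fixed channel MTU:  1500 - 6 - 2 (protocol field) = 1492

per-fragment payload, mtu = chan->mtu - (hdrlen - 2):
  buggy: 1494 - 4 = 1490 -> 2977 bytes fit in 2 fragments (1489 + 1488)
  fixed: 1492 - 4 = 1488 -> 2977 bytes need 3 fragments (1488 + 1488 + 1)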

If one configures the link MTU on the client side to 1501, one can see the
discarded Ethernet frames with tcpdump running on the client. A

ping -s 2948 -c 1 192.168.15.254

leads to the smaller fragment that is correctly received on the server side:

(tcpdump -vvvne -i eth3 pppoes and ppp proto 0x3d)
52:54:00:ad:87:fd > 52:54:00:79:5c:d0, ethertype PPPoE S (0x8864),
length 1514: PPPoE [ses 0x3] MLPPP (0x003d), length 1494: seq 0x000,
Flags [end], length 1492

and to the bigger fragment that is not received on the server side:

(tcpdump -vvvne -i eth2 pppoes and ppp proto 0x3d)
52:54:00:70:9e:89 > 52:54:00:5d:6f:b0, ethertype PPPoE S (0x8864),
length 1515: PPPoE [ses 0x5] MLPPP (0x003d), length 1495: seq 0x000,
Flags [begin], length 1493

With the patch below, we correctly obtain three fragments:

52:54:00:ad:87:fd > 52:54:00:79:5c:d0, ethertype PPPoE S (0x8864),
length 1514: PPPoE [ses 0x1] MLPPP (0x003d), length 1494: seq 0x000,
Flags [begin], length 1492
52:54:00:70:9e:89 > 52:54:00:5d:6f:b0, ethertype PPPoE S (0x8864),
length 1514: PPPoE [ses 0x1] MLPPP (0x003d), length 1494: seq 0x000,
Flags [none], length 1492
52:54:00:ad:87:fd > 52:54:00:79:5c:d0, ethertype PPPoE S (0x8864),
length 27: PPPoE [ses 0x1] MLPPP (0x003d), length 7: seq 0x000,
Flags [end], length 5

And the ICMPv4 echo request is successfully received at the server side:

IP (tos 0x0, ttl 64, id 21925, offset 0, flags [DF], proto ICMP (1),
length 2976)
192.168.222.2 > 192.168.15.254: ICMP echo request, id 30530, seq 0,
length 2956

The bug was introduced in commit c9aa6895371b2a257401f59d3393c9f7ac5a8698
("[PPPOE]: Advertise PPPoE MTU") from the very beginning. This patch applies
to 3.10 upwards but the fix can be applied (with minor modifications) to
kernels as old as 2.6.32.

Signed-off-by: Christoph Schulz <dev...@kristov.de>
Signed-off-by: David S. Miller <da...@davemloft.net>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
drivers/net/ppp/pppoe.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c
index 82ee6ed954cb..addd23246eb6 100644
--- a/drivers/net/ppp/pppoe.c
+++ b/drivers/net/ppp/pppoe.c
@@ -675,7 +675,7 @@ static int pppoe_connect(struct socket *sock, struct sockaddr *uservaddr,
po->chan.hdrlen = (sizeof(struct pppoe_hdr) +
dev->hard_header_len);

- po->chan.mtu = dev->mtu - sizeof(struct pppoe_hdr);
+ po->chan.mtu = dev->mtu - sizeof(struct pppoe_hdr) - 2;
po->chan.private = sk;
po->chan.ops = &pppoe_chan_ops;

Luis Henriques

unread,
Aug 18, 2014, 6:00:04 AM8/18/14
to
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Andrey Ryabinin <ryabin...@gmail.com>

commit 40eea803c6b2cfaab092f053248cbeab3f368412 upstream.

Sasha's report:
> While fuzzing with trinity inside a KVM tools guest running the latest -next
> kernel with the KASAN patchset, I've stumbled on the following spew:
>
> [ 4448.949424] ==================================================================
> [ 4448.951737] AddressSanitizer: user-memory-access on address 0
> [ 4448.952988] Read of size 2 by thread T19638:
> [ 4448.954510] CPU: 28 PID: 19638 Comm: trinity-c76 Not tainted 3.16.0-rc4-next-20140711-sasha-00046-g07d3099-dirty #813
> [ 4448.956823] ffff88046d86ca40 0000000000000000 ffff880082f37e78 ffff880082f37a40
> [ 4448.958233] ffffffffb6e47068 ffff880082f37a68 ffff880082f37a58 ffffffffb242708d
> [ 4448.959552] 0000000000000000 ffff880082f37a88 ffffffffb24255b1 0000000000000000
> [ 4448.961266] Call Trace:
> [ 4448.963158] dump_stack (lib/dump_stack.c:52)
> [ 4448.964244] kasan_report_user_access (mm/kasan/report.c:184)
> [ 4448.965507] __asan_load2 (mm/kasan/kasan.c:352)
> [ 4448.966482] ? netlink_sendmsg (net/netlink/af_netlink.c:2339)
> [ 4448.967541] netlink_sendmsg (net/netlink/af_netlink.c:2339)
> [ 4448.968537] ? get_parent_ip (kernel/sched/core.c:2555)
> [ 4448.970103] sock_sendmsg (net/socket.c:654)
> [ 4448.971584] ? might_fault (mm/memory.c:3741)
> [ 4448.972526] ? might_fault (./arch/x86/include/asm/current.h:14 mm/memory.c:3740)
> [ 4448.973596] ? verify_iovec (net/core/iovec.c:64)
> [ 4448.974522] ___sys_sendmsg (net/socket.c:2096)
> [ 4448.975797] ? put_lock_stats.isra.13 (./arch/x86/include/asm/preempt.h:98 kernel/locking/lockdep.c:254)
> [ 4448.977030] ? lock_release_holdtime (kernel/locking/lockdep.c:273)
> [ 4448.978197] ? lock_release_non_nested (kernel/locking/lockdep.c:3434 (discriminator 1))
> [ 4448.979346] ? check_chain_key (kernel/locking/lockdep.c:2188)
> [ 4448.980535] __sys_sendmmsg (net/socket.c:2181)
> [ 4448.981592] ? trace_hardirqs_on_caller (kernel/locking/lockdep.c:2600)
> [ 4448.982773] ? trace_hardirqs_on (kernel/locking/lockdep.c:2607)
> [ 4448.984458] ? syscall_trace_enter (arch/x86/kernel/ptrace.c:1500 (discriminator 2))
> [ 4448.985621] ? trace_hardirqs_on_caller (kernel/locking/lockdep.c:2600)
> [ 4448.986754] SyS_sendmmsg (net/socket.c:2201)
> [ 4448.987708] tracesys (arch/x86/kernel/entry_64.S:542)
> [ 4448.988929] ==================================================================

This report means that we've come to netlink_sendmsg() with msg->msg_name == NULL and msg->msg_namelen > 0.

After this report there was no usual "Unable to handle kernel NULL pointer dereference",
and this gave me a clue that address 0 is mapped and contains a valid socket address structure.

This bug was introduced in f3d3342602f8bcbf37d7c46641cb9bca7618eb1c
(net: rework recvmsg handler msg_name and msg_namelen logic).
Commit message states that:
"Set msg->msg_name = NULL if user specified a NULL in msg_name but had a
non-null msg_namelen in verify_iovec/verify_compat_iovec. This doesn't
affect sendto as it would bail out earlier while trying to copy-in the
address."
But in fact this affects sendto when address 0 is mapped and contains
a socket address structure. In such a case the address copy-in will succeed,
the verify_iovec() function will successfully exit with msg->msg_namelen > 0
and msg->msg_name == NULL.

This patch fixes it by setting msg_namelen to 0 if msg_name == NULL.
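
A minimal userspace sketch of the pre-fix hazard (hypothetical
reproducer, not from the report; it assumes vm.mmap_min_addr permits
mapping page 0, which normally requires privilege):

#define _GNU_SOURCE
#include <string.h>
#include <sys/mman.h>
#include <sys/socket.h>

int main(void)
{
	/* Map page 0 so it holds a plausible, caller-controlled sockaddr. */
	void *p = mmap((void *)0, 4096, PROT_READ | PROT_WRITE,
		       MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED)
		return 1;
	memset(p, 0, sizeof(struct sockaddr_storage));

	int fd = socket(AF_NETLINK, SOCK_RAW, 0);
	struct msghdr msg = {
		.msg_name    = NULL,	/* NULL name ...             */
		.msg_namelen = 16,	/* ... but a non-zero length */
	};

	/* Pre-fix: verify_iovec() leaves msg_namelen > 0, and
	 * netlink_sendmsg() ends up reading the sockaddr at address 0. */
	sendmsg(fd, &msg, 0);
	return 0;
}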

Cc: Hannes Frederic Sowa <han...@stressinduktion.org>
Cc: Eric Dumazet <edum...@google.com>
Reported-by: Sasha Levin <sasha...@oracle.com>
Signed-off-by: Andrey Ryabinin <a.rya...@samsung.com>
Acked-by: Hannes Frederic Sowa <han...@stressinduktion.org>
Signed-off-by: David S. Miller <da...@davemloft.net>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
net/compat.c | 9 +++++----
net/core/iovec.c | 6 +++---
2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/net/compat.c b/net/compat.c
index f50161fb812e..cbc1a2a26587 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -85,7 +85,7 @@ int verify_compat_iovec(struct msghdr *kern_msg, struct iovec *kern_iov,
{
int tot_len;

- if (kern_msg->msg_namelen) {
+ if (kern_msg->msg_name && kern_msg->msg_namelen) {
if (mode == VERIFY_READ) {
int err = move_addr_to_kernel(kern_msg->msg_name,
kern_msg->msg_namelen,
@@ -93,10 +93,11 @@ int verify_compat_iovec(struct msghdr *kern_msg, struct iovec *kern_iov,
if (err < 0)
return err;
}
- if (kern_msg->msg_name)
- kern_msg->msg_name = kern_address;
- } else
+ kern_msg->msg_name = kern_address;
+ } else {
kern_msg->msg_name = NULL;
+ kern_msg->msg_namelen = 0;
+ }

tot_len = iov_from_user_compat_to_kern(kern_iov,
(struct compat_iovec __user *)kern_msg->msg_iov,
diff --git a/net/core/iovec.c b/net/core/iovec.c
index 9a31515fb8e3..2145b7150beb 100644
--- a/net/core/iovec.c
+++ b/net/core/iovec.c
@@ -39,7 +39,7 @@ int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr_storage *a
{
int size, ct, err;

- if (m->msg_namelen) {
+ if (m->msg_name && m->msg_namelen) {
if (mode == VERIFY_READ) {
void __user *namep;
namep = (void __user __force *) m->msg_name;
@@ -48,10 +48,10 @@ int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr_storage *a
if (err < 0)
return err;
}
- if (m->msg_name)
- m->msg_name = address;
+ m->msg_name = address;
} else {
m->msg_name = NULL;
+ m->msg_namelen = 0;
}

size = m->msg_iovlen * sizeof(struct iovec);

Luis Henriques

Aug 18, 2014, 6:00:04 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Dmitry Torokhov <dt...@chromium.org>

commit 50c5d36dab930b1f1b1e3348b8608aa8b9ee7610 upstream.

We attempt to remove noise from coordinates reported by devices in
input_handle_abs_event(), unfortunately, unless we were dropping the
event altogether, we were ignoring the adjusted value and were passing
on the original value instead.

Reviewed-by: Andrew de los Reyes <ad...@chromium.org>
Reviewed-by: Benson Leung <ble...@chromium.org>
Reviewed-by: David Herrmann <dh.he...@gmail.com>
Reviewed-by: Henrik Rydberg <ryd...@euromail.se>
Signed-off-by: Dmitry Torokhov <dmitry....@gmail.com>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
drivers/input/input.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/input/input.c b/drivers/input/input.c
index 66984e272c45..a161021c4526 100644
--- a/drivers/input/input.c
+++ b/drivers/input/input.c
@@ -257,9 +257,10 @@ static int input_handle_abs_event(struct input_dev *dev,
}

static int input_get_disposition(struct input_dev *dev,
- unsigned int type, unsigned int code, int value)
+ unsigned int type, unsigned int code, int *pval)
{
int disposition = INPUT_IGNORE_EVENT;
+ int value = *pval;

switch (type) {

@@ -357,6 +358,7 @@ static int input_get_disposition(struct input_dev *dev,
break;
}

+ *pval = value;
return disposition;
}

@@ -365,7 +367,7 @@ static void input_handle_event(struct input_dev *dev,
{
int disposition;

- disposition = input_get_disposition(dev, type, code, value);
+ disposition = input_get_disposition(dev, type, code, &value);

if ((disposition & INPUT_PASS_TO_DEVICE) && dev->event)
dev->event(dev, type, code, value);

Luis Henriques

Aug 18, 2014, 6:00:04 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Vasily Averin <v...@parallels.com>

commit 295dc39d941dc2ae53d5c170365af4c9d5c16212 upstream.

Currently umount on symlink blocks following umount:

/vz is separate mount

drwxr-xr-x. 2 root root 4096 Jul 19 01:14 testdir
lrwxrwxrwx. 1 root root 11 Jul 19 01:16 testlink -> /vz/testdir
umount: /vz/testlink: not mounted (expected)

umount: /vz: device is busy. (unexpected)

In this case mountpoint_last() gets an extra refcount on path->mnt

Signed-off-by: Vasily Averin <v...@openvz.org>
Acked-by: Ian Kent <ra...@themaw.net>
Acked-by: Jeff Layton <jla...@primarydata.com>
Signed-off-by: Christoph Hellwig <h...@lst.de>
Cc: Neil Brown <ne...@suse.de>
[ luis: backported to 3.11: adjusted context ]
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
fs/namei.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/namei.c b/fs/namei.c
index a10bd2f8b66b..2d36c4651627 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2260,10 +2260,11 @@ error_check:
dput(dentry);
} else {
path->dentry = dentry;
- path->mnt = mntget(nd->path.mnt);
+ path->mnt = nd->path.mnt;
if (should_follow_link(dentry->d_inode,
nd->flags & LOOKUP_FOLLOW))
return 1;
+ mntget(path->mnt);
follow_mount(path);

Luis Henriques

Aug 18, 2014, 6:00:04 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Suresh Reddy <Suresh...@emulex.com>

commit 4cad9f3b61c7268fa89ab8096e23202300399b5d upstream.

On BE3, if the clear-interrupt bit of the EQ doorbell is not set the first
time it is armed, we have occasionally observed that the EQ doesn't raise
any more interrupts even if it is in the armed state.
This patch fixes this by setting the clear-interrupt bit when EQs are
armed for the first time in be_open().
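
For context, a sketch of the call as used in this driver -- the third
and fourth arguments are the arm and clear-interrupt flags (treat the
parameter names as descriptive, not a verified prototype):

/* be_eq_notify(adapter, qid, arm, clear_int, num_popped) */
be_eq_notify(adapter, eqo->q.id, true, false, 0);	/* old: arm only              */
be_eq_notify(adapter, eqo->q.id, true, true, 0);	/* new: arm + clear-interrupt */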

Signed-off-by: Suresh Reddy <Suresh...@emulex.com>
Signed-off-by: Sathya Perla <sathya...@emulex.com>
Signed-off-by: David S. Miller <da...@davemloft.net>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
drivers/net/ethernet/emulex/benet/be_main.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c
index 3d91a5ec61a4..8dee3d9835dd 100644
--- a/drivers/net/ethernet/emulex/benet/be_main.c
+++ b/drivers/net/ethernet/emulex/benet/be_main.c
@@ -2665,7 +2665,7 @@ static int be_open(struct net_device *netdev)

for_all_evt_queues(adapter, eqo, i) {
napi_enable(&eqo->napi);
- be_eq_notify(adapter, eqo->q.id, true, false, 0);
+ be_eq_notify(adapter, eqo->q.id, true, true, 0);
}
adapter->flags |= BE_FLAGS_NAPI_ENABLED;

Luis Henriques

Aug 18, 2014, 6:00:04 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Tony Luck <tony...@intel.com>

commit 58d4e21e50ff3cc57910a8abc20d7e14375d2f61 upstream.

The "uptime" trace clock added in:

commit 8aacf017b065a805d27467843490c976835eb4a5
tracing: Add "uptime" trace clock that uses jiffies

has wraparound problems when the system has been up more
than 1 hour 11 minutes and 34 seconds. It converts jiffies
to nanoseconds using:
(u64)jiffies_to_usecs(jiffy) * 1000ULL
but since jiffies_to_usecs() only returns a 32-bit value, it
truncates at 2^32 microseconds. An additional problem on 32-bit
systems is that the argument is "unsigned long", so fixing the
return value only helps until 2^32 jiffies (49.7 days on a HZ=1000
system).
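
The two wrap points, worked out (assuming HZ=1000, as in the text above):

2^32 us = 4294967296 us ~= 4294.97 s ~= 1 h 11 min 34.97 s
    (where the 32-bit return value of jiffies_to_usecs() truncates)

2^32 jiffies at HZ=1000 = 4294967.296 s ~= 49.7 days
    (where a 32-bit "unsigned long" jiffy argument wraps)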

Avoid these problems by using jiffies_64 as our basis, and
not converting to nanoseconds (we do convert to clock_t because
user facing API must not be dependent on internal kernel
HZ values).

Link: http://lkml.kernel.org/p/99d63c5bfe9b320a3b428d773825a370...@intel.com

Fixes: 8aacf017b065 "tracing: Add "uptime" trace clock that uses jiffies"
Signed-off-by: Tony Luck <tony...@intel.com>
Signed-off-by: Steven Rostedt <ros...@goodmis.org>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
kernel/trace/trace.c | 2 +-
kernel/trace/trace_clock.c | 9 +++++----
2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 3ef771112cde..e0071b0c78bf 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -764,7 +764,7 @@ static struct {
{ trace_clock_local, "local", 1 },
{ trace_clock_global, "global", 1 },
{ trace_clock_counter, "counter", 0 },
- { trace_clock_jiffies, "uptime", 1 },
+ { trace_clock_jiffies, "uptime", 0 },
{ trace_clock, "perf", 1 },
ARCH_TRACE_CLOCKS
};
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 26dc348332b7..57b67b1f24d1 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -59,13 +59,14 @@ u64 notrace trace_clock(void)

/*
* trace_jiffy_clock(): Simply use jiffies as a clock counter.
+ * Note that this use of jiffies_64 is not completely safe on
+ * 32-bit systems. But the window is tiny, and the effect if
+ * we are affected is that we will have an obviously bogus
+ * timestamp on a trace event - i.e. not life threatening.
*/
u64 notrace trace_clock_jiffies(void)
{
- u64 jiffy = jiffies - INITIAL_JIFFIES;
-
- /* Return nsecs */
- return (u64)jiffies_to_usecs(jiffy) * 1000ULL;
+ return jiffies_64_to_clock_t(jiffies_64 - INITIAL_JIFFIES);
}

/*

Luis Henriques

Aug 18, 2014, 6:00:04 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Lars-Peter Clausen <la...@metafoo.de>

commit 61bd55ce1667809f022be88da77db17add90ea4e upstream.

When creating the demux table we need to iterate over the selected scan mask for
the buffer to get the samples which should be copied to destination buffer.
Right now the code uses the mask which contains all active channels, which means
the demux table contains entries which causes it to copy all the samples from
source to destination buffer one by one without doing any demuxing.

Signed-off-by: Lars-Peter Clausen <la...@metafoo.de>
Signed-off-by: Jonathan Cameron <ji...@kernel.org>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
drivers/iio/industrialio-buffer.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/iio/industrialio-buffer.c b/drivers/iio/industrialio-buffer.c
index 54adea8f0134..45f5c0fd18f5 100644
--- a/drivers/iio/industrialio-buffer.c
+++ b/drivers/iio/industrialio-buffer.c
@@ -847,7 +847,7 @@ static int iio_buffer_update_demux(struct iio_dev *indio_dev,

/* Now we have the two masks, work from least sig and build up sizes */
for_each_set_bit(out_ind,
- indio_dev->active_scan_mask,
+ buffer->scan_mask,
indio_dev->masklength) {
in_ind = find_next_bit(indio_dev->active_scan_mask,
indio_dev->masklength,

Luis Henriques

Aug 18, 2014, 6:00:05 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: willy tarreau <w...@1wt.eu>

commit 40ba35e74fa56866918d2f3bc0528b5b92725d5e upstream.

Marvell has not published the chip's datasheet yet, so it's very hard
to find the relevant bits to manipulate to change the IRQ behaviour.
Fortunately, these bits are described in the proprietary LSP patch set
which is publicly available here :

http://www.plugcomputer.org/downloads/mirabox/

So let's put them back in the driver in order to reduce the burden of
current and future maintenance.

Cc: Thomas Petazzoni <thomas.p...@free-electrons.com>
Cc: Gregory CLEMENT <gregory...@free-electrons.com>
Tested-by: Arnaud Ebalard <ar...@natisbad.org>
Signed-off-by: Willy Tarreau <w...@1wt.eu>
Signed-off-by: David S. Miller <da...@davemloft.net>
Signed-off-by: Willy Tarreau <w...@1wt.eu>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
drivers/net/ethernet/marvell/mvneta.c | 44 +++++++++++++++++++++++++++++++++--
1 file changed, 42 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c
index f72112294750..a10f156430ca 100644
--- a/drivers/net/ethernet/marvell/mvneta.c
+++ b/drivers/net/ethernet/marvell/mvneta.c
@@ -101,16 +101,56 @@
#define MVNETA_CPU_RXQ_ACCESS_ALL_MASK 0x000000ff
#define MVNETA_CPU_TXQ_ACCESS_ALL_MASK 0x0000ff00
#define MVNETA_RXQ_TIME_COAL_REG(q) (0x2580 + ((q) << 2))
+
+/* Exception Interrupt Port/Queue Cause register */
+
#define MVNETA_INTR_NEW_CAUSE 0x25a0
-#define MVNETA_RX_INTR_MASK(nr_rxqs) (((1 << nr_rxqs) - 1) << 8)
#define MVNETA_INTR_NEW_MASK 0x25a4
+
+/* bits 0..7 = TXQ SENT, one bit per queue.
+ * bits 8..15 = RXQ OCCUP, one bit per queue.
+ * bits 16..23 = RXQ FREE, one bit per queue.
+ * bit 29 = OLD_REG_SUM, see old reg ?
+ * bit 30 = TX_ERR_SUM, one bit for 4 ports
+ * bit 31 = MISC_SUM, one bit for 4 ports
+ */
+#define MVNETA_TX_INTR_MASK(nr_txqs) (((1 << nr_txqs) - 1) << 0)
+#define MVNETA_TX_INTR_MASK_ALL (0xff << 0)
+#define MVNETA_RX_INTR_MASK(nr_rxqs) (((1 << nr_rxqs) - 1) << 8)
+#define MVNETA_RX_INTR_MASK_ALL (0xff << 8)
+
#define MVNETA_INTR_OLD_CAUSE 0x25a8
#define MVNETA_INTR_OLD_MASK 0x25ac
+
+/* Data Path Port/Queue Cause Register */
#define MVNETA_INTR_MISC_CAUSE 0x25b0
#define MVNETA_INTR_MISC_MASK 0x25b4
+
+#define MVNETA_CAUSE_PHY_STATUS_CHANGE BIT(0)
+#define MVNETA_CAUSE_LINK_CHANGE BIT(1)
+#define MVNETA_CAUSE_PTP BIT(4)
+
+#define MVNETA_CAUSE_INTERNAL_ADDR_ERR BIT(7)
+#define MVNETA_CAUSE_RX_OVERRUN BIT(8)
+#define MVNETA_CAUSE_RX_CRC_ERROR BIT(9)
+#define MVNETA_CAUSE_RX_LARGE_PKT BIT(10)
+#define MVNETA_CAUSE_TX_UNDERUN BIT(11)
+#define MVNETA_CAUSE_PRBS_ERR BIT(12)
+#define MVNETA_CAUSE_PSC_SYNC_CHANGE BIT(13)
+#define MVNETA_CAUSE_SERDES_SYNC_ERR BIT(14)
+
+#define MVNETA_CAUSE_BMU_ALLOC_ERR_SHIFT 16
+#define MVNETA_CAUSE_BMU_ALLOC_ERR_ALL_MASK (0xF << MVNETA_CAUSE_BMU_ALLOC_ERR_SHIFT)
+#define MVNETA_CAUSE_BMU_ALLOC_ERR_MASK(pool) (1 << (MVNETA_CAUSE_BMU_ALLOC_ERR_SHIFT + (pool)))
+
+#define MVNETA_CAUSE_TXQ_ERROR_SHIFT 24
+#define MVNETA_CAUSE_TXQ_ERROR_ALL_MASK (0xFF << MVNETA_CAUSE_TXQ_ERROR_SHIFT)
+#define MVNETA_CAUSE_TXQ_ERROR_MASK(q) (1 << (MVNETA_CAUSE_TXQ_ERROR_SHIFT + (q)))
+
#define MVNETA_INTR_ENABLE 0x25b8
#define MVNETA_TXQ_INTR_ENABLE_ALL_MASK 0x0000ff00
-#define MVNETA_RXQ_INTR_ENABLE_ALL_MASK 0xff000000
+#define MVNETA_RXQ_INTR_ENABLE_ALL_MASK 0xff000000 // note: neta says it's 0x000000FF
+
#define MVNETA_RXQ_CMD 0x2680
#define MVNETA_RXQ_DISABLE_SHIFT 8
#define MVNETA_RXQ_ENABLE_MASK 0x000000ff

Luis Henriques

Aug 18, 2014, 6:00:05 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Alexandre Bounine <alexandr...@idt.com>

commit 0193ed8225e1a79ed64632106ec3cc81798cb13c upstream.

This is a bug fix for the situation when function tsi721_desc_get() fails
to obtain a free transaction descriptor.

The bug usually results in a memory access crash dump when a data transfer
scatter-gather list has more entries than the size of the hardware buffer
descriptor ring. This fix ensures that an error is properly returned to the
caller instead of an invalid entry.

This patch is applicable to kernel versions starting from v3.5.

Signed-off-by: Alexandre Bounine <alexandr...@idt.com>
Cc: Matt Porter <mpo...@kernel.crashing.org>
Cc: Andre van Herk <andre.v...@prodrive-technologies.com>
Cc: Stef van Os <stef....@prodrive-technologies.com>
Cc: Vinod Koul <vinod...@intel.com>
Cc: Dan Williams <dan.j.w...@intel.com>
Signed-off-by: Andrew Morton <ak...@linux-foundation.org>
Signed-off-by: Linus Torvalds <torv...@linux-foundation.org>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
drivers/rapidio/devices/tsi721_dma.c | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/drivers/rapidio/devices/tsi721_dma.c b/drivers/rapidio/devices/tsi721_dma.c
index 91245f5dbe81..47257b6eea84 100644
--- a/drivers/rapidio/devices/tsi721_dma.c
+++ b/drivers/rapidio/devices/tsi721_dma.c
@@ -287,6 +287,12 @@ struct tsi721_tx_desc *tsi721_desc_get(struct tsi721_bdma_chan *bdma_chan)
"desc %p not ACKed\n", tx_desc);
}

+ if (ret == NULL) {
+ dev_dbg(bdma_chan->dchan.device->dev,
+ "%s: unable to obtain tx descriptor\n", __func__);
+ goto err_out;
+ }
+
i = bdma_chan->wr_count_next % bdma_chan->bd_num;
if (i == bdma_chan->bd_num - 1) {
i = 0;
@@ -297,7 +303,7 @@ struct tsi721_tx_desc *tsi721_desc_get(struct tsi721_bdma_chan *bdma_chan)
tx_desc->txd.phys = bdma_chan->bd_phys +
i * sizeof(struct tsi721_dma_desc);
tx_desc->hw_desc = &((struct tsi721_dma_desc *)bdma_chan->bd_base)[i];
-
+err_out:
spin_unlock_bh(&bdma_chan->lock);

return ret;

Luis Henriques

Aug 18, 2014, 6:00:04 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Antti Palosaari <cr...@iki.fi>

commit db4175ae2095634dbecd4c847da439f9c83e1b3b upstream.

The only supported modulation for DVB-S is QPSK. The modulation parameter
contains an invalid value for DVB-S in some cases, which leads to the driver
refusing the tuning attempt. Due to that, hard-code the modulation to QPSK
in the DVB-S case.

Signed-off-by: Antti Palosaari <cr...@iki.fi>
Signed-off-by: Mauro Carvalho Chehab <m.ch...@samsung.com>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
drivers/media/dvb-frontends/tda10071.c | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/media/dvb-frontends/tda10071.c b/drivers/media/dvb-frontends/tda10071.c
index 36eb27d3fdf1..def7812d7b22 100644
--- a/drivers/media/dvb-frontends/tda10071.c
+++ b/drivers/media/dvb-frontends/tda10071.c
@@ -667,6 +667,7 @@ static int tda10071_set_frontend(struct dvb_frontend *fe)
struct dtv_frontend_properties *c = &fe->dtv_property_cache;
int ret, i;
u8 mode, rolloff, pilot, inversion, div;
+ fe_modulation_t modulation;

dev_dbg(&priv->i2c->dev, "%s: delivery_system=%d modulation=%d " \
"frequency=%d symbol_rate=%d inversion=%d pilot=%d " \
@@ -701,10 +702,13 @@ static int tda10071_set_frontend(struct dvb_frontend *fe)

switch (c->delivery_system) {
case SYS_DVBS:
+ modulation = QPSK;
rolloff = 0;
pilot = 2;
break;
case SYS_DVBS2:
+ modulation = c->modulation;
+
switch (c->rolloff) {
case ROLLOFF_20:
rolloff = 2;
@@ -749,7 +753,7 @@ static int tda10071_set_frontend(struct dvb_frontend *fe)

for (i = 0, mode = 0xff; i < ARRAY_SIZE(TDA10071_MODCOD); i++) {
if (c->delivery_system == TDA10071_MODCOD[i].delivery_system &&
- c->modulation == TDA10071_MODCOD[i].modulation &&
+ modulation == TDA10071_MODCOD[i].modulation &&
c->fec_inner == TDA10071_MODCOD[i].fec) {
mode = TDA10071_MODCOD[i].val;
dev_dbg(&priv->i2c->dev, "%s: mode found=%02x\n",

Luis Henriques

Aug 18, 2014, 6:00:02 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: "H. Peter Anvin" <h...@linux.intel.com>

commit e1fe9ed8d2a4937510d0d60e20705035c2609aea upstream.

Sparse warns that the percpu variables aren't declared before they are
defined. Rather than hacking around it, move espfix definitions into
a proper header file.

Reported-by: Fengguang Wu <fenggu...@intel.com>
Signed-off-by: H. Peter Anvin <h...@linux.intel.com>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
arch/x86/include/asm/espfix.h | 16 ++++++++++++++++
arch/x86/include/asm/setup.h | 5 ++---
arch/x86/kernel/espfix_64.c | 1 +
3 files changed, 19 insertions(+), 3 deletions(-)
create mode 100644 arch/x86/include/asm/espfix.h

diff --git a/arch/x86/include/asm/espfix.h b/arch/x86/include/asm/espfix.h
new file mode 100644
index 000000000000..729051c82b02
--- /dev/null
+++ b/arch/x86/include/asm/espfix.h
@@ -0,0 +1,16 @@
+#ifndef _ASM_X86_ESPFIX_H
+#define _ASM_X86_ESPFIX_H
+
+#ifdef CONFIG_X86_64
+
+#include <asm/percpu.h>
+
+DECLARE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack);
+DECLARE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr);
+
+extern void init_espfix_bsp(void);
+extern void init_espfix_ap(void);
+
+#endif /* CONFIG_X86_64 */
+
+#endif /* _ASM_X86_ESPFIX_H */
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index 93797d17ef32..2e327f114a1b 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -60,11 +60,10 @@ extern void x86_ce4100_early_setup(void);
static inline void x86_ce4100_early_setup(void) { }
#endif

-extern void init_espfix_bsp(void);
-extern void init_espfix_ap(void);
-
#ifndef _SETUP

+#include <asm/espfix.h>
+
/*
* This is set up by the setup-routine at boot-time
*/
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index 8a64da36310f..6afbb16e9b79 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -40,6 +40,7 @@
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/setup.h>
+#include <asm/espfix.h>

/*
* Note: we only need 6*8 = 48 bytes for the espfix stack, but round

Luis Henriques

Aug 18, 2014, 6:00:03 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Christian König <christia...@amd.com>

commit e8c214d22e76dd0ead38f97f8d2dc09aac70d651 upstream.

We must mask out the overflow bit as well, otherwise
the wptr will never match the rptr again and the interrupt
handler will loop forever.

Signed-off-by: Christian König <christia...@amd.com>
Signed-off-by: Alex Deucher <alexande...@amd.com>
Reviewed-by: Michel Dänzer <michel....@amd.com>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
drivers/gpu/drm/radeon/cik.c | 1 +
drivers/gpu/drm/radeon/evergreen.c | 1 +
drivers/gpu/drm/radeon/r600.c | 1 +
drivers/gpu/drm/radeon/si.c | 1 +
4 files changed, 4 insertions(+)

diff --git a/drivers/gpu/drm/radeon/cik.c b/drivers/gpu/drm/radeon/cik.c
index 85647ab99fb1..0aaaa11b7036 100644
--- a/drivers/gpu/drm/radeon/cik.c
+++ b/drivers/gpu/drm/radeon/cik.c
@@ -5591,6 +5591,7 @@ static inline u32 cik_get_ih_wptr(struct radeon_device *rdev)
tmp = RREG32(IH_RB_CNTL);
tmp |= IH_WPTR_OVERFLOW_CLEAR;
WREG32(IH_RB_CNTL, tmp);
+ wptr &= ~RB_OVERFLOW;
}
return (wptr & rdev->ih.ptr_mask);
}
diff --git a/drivers/gpu/drm/radeon/evergreen.c b/drivers/gpu/drm/radeon/evergreen.c
index a34e20921711..415ab58594d9 100644
--- a/drivers/gpu/drm/radeon/evergreen.c
+++ b/drivers/gpu/drm/radeon/evergreen.c
@@ -4597,6 +4597,7 @@ static u32 evergreen_get_ih_wptr(struct radeon_device *rdev)
tmp = RREG32(IH_RB_CNTL);
tmp |= IH_WPTR_OVERFLOW_CLEAR;
WREG32(IH_RB_CNTL, tmp);
+ wptr &= ~RB_OVERFLOW;
}
return (wptr & rdev->ih.ptr_mask);
}
diff --git a/drivers/gpu/drm/radeon/r600.c b/drivers/gpu/drm/radeon/r600.c
index 33b35c9e82be..af588cf27293 100644
--- a/drivers/gpu/drm/radeon/r600.c
+++ b/drivers/gpu/drm/radeon/r600.c
@@ -4429,6 +4429,7 @@ static u32 r600_get_ih_wptr(struct radeon_device *rdev)
tmp = RREG32(IH_RB_CNTL);
tmp |= IH_WPTR_OVERFLOW_CLEAR;
WREG32(IH_RB_CNTL, tmp);
+ wptr &= ~RB_OVERFLOW;
}
return (wptr & rdev->ih.ptr_mask);
}
diff --git a/drivers/gpu/drm/radeon/si.c b/drivers/gpu/drm/radeon/si.c
index 3479145e8f47..5a46e269e5d5 100644
--- a/drivers/gpu/drm/radeon/si.c
+++ b/drivers/gpu/drm/radeon/si.c
@@ -6052,6 +6052,7 @@ static inline u32 si_get_ih_wptr(struct radeon_device *rdev)
tmp = RREG32(IH_RB_CNTL);
tmp |= IH_WPTR_OVERFLOW_CLEAR;
WREG32(IH_RB_CNTL, tmp);
+ wptr &= ~RB_OVERFLOW;
}
return (wptr & rdev->ih.ptr_mask);

Luis Henriques

Aug 18, 2014, 6:00:03 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Boris Ostrovsky <boris.o...@oracle.com>

commit 8762e5092828c4dc0f49da5a47a644c670df77f3 upstream.

init_espfix_ap() is currently off by one level when informing hypervisor
that allocated pages will be used for ministacks' page tables.

The most immediate effect of this on a PV guest is that if
'stack_page = __get_free_page()' returns a non-zeroed-out page, the hypervisor
will refuse to use it for a page table (which it shouldn't be anyway). This will
result in warnings by both Xen and Linux.

More importantly, a subsequent write to that page (again, by a PV guest) is
likely to result in a fatal page fault.

Signed-off-by: Boris Ostrovsky <boris.o...@oracle.com>
Link: http://lkml.kernel.org/r/1404926298-5565-1-git-se...@oracle.com
Reviewed-by: Konrad Rzeszutek Wilk <konra...@oracle.com>
Signed-off-by: H. Peter Anvin <h...@linux.intel.com>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
arch/x86/kernel/espfix_64.c | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index 6afbb16e9b79..94d857fb1033 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -175,7 +175,7 @@ void init_espfix_ap(void)
if (!pud_present(pud)) {
pmd_p = (pmd_t *)__get_free_page(PGALLOC_GFP);
pud = __pud(__pa(pmd_p) | (PGTABLE_PROT & ptemask));
- paravirt_alloc_pud(&init_mm, __pa(pmd_p) >> PAGE_SHIFT);
+ paravirt_alloc_pmd(&init_mm, __pa(pmd_p) >> PAGE_SHIFT);
for (n = 0; n < ESPFIX_PUD_CLONES; n++)
set_pud(&pud_p[n], pud);
}
@@ -185,7 +185,7 @@ void init_espfix_ap(void)
if (!pmd_present(pmd)) {
pte_p = (pte_t *)__get_free_page(PGALLOC_GFP);
pmd = __pmd(__pa(pte_p) | (PGTABLE_PROT & ptemask));
- paravirt_alloc_pmd(&init_mm, __pa(pte_p) >> PAGE_SHIFT);
+ paravirt_alloc_pte(&init_mm, __pa(pte_p) >> PAGE_SHIFT);
for (n = 0; n < ESPFIX_PMD_CLONES; n++)
set_pmd(&pmd_p[n], pmd);
}
@@ -193,7 +193,6 @@ void init_espfix_ap(void)
pte_p = pte_offset_kernel(&pmd, addr);
stack_page = (void *)__get_free_page(GFP_KERNEL);
pte = __pte(__pa(stack_page) | (__PAGE_KERNEL_RO & ptemask));
- paravirt_alloc_pte(&init_mm, __pa(stack_page) >> PAGE_SHIFT);
for (n = 0; n < ESPFIX_PTE_CLONES; n++)
set_pte(&pte_p[n*PTE_STRIDE], pte);

Luis Henriques

Aug 18, 2014, 6:00:03 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Christoph Hellwig <h...@lst.de>

commit d45b3279a5a2252cafcd665bbf2db8c9b31ef783 upstream.

There is no inherent reason why the last put of a tag structure must be
the one for the Scsi_Host, as device model objects can be held for
arbitrary periods. Merge blk_free_tags and __blk_free_tags into a single
function that just releases a reference, and get rid of the BUG() when the
host reference wasn't the last.

Signed-off-by: Christoph Hellwig <h...@lst.de>
Signed-off-by: Jens Axboe <ax...@fb.com>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
block/blk-tag.c | 33 +++++++--------------------------
1 file changed, 7 insertions(+), 26 deletions(-)

diff --git a/block/blk-tag.c b/block/blk-tag.c
index 3f33d8672268..a185b86741e5 100644
--- a/block/blk-tag.c
+++ b/block/blk-tag.c
@@ -27,18 +27,15 @@ struct request *blk_queue_find_tag(struct request_queue *q, int tag)
EXPORT_SYMBOL(blk_queue_find_tag);

/**
- * __blk_free_tags - release a given set of tag maintenance info
+ * blk_free_tags - release a given set of tag maintenance info
* @bqt: the tag map to free
*
- * Tries to free the specified @bqt. Returns true if it was
- * actually freed and false if there are still references using it
+ * Drop the reference count on @bqt and frees it when the last reference
+ * is dropped.
*/
-static int __blk_free_tags(struct blk_queue_tag *bqt)
+void blk_free_tags(struct blk_queue_tag *bqt)
{
- int retval;
-
- retval = atomic_dec_and_test(&bqt->refcnt);
- if (retval) {
+ if (atomic_dec_and_test(&bqt->refcnt)) {
BUG_ON(find_first_bit(bqt->tag_map, bqt->max_depth) <
bqt->max_depth);

@@ -50,9 +47,8 @@ static int __blk_free_tags(struct blk_queue_tag *bqt)

kfree(bqt);
}
-
- return retval;
}
+EXPORT_SYMBOL(blk_free_tags);

/**
* __blk_queue_free_tags - release tag maintenance info
@@ -69,28 +65,13 @@ void __blk_queue_free_tags(struct request_queue *q)
if (!bqt)
return;

- __blk_free_tags(bqt);
+ blk_free_tags(bqt);

q->queue_tags = NULL;
queue_flag_clear_unlocked(QUEUE_FLAG_QUEUED, q);
}

/**
- * blk_free_tags - release a given set of tag maintenance info
- * @bqt: the tag map to free
- *
- * For externally managed @bqt frees the map. Callers of this
- * function must guarantee to have released all the queues that
- * might have been using this tag map.
- */
-void blk_free_tags(struct blk_queue_tag *bqt)
-{
- if (unlikely(!__blk_free_tags(bqt)))
- BUG();
-}
-EXPORT_SYMBOL(blk_free_tags);
-
-/**
* blk_queue_free_tags - release tag maintenance info
* @q: the request queue for the device
*

Luis Henriques
Aug 18, 2014, 6:00:04 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Martin Schwidefsky <schwi...@de.ibm.com>

commit dab6cf55f81a6e16b8147aed9a843e1691dcd318 upstream.

The PSW mask check of the PTRACE_POKEUSR_AREA command is incorrect.
For the default user_mode=home address space layout the psw_user_bits
variable has the home space address-space-control bits set. But since
PSW_MASK_USER contains PSW_MASK_ASC, the ptrace validity check for the
PSW mask will always fail.

Fixes CVE-2014-3534
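
Sketched in C for clarity (PSW_MASK_USER and PSW_MASK_ASC as defined in
arch/s390, with 'tmpl' standing for psw_user_bits; the helper itself is
illustrative, not kernel code), the new validity test rejects a mask when
privileged bits are touched, or when the change would select the home
address-space-control value that the template does not already grant:

static int psw_mask_invalid(unsigned long data, unsigned long tmpl)
{
        if ((data ^ tmpl) & ~PSW_MASK_USER)
                return 1;       /* bits outside the user-writable set changed */
        if (((data ^ tmpl) & PSW_MASK_ASC) &&
            ((data | tmpl) & PSW_MASK_ASC) == PSW_MASK_ASC)
                return 1;       /* would switch into home space */
        return 0;
}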

Signed-off-by: Martin Schwidefsky <schwi...@de.ibm.com>
Cc: Ben Hutchings <b...@decadent.org.uk>
[ luis: backported to 3.11: used Martin's backport to 3.2 ]
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
arch/s390/kernel/ptrace.c | 9 +++++++--
1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c
index e9fadb04e3c6..319da563e6cf 100644
--- a/arch/s390/kernel/ptrace.c
+++ b/arch/s390/kernel/ptrace.c
@@ -322,7 +322,9 @@ static int __poke_user(struct task_struct *child, addr_t addr, addr_t data)
* psw and gprs are stored on the stack
*/
if (addr == (addr_t) &dummy->regs.psw.mask &&
- ((data & ~PSW_MASK_USER) != psw_user_bits ||
+ (((data^psw_user_bits) & ~PSW_MASK_USER) ||
+ (((data^psw_user_bits) & PSW_MASK_ASC) &&
+ ((data|psw_user_bits) & PSW_MASK_ASC) == PSW_MASK_ASC) ||
((data & PSW_MASK_EA) && !(data & PSW_MASK_BA))))
/* Invalid psw mask. */
return -EINVAL;
@@ -655,7 +657,10 @@ static int __poke_user_compat(struct task_struct *child,
*/
if (addr == (addr_t) &dummy32->regs.psw.mask) {
/* Build a 64 bit psw mask from 31 bit mask. */
- if ((tmp & ~PSW32_MASK_USER) != psw32_user_bits)
+ if (((tmp^psw32_user_bits) & ~PSW32_MASK_USER) ||
+ (((tmp^psw32_user_bits) & PSW32_MASK_ASC) &&
+ ((tmp|psw32_user_bits) & PSW32_MASK_ASC)
+ == PSW32_MASK_ASC))
/* Invalid psw mask. */
return -EINVAL;
regs->psw.mask = (regs->psw.mask & ~PSW_MASK_USER) |

Luis Henriques
Aug 18, 2014, 6:00:04 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Malcolm Priestley <tvbo...@gmail.com>

commit 4aa0abed3a2a11b7d71ad560c1a3e7631c5a31cd upstream.

byReAssocCount is incremented every second, resulting in a
disassociated message being sent every 10 seconds whether
connected or not.

byReAssocCount should only advance while eCommandState
is in WLAN_ASSOCIATE_WAIT

Change the existing scope to an if condition.

Signed-off-by: Malcolm Priestley <tvbo...@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gre...@linuxfoundation.org>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
drivers/staging/vt6655/bssdb.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/staging/vt6655/bssdb.c b/drivers/staging/vt6655/bssdb.c
index f983915168b7..3496a77612ba 100644
--- a/drivers/staging/vt6655/bssdb.c
+++ b/drivers/staging/vt6655/bssdb.c
@@ -1026,7 +1026,7 @@ start:
pDevice->byERPFlag &= ~(WLAN_SET_ERP_USE_PROTECTION(1));
}

- {
+ if (pDevice->eCommandState == WLAN_ASSOCIATE_WAIT) {
pDevice->byReAssocCount++;
if ((pDevice->byReAssocCount > 10) && (pDevice->bLinkPass != true)) { //10 sec timeout
printk("Re-association timeout!!!\n");

Luis Henriques
Aug 18, 2014, 6:00:04 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: George Cherian <george....@ti.com>

commit 33cf75656923ff11d67a937a4f8e9344f58cea77 upstream.

The raminit register is a shared register for both can0 and can1. Since commit:

32766ff net: can: Convert to use devm_ioremap_resource

devm_ioremap_resource() is used to map the raminit register. When using
both interfaces, the mapping for the can1 interface fails, leading to a
non-functional CAN interface.
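
The distinction matters because devm_ioremap_resource() both requests and
maps the region, while devm_ioremap() only maps it. A hedged sketch (the
helper is illustrative, not part of the driver):

static void __iomem *map_shared_reg(struct platform_device *pdev,
                                    struct resource *res)
{
        /*
         * devm_ioremap_resource() is roughly devm_request_mem_region()
         * followed by devm_ioremap(). The request step enforces
         * exclusive ownership of the region, so the second c_can
         * instance fails for the shared raminit register. Mapping
         * without claiming is the right call when two instances must
         * touch the same register.
         */
        return devm_ioremap(&pdev->dev, res->start, resource_size(res));
}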

Signed-off-by: George Cherian <george....@ti.com>
Signed-off-by: Mugunthan V N <mugunt...@ti.com>
Signed-off-by: Marc Kleine-Budde <m...@pengutronix.de>
[ luis: backported to 3.11: adjusted context ]
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
drivers/net/can/c_can/c_can_platform.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/can/c_can/c_can_platform.c b/drivers/net/can/c_can/c_can_platform.c
index c6f838d922a5..5d39bc152c5e 100644
--- a/drivers/net/can/c_can/c_can_platform.c
+++ b/drivers/net/can/c_can/c_can_platform.c
@@ -194,7 +194,8 @@ static int c_can_plat_probe(struct platform_device *pdev)
priv->instance = pdev->id;

res = platform_get_resource(pdev, IORESOURCE_MEM, 1);
- priv->raminit_ctrlreg = devm_ioremap_resource(&pdev->dev, res);
+ priv->raminit_ctrlreg = devm_ioremap(&pdev->dev, res->start,
+ resource_size(res));
if (IS_ERR(priv->raminit_ctrlreg) || priv->instance < 0)
dev_info(&pdev->dev, "control memory is not used for raminit\n");
else

Luis Henriques
Aug 18, 2014, 6:00:06 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Jon Paul Maloy <jon....@ericsson.com>

commit 999417549c16dd0e3a382aa9f6ae61688db03181 upstream.

If the 'next' pointer of the last fragment buffer in a message is not
zeroed before reassembly, we risk ending up with a corrupt message,
since the reassembly function itself isn't doing this.

Currently, when a buffer is retrieved from the deferred queue of the
broadcast link, the next pointer is not cleared, with the result as
described above.

This commit corrects this, and thereby fixes a bug that may occur when
long broadcast messages are transmitted across dual interfaces. The bug
has been present since 40ba3cdf542a469aaa9083fa041656e59b109b90 ("tipc:
message reassembly using fragment chain")

This commit should be applied to both net and net-next.

Signed-off-by: Jon Maloy <jon....@ericsson.com>
Signed-off-by: David S. Miller <da...@davemloft.net>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
net/tipc/bcast.c | 1 +
1 file changed, 1 insertion(+)

diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c
index 716de1ac6cb5..6ef89256b2fb 100644
--- a/net/tipc/bcast.c
+++ b/net/tipc/bcast.c
@@ -531,6 +531,7 @@ receive:

buf = node->bclink.deferred_head;
node->bclink.deferred_head = buf->next;
+ buf->next = NULL;
node->bclink.deferred_size--;
goto receive;

Luis Henriques
Aug 18, 2014, 6:00:06 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Milan Broz <gmaz...@gmail.com>

commit 4c63f83c2c2e16a13ce274ee678e28246bd33645 upstream.

The AF_ALG socket was missing a security label (e.g. SELinux),
which means that the socket was in an "unlabeled" state.

This was recently demonstrated in the cryptsetup package
(cryptsetup v1.6.5 and later.)
See https://bugzilla.redhat.com/show_bug.cgi?id=1115120

This patch clones the sock's label from the parent sock
and resolves the issue (similar to AF_BLUETOOTH protocol family).
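
The resulting accept()-path checklist, as a sketch (sk is the parent
algorithm socket, sk2 the accepted child):

sock_init_data(newsock, sk2);   /* initialise generic sock fields */
sock_graft(sk2, newsock);       /* tie sk2 to the new socket inode */
security_sk_clone(sk, sk2);     /* copy the LSM blob so sk2 is labeled */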

Signed-off-by: Milan Broz <gmaz...@gmail.com>
Acked-by: Paul Moore <pa...@paul-moore.com>
Signed-off-by: Herbert Xu <her...@gondor.apana.org.au>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
crypto/af_alg.c | 2 ++
1 file changed, 2 insertions(+)

diff --git a/crypto/af_alg.c b/crypto/af_alg.c
index ac33d5f30778..bf948e134981 100644
--- a/crypto/af_alg.c
+++ b/crypto/af_alg.c
@@ -21,6 +21,7 @@
#include <linux/module.h>
#include <linux/net.h>
#include <linux/rwsem.h>
+#include <linux/security.h>

struct alg_type_list {
const struct af_alg_type *type;
@@ -243,6 +244,7 @@ int af_alg_accept(struct sock *sk, struct socket *newsock)

sock_init_data(newsock, sk2);
sock_graft(sk2, newsock);
+ security_sk_clone(sk, sk2);

err = type->accept(ask->private, sk2);
if (err) {

Luis Henriques
Aug 18, 2014, 6:00:06 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Manuel Schölling <manuel.s...@gmx.de>

commit 84a7c0b1db1c17d5ded8d3800228a608e1070b40 upstream.

dns_query() credulously assumes that keys are null-terminated and
returns a copy of a memory block that is off by one.

Signed-off-by: Manuel Schölling <manuel.s...@gmx.de>
Signed-off-by: David S. Miller <da...@davemloft.net>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
net/dns_resolver/dns_query.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/dns_resolver/dns_query.c b/net/dns_resolver/dns_query.c
index c32be292c7e3..ede0e2d7412e 100644
--- a/net/dns_resolver/dns_query.c
+++ b/net/dns_resolver/dns_query.c
@@ -150,7 +150,9 @@ int dns_query(const char *type, const char *name, size_t namelen,
if (!*_result)
goto put;

- memcpy(*_result, upayload->data, len + 1);
+ memcpy(*_result, upayload->data, len);
+ *_result[len] = '\0';
+
if (_expiry)
*_expiry = rkey->expiry;

Luis Henriques
Aug 18, 2014, 6:00:06 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: willy tarreau <w...@1wt.eu>

commit dc4277dd41a80fd5f29a90412ea04bc3ba54fbf1 upstream.

Better to count packets and bytes in 32-bit variables on the stack, then
accumulate them into the 64-bit stats once at the end. This saves two
memory writes and two memory barriers per packet. The incoming packet
rate was increased by 4.7% on the Openblocks AX3 thanks to this.
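
The pattern, sketched with the driver's names: do the per-packet work on
cheap 32-bit locals, and pay for the 64-bit update and its seqcount
barriers once per poll.

u32 rcvd_pkts = 0, rcvd_bytes = 0;

while (/* descriptors left in this poll */) {
        /* ... receive one frame, compute rx_bytes ... */
        rcvd_pkts++;
        rcvd_bytes += rx_bytes;
}

if (rcvd_pkts) {
        u64_stats_update_begin(&pp->rx_stats.syncp);
        pp->rx_stats.packets += rcvd_pkts;
        pp->rx_stats.bytes += rcvd_bytes;
        u64_stats_update_end(&pp->rx_stats.syncp);
}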

Cc: Thomas Petazzoni <thomas.p...@free-electrons.com>
Cc: Gregory CLEMENT <gregory...@free-electrons.com>
Reviewed-by: Eric Dumazet <edum...@google.com>
Tested-by: Arnaud Ebalard <ar...@natisbad.org>
Signed-off-by: Willy Tarreau <w...@1wt.eu>
Signed-off-by: David S. Miller <da...@davemloft.net>
Signed-off-by: Willy Tarreau <w...@1wt.eu>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
drivers/net/ethernet/marvell/mvneta.c | 15 +++++++++++----
1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c
index 1594c61d80de..64bf21be7ff9 100644
--- a/drivers/net/ethernet/marvell/mvneta.c
+++ b/drivers/net/ethernet/marvell/mvneta.c
@@ -1358,6 +1358,8 @@ static int mvneta_rx(struct mvneta_port *pp, int rx_todo,
{
struct net_device *dev = pp->dev;
int rx_done, rx_filled;
+ u32 rcvd_pkts = 0;
+ u32 rcvd_bytes = 0;

/* Get number of received packets */
rx_done = mvneta_rxq_busy_desc_num_get(pp, rxq);
@@ -1395,10 +1397,8 @@ static int mvneta_rx(struct mvneta_port *pp, int rx_todo,

rx_bytes = rx_desc->data_size -
(ETH_FCS_LEN + MVNETA_MH_SIZE);
- u64_stats_update_begin(&pp->rx_stats.syncp);
- pp->rx_stats.packets++;
- pp->rx_stats.bytes += rx_bytes;
- u64_stats_update_end(&pp->rx_stats.syncp);
+ rcvd_pkts++;
+ rcvd_bytes += rx_bytes;

/* Linux processing */
skb_reserve(skb, MVNETA_MH_SIZE);
@@ -1419,6 +1419,13 @@ static int mvneta_rx(struct mvneta_port *pp, int rx_todo,
}
}

+ if (rcvd_pkts) {
+ u64_stats_update_begin(&pp->rx_stats.syncp);
+ pp->rx_stats.packets += rcvd_pkts;
+ pp->rx_stats.bytes += rcvd_bytes;
+ u64_stats_update_end(&pp->rx_stats.syncp);
+ }
+
/* Update rxq management counters */
mvneta_rxq_desc_num_update(pp, rxq, rx_done, rx_filled);

Luis Henriques
Aug 18, 2014, 6:00:06 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: "David S. Miller" <da...@davemloft.net>

commit 70ffc6ebaead783ac8dafb1e87df0039bb043596 upstream.

Make get_user_insn() able to cope with huge PMDs.

Next, make do_fault_siginfo() more robust when get_user_insn() can't
actually fetch the instruction. In particular, use the MMU announced
fault address when that happens, instead of calling
compute_effective_address() and computing garbage.

Signed-off-by: David S. Miller <da...@davemloft.net>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
arch/sparc/mm/fault_64.c | 86 ++++++++++++++++++++++++++++++------------------
1 file changed, 54 insertions(+), 32 deletions(-)

diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c
index 5062ff389e83..1992fa04095f 100644
--- a/arch/sparc/mm/fault_64.c
+++ b/arch/sparc/mm/fault_64.c
@@ -95,38 +95,51 @@ static unsigned int get_user_insn(unsigned long tpc)
pte_t *ptep, pte;
unsigned long pa;
u32 insn = 0;
- unsigned long pstate;

- if (pgd_none(*pgdp))
- goto outret;
+ if (pgd_none(*pgdp) || unlikely(pgd_bad(*pgdp)))
+ goto out;
pudp = pud_offset(pgdp, tpc);
- if (pud_none(*pudp))
- goto outret;
- pmdp = pmd_offset(pudp, tpc);
- if (pmd_none(*pmdp))
- goto outret;
-
- /* This disables preemption for us as well. */
- __asm__ __volatile__("rdpr %%pstate, %0" : "=r" (pstate));
- __asm__ __volatile__("wrpr %0, %1, %%pstate"
- : : "r" (pstate), "i" (PSTATE_IE));
- ptep = pte_offset_map(pmdp, tpc);
- pte = *ptep;
- if (!pte_present(pte))
+ if (pud_none(*pudp) || unlikely(pud_bad(*pudp)))
goto out;

- pa = (pte_pfn(pte) << PAGE_SHIFT);
- pa += (tpc & ~PAGE_MASK);
-
- /* Use phys bypass so we don't pollute dtlb/dcache. */
- __asm__ __volatile__("lduwa [%1] %2, %0"
- : "=r" (insn)
- : "r" (pa), "i" (ASI_PHYS_USE_EC));
+ /* This disables preemption for us as well. */
+ local_irq_disable();

+ pmdp = pmd_offset(pudp, tpc);
+ if (pmd_none(*pmdp) || unlikely(pmd_bad(*pmdp)))
+ goto out_irq_enable;
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (pmd_trans_huge(*pmdp)) {
+ if (pmd_trans_splitting(*pmdp))
+ goto out_irq_enable;
+
+ pa = pmd_pfn(*pmdp) << PAGE_SHIFT;
+ pa += tpc & ~HPAGE_MASK;
+
+ /* Use phys bypass so we don't pollute dtlb/dcache. */
+ __asm__ __volatile__("lduwa [%1] %2, %0"
+ : "=r" (insn)
+ : "r" (pa), "i" (ASI_PHYS_USE_EC));
+ } else
+#endif
+ {
+ ptep = pte_offset_map(pmdp, tpc);
+ pte = *ptep;
+ if (pte_present(pte)) {
+ pa = (pte_pfn(pte) << PAGE_SHIFT);
+ pa += (tpc & ~PAGE_MASK);
+
+ /* Use phys bypass so we don't pollute dtlb/dcache. */
+ __asm__ __volatile__("lduwa [%1] %2, %0"
+ : "=r" (insn)
+ : "r" (pa), "i" (ASI_PHYS_USE_EC));
+ }
+ pte_unmap(ptep);
+ }
+out_irq_enable:
+ local_irq_enable();
out:
- pte_unmap(ptep);
- __asm__ __volatile__("wrpr %0, 0x0, %%pstate" : : "r" (pstate));
-outret:
return insn;
}

@@ -152,7 +165,8 @@ show_signal_msg(struct pt_regs *regs, int sig, int code,
}

static void do_fault_siginfo(int code, int sig, struct pt_regs *regs,
- unsigned int insn, int fault_code)
+ unsigned long fault_addr, unsigned int insn,
+ int fault_code)
{
unsigned long addr;
siginfo_t info;
@@ -160,10 +174,18 @@ static void do_fault_siginfo(int code, int sig, struct pt_regs *regs,
info.si_code = code;
info.si_signo = sig;
info.si_errno = 0;
- if (fault_code & FAULT_CODE_ITLB)
+ if (fault_code & FAULT_CODE_ITLB) {
addr = regs->tpc;
- else
- addr = compute_effective_address(regs, insn, 0);
+ } else {
+ /* If we were able to probe the faulting instruction, use it
+ * to compute a precise fault address. Otherwise use the fault
+ * time provided address which may only have page granularity.
+ */
+ if (insn)
+ addr = compute_effective_address(regs, insn, 0);
+ else
+ addr = fault_addr;
+ }
info.si_addr = (void __user *) addr;
info.si_trapno = 0;

@@ -238,7 +260,7 @@ static void __kprobes do_kernel_fault(struct pt_regs *regs, int si_code,
/* The si_code was set to make clear whether
* this was a SEGV_MAPERR or SEGV_ACCERR fault.
*/
- do_fault_siginfo(si_code, SIGSEGV, regs, insn, fault_code);
+ do_fault_siginfo(si_code, SIGSEGV, regs, address, insn, fault_code);
return;
}

@@ -519,7 +541,7 @@ do_sigbus:
* Send a sigbus, regardless of whether we were in kernel
* or user mode.
*/
- do_fault_siginfo(BUS_ADRERR, SIGBUS, regs, insn, fault_code);
+ do_fault_siginfo(BUS_ADRERR, SIGBUS, regs, address, insn, fault_code);

/* Kernel mode? Handle exceptions or die */
if (regs->tstate & TSTATE_PRIV)

Luis Henriques
Aug 18, 2014, 6:00:06 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Daniel Borkmann <dbor...@redhat.com>

commit 8f2e5ae40ec193bc0a0ed99e95315c3eebca84ea upstream.

While working on some other SCTP code, I noticed that some
structures shared with user space are leaking uninitialized
stack or heap memory. In particular, struct sctp_sndrcvinfo
has a 2-byte hole between .sinfo_flags and .sinfo_ppid that
remains unfilled by us in sctp_ulpevent_read_sndrcvinfo() when
putting this into cmsg. Likewise, struct sctp_remote_error
contains a 2-byte hole that we don't fill before placing the
structure into an skb through skb_copy_expand() via
sctp_ulpevent_make_remote_error().

Both structures are defined by the IETF in RFC6458:

* Section 5.3.2. SCTP Header Information Structure:

The sctp_sndrcvinfo structure is defined below:

struct sctp_sndrcvinfo {
uint16_t sinfo_stream;
uint16_t sinfo_ssn;
uint16_t sinfo_flags;
<-- 2 bytes hole -->
uint32_t sinfo_ppid;
uint32_t sinfo_context;
uint32_t sinfo_timetolive;
uint32_t sinfo_tsn;
uint32_t sinfo_cumtsn;
sctp_assoc_t sinfo_assoc_id;
};

* 6.1.3. SCTP_REMOTE_ERROR:

A remote peer may send an Operation Error message to its peer.
This message indicates a variety of error conditions on an
association. The entire ERROR chunk as it appears on the wire
is included in an SCTP_REMOTE_ERROR event. Please refer to the
SCTP specification [RFC4960] and any extensions for a list of
possible error formats. An SCTP error notification has the
following format:

struct sctp_remote_error {
uint16_t sre_type;
uint16_t sre_flags;
uint32_t sre_length;
uint16_t sre_error;
<-- 2 bytes hole -->
sctp_assoc_t sre_assoc_id;
uint8_t sre_data[];
};

Fix this by setting both to 0 before filling them out. We also
have other structures shared between user and kernel space in
SCTP that contain holes (e.g. struct sctp_paddrthlds), but we
copy that buffer over from user space first and thus don't need
to care about it in those cases.
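
The rule applied on the recvmsg side, as a minimal sketch: zero the whole
kernel-side structure before assigning named fields, so compiler padding
(such as the 2-byte hole after sinfo_flags) can never carry stale stack
bytes out through put_cmsg().

struct sctp_sndrcvinfo sinfo;

memset(&sinfo, 0, sizeof(sinfo));       /* clears the holes, too */
sinfo.sinfo_stream = event->stream;
/* ... remaining fields ... */
put_cmsg(msghdr, IPPROTO_SCTP, SCTP_SNDRCV, sizeof(sinfo), &sinfo);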

While at it, we can also remove lengthy comments copied from
the draft; instead, we update the comment with the correct RFC
number where one can look it up.

Signed-off-by: Daniel Borkmann <dbor...@redhat.com>
Signed-off-by: David S. Miller <da...@davemloft.net>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
net/sctp/ulpevent.c | 122 +++++++---------------------------------------------
1 file changed, 15 insertions(+), 107 deletions(-)

diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c
index 44a45dbee4df..273c3285d938 100644
--- a/net/sctp/ulpevent.c
+++ b/net/sctp/ulpevent.c
@@ -373,9 +373,10 @@ fail:
* specification [SCTP] and any extensions for a list of possible
* error formats.
*/
-struct sctp_ulpevent *sctp_ulpevent_make_remote_error(
- const struct sctp_association *asoc, struct sctp_chunk *chunk,
- __u16 flags, gfp_t gfp)
+struct sctp_ulpevent *
+sctp_ulpevent_make_remote_error(const struct sctp_association *asoc,
+ struct sctp_chunk *chunk, __u16 flags,
+ gfp_t gfp)
{
struct sctp_ulpevent *event;
struct sctp_remote_error *sre;
@@ -394,8 +395,7 @@ struct sctp_ulpevent *sctp_ulpevent_make_remote_error(
/* Copy the skb to a new skb with room for us to prepend
* notification with.
*/
- skb = skb_copy_expand(chunk->skb, sizeof(struct sctp_remote_error),
- 0, gfp);
+ skb = skb_copy_expand(chunk->skb, sizeof(*sre), 0, gfp);

/* Pull off the rest of the cause TLV from the chunk. */
skb_pull(chunk->skb, elen);
@@ -406,62 +406,21 @@ struct sctp_ulpevent *sctp_ulpevent_make_remote_error(
event = sctp_skb2event(skb);
sctp_ulpevent_init(event, MSG_NOTIFICATION, skb->truesize);

- sre = (struct sctp_remote_error *)
- skb_push(skb, sizeof(struct sctp_remote_error));
+ sre = (struct sctp_remote_error *) skb_push(skb, sizeof(*sre));

/* Trim the buffer to the right length. */
- skb_trim(skb, sizeof(struct sctp_remote_error) + elen);
+ skb_trim(skb, sizeof(*sre) + elen);

- /* Socket Extensions for SCTP
- * 5.3.1.3 SCTP_REMOTE_ERROR
- *
- * sre_type:
- * It should be SCTP_REMOTE_ERROR.
- */
+ /* RFC6458, Section 6.1.3. SCTP_REMOTE_ERROR */
+ memset(sre, 0, sizeof(*sre));
sre->sre_type = SCTP_REMOTE_ERROR;
-
- /*
- * Socket Extensions for SCTP
- * 5.3.1.3 SCTP_REMOTE_ERROR
- *
- * sre_flags: 16 bits (unsigned integer)
- * Currently unused.
- */
sre->sre_flags = 0;
-
- /* Socket Extensions for SCTP
- * 5.3.1.3 SCTP_REMOTE_ERROR
- *
- * sre_length: sizeof (__u32)
- *
- * This field is the total length of the notification data,
- * including the notification header.
- */
sre->sre_length = skb->len;
-
- /* Socket Extensions for SCTP
- * 5.3.1.3 SCTP_REMOTE_ERROR
- *
- * sre_error: 16 bits (unsigned integer)
- * This value represents one of the Operational Error causes defined in
- * the SCTP specification, in network byte order.
- */
sre->sre_error = cause;
-
- /* Socket Extensions for SCTP
- * 5.3.1.3 SCTP_REMOTE_ERROR
- *
- * sre_assoc_id: sizeof (sctp_assoc_t)
- *
- * The association id field, holds the identifier for the association.
- * All notifications for a given association have the same association
- * identifier. For TCP style socket, this field is ignored.
- */
sctp_ulpevent_set_owner(event, asoc);
sre->sre_assoc_id = sctp_assoc2id(asoc);

return event;
-
fail:
return NULL;
}
@@ -906,7 +865,9 @@ __u16 sctp_ulpevent_get_notification_type(const struct sctp_ulpevent *event)
return notification->sn_header.sn_type;
}

-/* Copy out the sndrcvinfo into a msghdr. */
+/* RFC6458, Section 5.3.2. SCTP Header Information Structure
+ * (SCTP_SNDRCV, DEPRECATED)
+ */
void sctp_ulpevent_read_sndrcvinfo(const struct sctp_ulpevent *event,
struct msghdr *msghdr)
{
@@ -915,74 +876,21 @@ void sctp_ulpevent_read_sndrcvinfo(const struct sctp_ulpevent *event,
if (sctp_ulpevent_is_notification(event))
return;

- /* Sockets API Extensions for SCTP
- * Section 5.2.2 SCTP Header Information Structure (SCTP_SNDRCV)
- *
- * sinfo_stream: 16 bits (unsigned integer)
- *
- * For recvmsg() the SCTP stack places the message's stream number in
- * this value.
- */
+ memset(&sinfo, 0, sizeof(sinfo));
sinfo.sinfo_stream = event->stream;
- /* sinfo_ssn: 16 bits (unsigned integer)
- *
- * For recvmsg() this value contains the stream sequence number that
- * the remote endpoint placed in the DATA chunk. For fragmented
- * messages this is the same number for all deliveries of the message
- * (if more than one recvmsg() is needed to read the message).
- */
sinfo.sinfo_ssn = event->ssn;
- /* sinfo_ppid: 32 bits (unsigned integer)
- *
- * In recvmsg() this value is
- * the same information that was passed by the upper layer in the peer
- * application. Please note that byte order issues are NOT accounted
- * for and this information is passed opaquely by the SCTP stack from
- * one end to the other.
- */
sinfo.sinfo_ppid = event->ppid;
- /* sinfo_flags: 16 bits (unsigned integer)
- *
- * This field may contain any of the following flags and is composed of
- * a bitwise OR of these values.
- *
- * recvmsg() flags:
- *
- * SCTP_UNORDERED - This flag is present when the message was sent
- * non-ordered.
- */
sinfo.sinfo_flags = event->flags;
- /* sinfo_tsn: 32 bit (unsigned integer)
- *
- * For the receiving side, this field holds a TSN that was
- * assigned to one of the SCTP Data Chunks.
- */
sinfo.sinfo_tsn = event->tsn;
- /* sinfo_cumtsn: 32 bit (unsigned integer)
- *
- * This field will hold the current cumulative TSN as
- * known by the underlying SCTP layer. Note this field is
- * ignored when sending and only valid for a receive
- * operation when sinfo_flags are set to SCTP_UNORDERED.
- */
sinfo.sinfo_cumtsn = event->cumtsn;
- /* sinfo_assoc_id: sizeof (sctp_assoc_t)
- *
- * The association handle field, sinfo_assoc_id, holds the identifier
- * for the association announced in the COMMUNICATION_UP notification.
- * All notifications for a given association have the same identifier.
- * Ignored for one-to-one style sockets.
- */
sinfo.sinfo_assoc_id = sctp_assoc2id(event->asoc);
-
- /* context value that is set via SCTP_CONTEXT socket option. */
+ /* Context value that is set via SCTP_CONTEXT socket option. */
sinfo.sinfo_context = event->asoc->default_rcv_context;
-
/* These fields are not used while receiving. */
sinfo.sinfo_timetolive = 0;

put_cmsg(msghdr, IPPROTO_SCTP, SCTP_SNDRCV,
- sizeof(struct sctp_sndrcvinfo), (void *)&sinfo);
+ sizeof(sinfo), &sinfo);
}

/* Do accounting for bytes received and hold a reference to the association

Luis Henriques
Aug 18, 2014, 6:00:06 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: "H. Peter Anvin" <h...@linux.intel.com>

commit 3891a04aafd668686239349ea58f3314ea2af86b upstream.

The IRET instruction, when returning to a 16-bit segment, only
restores the bottom 16 bits of the user space stack pointer. This
causes some 16-bit software to break, but it also leaks kernel state
to user space. We have a software workaround for that ("espfix") for
the 32-bit kernel, but it relies on a nonzero stack segment base which
is not available in 64-bit mode.

In checkin:

b3b42ac2cbae x86-64, modify_ldt: Ban 16-bit segments on 64-bit kernels

we "solved" this by forbidding 16-bit segments on 64-bit kernels, with
the logic that 16-bit support is crippled on 64-bit kernels anyway (no
V86 support), but it turns out that people are doing stuff like
running old Win16 binaries under Wine and expect it to work.

This patch works around the problem by creating percpu "ministacks", each of which
is mapped 2^16 times 64K apart. When we detect that the return SS is
on the LDT, we copy the IRET frame to the ministack and use the
relevant alias to return to userspace. The ministacks are mapped
readonly, so if IRET faults we promote #GP to #DF which is an IST
vector and thus has its own stack; we then do the fixup in the #DF
handler.

(Making #GP an IST exception would make the msr_safe functions unsafe
in NMI/MC context, and quite possibly have other effects.)
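
The address transformation that makes the aliasing work, pulled out of
espfix_base_addr() in the patch as a sketch (the helper name is
illustrative): it keeps the low 16 bits of the slot address in place and
shifts every higher bit above bit 16, which is what makes each ministack
page appear once every 64K within the espfix region, so the low 16 bits
that survive IRET's truncation still select valid readonly stack memory.

static unsigned long espfix_alias(unsigned long slot_addr)
{
        unsigned long addr;

        /* keep bits 0-15; move the rest above the 64K aliasing window */
        addr = (slot_addr & 0xffffUL) | ((slot_addr & ~0xffffUL) << 16);
        return addr + ESPFIX_BASE_ADDR;
}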

Special thanks to:

- Andy Lutomirski, for the suggestion of using very small stack slots
and copy (as opposed to map) the IRET frame there, and for the
suggestion to mark them readonly and let the fault promote to #DF.
- Konrad Wilk for paravirt fixup and testing.
- Borislav Petkov for testing help and useful comments.

Reported-by: Brian Gerst <brg...@gmail.com>
Signed-off-by: H. Peter Anvin <h...@linux.intel.com>
Link: http://lkml.kernel.org/r/1398816946-3351-1-...@linux.intel.com
Cc: Konrad Rzeszutek Wilk <konra...@oracle.com>
Cc: Borislav Petkov <b...@alien8.de>
Cc: Andrew Lutomriski <aml...@gmail.com>
Cc: Linus Torvalds <torv...@linux-foundation.org>
Cc: Dirk Hohndel <di...@hohndel.org>
Cc: Arjan van de Ven <arjan.va...@intel.com>
Cc: comex <com...@gmail.com>
Cc: Alexander van Heukelum <heuk...@fastmail.fm>
Cc: Boris Ostrovsky <boris.o...@oracle.com>
[ luis: backported to 3.11: adjusted context ]
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
Documentation/x86/x86_64/mm.txt | 2 +
arch/x86/include/asm/pgtable_64_types.h | 2 +
arch/x86/include/asm/setup.h | 3 +
arch/x86/kernel/Makefile | 1 +
arch/x86/kernel/entry_64.S | 73 ++++++++++-
arch/x86/kernel/espfix_64.c | 208 ++++++++++++++++++++++++++++++++
arch/x86/kernel/ldt.c | 11 --
arch/x86/kernel/smpboot.c | 7 ++
arch/x86/mm/dump_pagetables.c | 40 ++++--
init/main.c | 4 +
10 files changed, 325 insertions(+), 26 deletions(-)
create mode 100644 arch/x86/kernel/espfix_64.c

diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt
index 881582f75c9c..bd4370487b07 100644
--- a/Documentation/x86/x86_64/mm.txt
+++ b/Documentation/x86/x86_64/mm.txt
@@ -12,6 +12,8 @@ ffffc90000000000 - ffffe8ffffffffff (=45 bits) vmalloc/ioremap space
ffffe90000000000 - ffffe9ffffffffff (=40 bits) hole
ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB)
... unused hole ...
+ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
+... unused hole ...
ffffffff80000000 - ffffffffa0000000 (=512 MB) kernel text mapping, from phys 0
ffffffffa0000000 - ffffffffff5fffff (=1525 MB) module mapping space
ffffffffff600000 - ffffffffffdfffff (=8 MB) vsyscalls
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 2d883440cb9a..b1609f2c524c 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -61,6 +61,8 @@ typedef struct { pteval_t pte; } pte_t;
#define MODULES_VADDR _AC(0xffffffffa0000000, UL)
#define MODULES_END _AC(0xffffffffff000000, UL)
#define MODULES_LEN (MODULES_END - MODULES_VADDR)
+#define ESPFIX_PGD_ENTRY _AC(-2, UL)
+#define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << PGDIR_SHIFT)

#define EARLY_DYNAMIC_PAGE_TABLES 64

diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index b7bf3505e1ec..93797d17ef32 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -60,6 +60,9 @@ extern void x86_ce4100_early_setup(void);
static inline void x86_ce4100_early_setup(void) { }
#endif

+extern void init_espfix_bsp(void);
+extern void init_espfix_ap(void);
+
#ifndef _SETUP

/*
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 88d99ea77723..b347fabda159 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -29,6 +29,7 @@ obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
obj-y += syscall_$(BITS).o
obj-$(CONFIG_X86_64) += vsyscall_64.o
obj-$(CONFIG_X86_64) += vsyscall_emu_64.o
+obj-$(CONFIG_X86_64) += espfix_64.o
obj-y += bootflag.o e820.o
obj-y += pci-dma.o quirks.o topology.o kdebugfs.o
obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index bcdb3ca664d2..b7b03a91d713 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -58,6 +58,7 @@
#include <asm/asm.h>
#include <asm/context_tracking.h>
#include <asm/smap.h>
+#include <asm/pgtable_types.h>
#include <linux/err.h>

/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
@@ -1055,8 +1056,16 @@ restore_args:
RESTORE_ARGS 1,8,1

irq_return:
+ /*
+ * Are we returning to a stack segment from the LDT? Note: in
+ * 64-bit mode SS:RSP on the exception stack is always valid.
+ */
+ testb $4,(SS-RIP)(%rsp)
+ jnz irq_return_ldt
+
+irq_return_iret:
INTERRUPT_RETURN
- _ASM_EXTABLE(irq_return, bad_iret)
+ _ASM_EXTABLE(irq_return_iret, bad_iret)

#ifdef CONFIG_PARAVIRT
ENTRY(native_iret)
@@ -1064,6 +1073,30 @@ ENTRY(native_iret)
_ASM_EXTABLE(native_iret, bad_iret)
#endif

+irq_return_ldt:
+ pushq_cfi %rax
+ pushq_cfi %rdi
+ SWAPGS
+ movq PER_CPU_VAR(espfix_waddr),%rdi
+ movq %rax,(0*8)(%rdi) /* RAX */
+ movq (2*8)(%rsp),%rax /* RIP */
+ movq %rax,(1*8)(%rdi)
+ movq (3*8)(%rsp),%rax /* CS */
+ movq %rax,(2*8)(%rdi)
+ movq (4*8)(%rsp),%rax /* RFLAGS */
+ movq %rax,(3*8)(%rdi)
+ movq (6*8)(%rsp),%rax /* SS */
+ movq %rax,(5*8)(%rdi)
+ movq (5*8)(%rsp),%rax /* RSP */
+ movq %rax,(4*8)(%rdi)
+ andl $0xffff0000,%eax
+ popq_cfi %rdi
+ orq PER_CPU_VAR(espfix_stack),%rax
+ SWAPGS
+ movq %rax,%rsp
+ popq_cfi %rax
+ jmp irq_return_iret
+
.section .fixup,"ax"
bad_iret:
/*
@@ -1127,9 +1160,41 @@ ENTRY(retint_kernel)
call preempt_schedule_irq
jmp exit_intr
#endif
-
CFI_ENDPROC
END(common_interrupt)
+
+ /*
+ * If IRET takes a fault on the espfix stack, then we
+ * end up promoting it to a doublefault. In that case,
+ * modify the stack to make it look like we just entered
+ * the #GP handler from user space, similar to bad_iret.
+ */
+ ALIGN
+__do_double_fault:
+ XCPT_FRAME 1 RDI+8
+ movq RSP(%rdi),%rax /* Trap on the espfix stack? */
+ sarq $PGDIR_SHIFT,%rax
+ cmpl $ESPFIX_PGD_ENTRY,%eax
+ jne do_double_fault /* No, just deliver the fault */
+ cmpl $__KERNEL_CS,CS(%rdi)
+ jne do_double_fault
+ movq RIP(%rdi),%rax
+ cmpq $irq_return_iret,%rax
+#ifdef CONFIG_PARAVIRT
+ je 1f
+ cmpq $native_iret,%rax
+#endif
+ jne do_double_fault /* This shouldn't happen... */
+1:
+ movq PER_CPU_VAR(kernel_stack),%rax
+ subq $(6*8-KERNEL_STACK_OFFSET),%rax /* Reset to original stack */
+ movq %rax,RSP(%rdi)
+ movq $0,(%rax) /* Missing (lost) #GP error code */
+ movq $general_protection,RIP(%rdi)
+ retq
+ CFI_ENDPROC
+END(__do_double_fault)
+
/*
* End of kprobes section
*/
@@ -1320,7 +1385,7 @@ zeroentry overflow do_overflow
zeroentry bounds do_bounds
zeroentry invalid_op do_invalid_op
zeroentry device_not_available do_device_not_available
-paranoiderrorentry double_fault do_double_fault
+paranoiderrorentry double_fault __do_double_fault
zeroentry coprocessor_segment_overrun do_coprocessor_segment_overrun
errorentry invalid_TSS do_invalid_TSS
errorentry segment_not_present do_segment_not_present
@@ -1607,7 +1672,7 @@ error_sti:
*/
error_kernelspace:
incl %ebx
- leaq irq_return(%rip),%rcx
+ leaq irq_return_iret(%rip),%rcx
cmpq %rcx,RIP+8(%rsp)
je error_swapgs
movl %ecx,%eax /* zero extend */
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
new file mode 100644
index 000000000000..8a64da36310f
--- /dev/null
+++ b/arch/x86/kernel/espfix_64.c
@@ -0,0 +1,208 @@
+/* ----------------------------------------------------------------------- *
+ *
+ * Copyright 2014 Intel Corporation; author: H. Peter Anvin
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * The IRET instruction, when returning to a 16-bit segment, only
+ * restores the bottom 16 bits of the user space stack pointer. This
+ * causes some 16-bit software to break, but it also leaks kernel state
+ * to user space.
+ *
+ * This works around this by creating percpu "ministacks", each of which
+ * is mapped 2^16 times 64K apart. When we detect that the return SS is
+ * on the LDT, we copy the IRET frame to the ministack and use the
+ * relevant alias to return to userspace. The ministacks are mapped
+ * readonly, so if the IRET fault we promote #GP to #DF which is an IST
+ * vector and thus has its own stack; we then do the fixup in the #DF
+ * handler.
+ *
+ * This file sets up the ministacks and the related page tables. The
+ * actual ministack invocation is in entry_64.S.
+ */
+
+#include <linux/init.h>
+#include <linux/init_task.h>
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <linux/gfp.h>
+#include <linux/random.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/setup.h>
+
+/*
+ * Note: we only need 6*8 = 48 bytes for the espfix stack, but round
+ * it up to a cache line to avoid unnecessary sharing.
+ */
+#define ESPFIX_STACK_SIZE (8*8UL)
+#define ESPFIX_STACKS_PER_PAGE (PAGE_SIZE/ESPFIX_STACK_SIZE)
+
+/* There is address space for how many espfix pages? */
+#define ESPFIX_PAGE_SPACE (1UL << (PGDIR_SHIFT-PAGE_SHIFT-16))
+
+#define ESPFIX_MAX_CPUS (ESPFIX_STACKS_PER_PAGE * ESPFIX_PAGE_SPACE)
+#if CONFIG_NR_CPUS > ESPFIX_MAX_CPUS
+# error "Need more than one PGD for the ESPFIX hack"
+#endif
+
+#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO)
+
+/* This contains the *bottom* address of the espfix stack */
+DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack);
+DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr);
+
+/* Initialization mutex - should this be a spinlock? */
+static DEFINE_MUTEX(espfix_init_mutex);
+
+/* Page allocation bitmap - each page serves ESPFIX_STACKS_PER_PAGE CPUs */
+#define ESPFIX_MAX_PAGES DIV_ROUND_UP(CONFIG_NR_CPUS, ESPFIX_STACKS_PER_PAGE)
+static void *espfix_pages[ESPFIX_MAX_PAGES];
+
+static __page_aligned_bss pud_t espfix_pud_page[PTRS_PER_PUD]
+ __aligned(PAGE_SIZE);
+
+static unsigned int page_random, slot_random;
+
+/*
+ * This returns the bottom address of the espfix stack for a specific CPU.
+ * The math allows for a non-power-of-two ESPFIX_STACK_SIZE, in which case
+ * we have to account for some amount of padding at the end of each page.
+ */
+static inline unsigned long espfix_base_addr(unsigned int cpu)
+{
+ unsigned long page, slot;
+ unsigned long addr;
+
+ page = (cpu / ESPFIX_STACKS_PER_PAGE) ^ page_random;
+ slot = (cpu + slot_random) % ESPFIX_STACKS_PER_PAGE;
+ addr = (page << PAGE_SHIFT) + (slot * ESPFIX_STACK_SIZE);
+ addr = (addr & 0xffffUL) | ((addr & ~0xffffUL) << 16);
+ addr += ESPFIX_BASE_ADDR;
+ return addr;
+}
+
+#define PTE_STRIDE (65536/PAGE_SIZE)
+#define ESPFIX_PTE_CLONES (PTRS_PER_PTE/PTE_STRIDE)
+#define ESPFIX_PMD_CLONES PTRS_PER_PMD
+#define ESPFIX_PUD_CLONES (65536/(ESPFIX_PTE_CLONES*ESPFIX_PMD_CLONES))
+
+#define PGTABLE_PROT ((_KERNPG_TABLE & ~_PAGE_RW) | _PAGE_NX)
+
+static void init_espfix_random(void)
+{
+ unsigned long rand;
+
+ /*
+ * This is run before the entropy pools are initialized,
+ * but this is hopefully better than nothing.
+ */
+ if (!arch_get_random_long(&rand)) {
+ /* The constant is an arbitrary large prime */
+ rdtscll(rand);
+ rand *= 0xc345c6b72fd16123UL;
+ }
+
+ slot_random = rand % ESPFIX_STACKS_PER_PAGE;
+ page_random = (rand / ESPFIX_STACKS_PER_PAGE)
+ & (ESPFIX_PAGE_SPACE - 1);
+}
+
+void __init init_espfix_bsp(void)
+{
+ pgd_t *pgd_p;
+ pteval_t ptemask;
+
+ ptemask = __supported_pte_mask;
+
+ /* Install the espfix pud into the kernel page directory */
+ pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)];
+ pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page);
+
+ /* Randomize the locations */
+ init_espfix_random();
+
+ /* The rest is the same as for any other processor */
+ init_espfix_ap();
+}
+
+void init_espfix_ap(void)
+{
+ unsigned int cpu, page;
+ unsigned long addr;
+ pud_t pud, *pud_p;
+ pmd_t pmd, *pmd_p;
+ pte_t pte, *pte_p;
+ int n;
+ void *stack_page;
+ pteval_t ptemask;
+
+ /* We only have to do this once... */
+ if (likely(this_cpu_read(espfix_stack)))
+ return; /* Already initialized */
+
+ cpu = smp_processor_id();
+ addr = espfix_base_addr(cpu);
+ page = cpu/ESPFIX_STACKS_PER_PAGE;
+
+ /* Did another CPU already set this up? */
+ stack_page = ACCESS_ONCE(espfix_pages[page]);
+ if (likely(stack_page))
+ goto done;
+
+ mutex_lock(&espfix_init_mutex);
+
+ /* Did we race on the lock? */
+ stack_page = ACCESS_ONCE(espfix_pages[page]);
+ if (stack_page)
+ goto unlock_done;
+
+ ptemask = __supported_pte_mask;
+
+ pud_p = &espfix_pud_page[pud_index(addr)];
+ pud = *pud_p;
+ if (!pud_present(pud)) {
+ pmd_p = (pmd_t *)__get_free_page(PGALLOC_GFP);
+ pud = __pud(__pa(pmd_p) | (PGTABLE_PROT & ptemask));
+ paravirt_alloc_pud(&init_mm, __pa(pmd_p) >> PAGE_SHIFT);
+ for (n = 0; n < ESPFIX_PUD_CLONES; n++)
+ set_pud(&pud_p[n], pud);
+ }
+
+ pmd_p = pmd_offset(&pud, addr);
+ pmd = *pmd_p;
+ if (!pmd_present(pmd)) {
+ pte_p = (pte_t *)__get_free_page(PGALLOC_GFP);
+ pmd = __pmd(__pa(pte_p) | (PGTABLE_PROT & ptemask));
+ paravirt_alloc_pmd(&init_mm, __pa(pte_p) >> PAGE_SHIFT);
+ for (n = 0; n < ESPFIX_PMD_CLONES; n++)
+ set_pmd(&pmd_p[n], pmd);
+ }
+
+ pte_p = pte_offset_kernel(&pmd, addr);
+ stack_page = (void *)__get_free_page(GFP_KERNEL);
+ pte = __pte(__pa(stack_page) | (__PAGE_KERNEL_RO & ptemask));
+ paravirt_alloc_pte(&init_mm, __pa(stack_page) >> PAGE_SHIFT);
+ for (n = 0; n < ESPFIX_PTE_CLONES; n++)
+ set_pte(&pte_p[n*PTE_STRIDE], pte);
+
+ /* Job is done for this CPU and any CPU which shares this page */
+ ACCESS_ONCE(espfix_pages[page]) = stack_page;
+
+unlock_done:
+ mutex_unlock(&espfix_init_mutex);
+done:
+ this_cpu_write(espfix_stack, addr);
+ this_cpu_write(espfix_waddr, (unsigned long)stack_page
+ + (addr & ~PAGE_MASK));
+}
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index af1d14a9ebda..ebc987398923 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -229,17 +229,6 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
}
}

- /*
- * On x86-64 we do not support 16-bit segments due to
- * IRET leaking the high bits of the kernel stack address.
- */
-#ifdef CONFIG_X86_64
- if (!ldt_info.seg_32bit) {
- error = -EINVAL;
- goto out_unlock;
- }
-#endif
-
fill_ldt(&ldt, &ldt_info);
if (oldmode)
ldt.avl = 0;
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index aecc98a93d1b..a2475d0f3af0 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -265,6 +265,13 @@ static void notrace start_secondary(void *unused)
check_tsc_sync_target();

/*
+ * Enable the espfix hack for this CPU
+ */
+#ifdef CONFIG_X86_64
+ init_espfix_ap();
+#endif
+
+ /*
* We need to hold vector_lock so there the set of online cpus
* does not change while we are assigning vectors to cpus. Holding
* this lock ensures we don't half assign or remove an irq from a cpu.
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 0002a3a33081..8f556f70b9df 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -30,11 +30,13 @@ struct pg_state {
unsigned long start_address;
unsigned long current_address;
const struct addr_marker *marker;
+ unsigned long lines;
};

struct addr_marker {
unsigned long start_address;
const char *name;
+ unsigned long max_lines;
};

/* indices for address_markers; keep sync'd w/ address_markers below */
@@ -45,6 +47,7 @@ enum address_markers_idx {
LOW_KERNEL_NR,
VMALLOC_START_NR,
VMEMMAP_START_NR,
+ ESPFIX_START_NR,
HIGH_KERNEL_NR,
MODULES_VADDR_NR,
MODULES_END_NR,
@@ -67,6 +70,7 @@ static struct addr_marker address_markers[] = {
{ PAGE_OFFSET, "Low Kernel Mapping" },
{ VMALLOC_START, "vmalloc() Area" },
{ VMEMMAP_START, "Vmemmap" },
+ { ESPFIX_BASE_ADDR, "ESPfix Area", 16 },
{ __START_KERNEL_map, "High Kernel Mapping" },
{ MODULES_VADDR, "Modules" },
{ MODULES_END, "End Modules" },
@@ -163,7 +167,7 @@ static void note_page(struct seq_file *m, struct pg_state *st,
pgprot_t new_prot, int level)
{
pgprotval_t prot, cur;
- static const char units[] = "KMGTPE";
+ static const char units[] = "BKMGTPE";

/*
* If we have a "break" in the series, we need to flush the state that
@@ -178,6 +182,7 @@ static void note_page(struct seq_file *m, struct pg_state *st,
st->current_prot = new_prot;
st->level = level;
st->marker = address_markers;
+ st->lines = 0;
seq_printf(m, "---[ %s ]---\n", st->marker->name);
} else if (prot != cur || level != st->level ||
st->current_address >= st->marker[1].start_address) {
@@ -188,17 +193,21 @@ static void note_page(struct seq_file *m, struct pg_state *st,
/*
* Now print the actual finished series
*/
- seq_printf(m, "0x%0*lx-0x%0*lx ",
- width, st->start_address,
- width, st->current_address);
-
- delta = (st->current_address - st->start_address) >> 10;
- while (!(delta & 1023) && unit[1]) {
- delta >>= 10;
- unit++;
+ if (!st->marker->max_lines ||
+ st->lines < st->marker->max_lines) {
+ seq_printf(m, "0x%0*lx-0x%0*lx ",
+ width, st->start_address,
+ width, st->current_address);
+
+ delta = st->current_address - st->start_address;
+ while (!(delta & 1023) && unit[1]) {
+ delta >>= 10;
+ unit++;
+ }
+ seq_printf(m, "%9lu%c ", delta, *unit);
+ printk_prot(m, st->current_prot, st->level);
}
- seq_printf(m, "%9lu%c ", delta, *unit);
- printk_prot(m, st->current_prot, st->level);
+ st->lines++;

/*
* We print markers for special areas of address space,
@@ -206,7 +215,16 @@ static void note_page(struct seq_file *m, struct pg_state *st,
* This helps in the interpretation.
*/
if (st->current_address >= st->marker[1].start_address) {
+ if (st->marker->max_lines &&
+ st->lines > st->marker->max_lines) {
+ unsigned long nskip =
+ st->lines - st->marker->max_lines;
+ seq_printf(m, "... %lu entr%s skipped ... \n",
+ nskip,
+ nskip == 1 ? "y" : "ies");
+ }
st->marker++;
+ st->lines = 0;
seq_printf(m, "---[ %s ]---\n", st->marker->name);
}

diff --git a/init/main.c b/init/main.c
index 586cd3359c02..1f93d4080aa8 100644
--- a/init/main.c
+++ b/init/main.c
@@ -608,6 +608,10 @@ asmlinkage void __init start_kernel(void)
if (efi_enabled(EFI_RUNTIME_SERVICES))
efi_enter_virtual_mode();
#endif
+#ifdef CONFIG_X86_64
+ /* Should be run before the first non-init thread is created */
+ init_espfix_bsp();
+#endif
thread_info_cache_init();
cred_init();
fork_init(totalram_pages);

Luis Henriques
Aug 18, 2014, 6:00:06 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Sven Wegener <sven.w...@stealer.net>

commit 8142b215501f8b291a108a202b3a053a265b03dd upstream.

Commit 554086d ("x86_32, entry: Do syscall exit work on badsys
(CVE-2014-4508)") introduced a regression in the x86_32 syscall entry
code, resulting in syscall() not returning proper errors for undefined
syscalls on CPUs supporting the sysenter feature.

The following code:

> int result = syscall(666);
> printf("result=%d errno=%d error=%s\n", result, errno, strerror(errno));

results in:

> result=666 errno=0 error=Success

Obviously, the syscall return value is the called syscall number, but it
should have been an ENOSYS error. When run under ptrace it behaves
correctly, which makes it hard to debug in the wild:

> result=-1 errno=38 error=Function not implemented

The %eax register is the return value register. For debugging via ptrace
the syscall entry code stores the complete register context on the
stack. The badsys handlers only store the ENOSYS error code in the
ptrace register set and do not set %eax like a regular syscall handler
would. The old resume_userspace call chain contains code that clobbers
%eax and it restores %eax from the ptrace registers afterwards. The same
goes for the ptrace-enabled call chain. When ptrace is not used, the
syscall return value is the passed-in syscall number from the untouched
%eax register.

Use %eax as the return value register in syscall_badsys and
sysenter_badsys, like a real syscall handler does, and have the caller
push the value onto the stack for ptrace access.

Signed-off-by: Sven Wegener <sven.w...@stealer.net>
Link: http://lkml.kernel.org/r/alpine.LNX.2.11.1...@titan.int.lan.stealer.net
Reviewed-and-tested-by: Andy Lutomirski <lu...@amacapital.net>
Signed-off-by: H. Peter Anvin <h...@zytor.com>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
arch/x86/kernel/entry_32.S | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 4de29bc5a5b4..7b5e08fc505e 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -436,8 +436,8 @@ sysenter_do_call:
cmpl $(NR_syscalls), %eax
jae sysenter_badsys
call *sys_call_table(,%eax,4)
- movl %eax,PT_EAX(%esp)
sysenter_after_call:
+ movl %eax,PT_EAX(%esp)
LOCKDEP_SYS_EXIT
DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_OFF
@@ -517,6 +517,7 @@ ENTRY(system_call)
jae syscall_badsys
syscall_call:
call *sys_call_table(,%eax,4)
+syscall_after_call:
movl %eax,PT_EAX(%esp) # store the return value
syscall_exit:
LOCKDEP_SYS_EXIT
@@ -686,12 +687,12 @@ syscall_fault:
END(syscall_fault)

syscall_badsys:
- movl $-ENOSYS,PT_EAX(%esp)
- jmp syscall_exit
+ movl $-ENOSYS,%eax
+ jmp syscall_after_call
END(syscall_badsys)

sysenter_badsys:
- movl $-ENOSYS,PT_EAX(%esp)
+ movl $-ENOSYS,%eax
jmp sysenter_after_call
END(syscall_badsys)
CFI_ENDPROC

Luis Henriques
Aug 18, 2014, 6:00:07 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Catalin Marinas <catalin...@arm.com>

commit 7f88f88f83ed609650a01b18572e605ea50cd163 upstream.

Commit 248ac0e1943a ("mm/vmalloc: remove guard page from between vmap
blocks") had the side effect of making vmap_area.va_end member point to
the next vmap_area.va_start. This was creating an artificial reference
to vmalloc'ed objects and kmemleak was rarely reporting vmalloc() leaks.

This patch marks the vmap_area containing pointers explicitly and
reduces the min ref_count to 2 as vm_struct still contains a reference
to the vmalloc'ed object. The kmemleak add_scan_area() function has
been improved to allow a SIZE_MAX argument covering the rest of the
object (for simpler calling sites).
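
Usage of the new convention, sketched from the patch's own call site: a
size of SIZE_MAX now means "from this pointer to the end of the enclosing
object", sparing callers the offset arithmetic.

/* only scan the part of the vmap_area that can hold pointers */
kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask & GFP_RECLAIM_MASK);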

Signed-off-by: Catalin Marinas <catalin...@arm.com>
Signed-off-by: Andrew Morton <ak...@linux-foundation.org>
Signed-off-by: Linus Torvalds <torv...@linux-foundation.org>
Cc: Qiang Huang <h.huan...@huawei.com>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
mm/kmemleak.c | 4 +++-
mm/vmalloc.c | 14 ++++++++++----
2 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index c8d7f3110fd0..c526ded73971 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -753,7 +753,9 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp)
}

spin_lock_irqsave(&object->lock, flags);
- if (ptr + size > object->pointer + object->size) {
+ if (size == SIZE_MAX) {
+ size = object->pointer + object->size - ptr;
+ } else if (ptr + size > object->pointer + object->size) {
kmemleak_warn("Scan area larger than object 0x%08lx\n", ptr);
dump_object_info(object);
kmem_cache_free(scan_area_cache, area);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 13a54953a273..4ae02e52f1a0 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -359,6 +359,12 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
if (unlikely(!va))
return ERR_PTR(-ENOMEM);

+ /*
+ * Only scan the relevant parts containing pointers to other objects
+ * to avoid false negatives.
+ */
+ kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask & GFP_RECLAIM_MASK);
+
retry:
spin_lock(&vmap_area_lock);
/*
@@ -1651,11 +1657,11 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
clear_vm_uninitialized_flag(area);

/*
- * A ref_count = 3 is needed because the vm_struct and vmap_area
- * structures allocated in the __get_vm_area_node() function contain
- * references to the virtual address of the vmalloc'ed block.
+ * A ref_count = 2 is needed because vm_struct allocated in
+ * __get_vm_area_node() contains a reference to the virtual address of
+ * the vmalloc'ed block.
*/
- kmemleak_alloc(addr, real_size, 3, gfp_mask);
+ kmemleak_alloc(addr, real_size, 2, gfp_mask);

return addr;

Luis Henriques
Aug 18, 2014, 6:00:07 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Andrey Utkin <andrey.kri...@gmail.com>

commit 36beddc272c111689f3042bf3d10a64d8a805f93 upstream.

Setting just skb->sk without taking a reference on it and setting a
destructor is invalid. However, in the places where this was done, the
skb is used in a way that does not require skb->sk to be set, so drop
the setting of skb->sk.
Thanks to Eric Dumazet <eric.d...@gmail.com> for the correct solution.
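
The ownership rule, sketched: an skb may carry a socket pointer only
together with an accounted reference and a destructor to release it,
which is what the standard receive-side helper provides; where no such
ownership is needed, skb->sk simply stays NULL.

/* roughly: skb->sk = sk; skb->destructor = sock_rfree;
 * plus charging skb->truesize against sk's receive buffer
 */
skb_set_owner_r(skb, sk);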

Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=79441
Reported-by: Ed Martin <edma...@edman007.com>
Signed-off-by: Andrey Utkin <andrey.kri...@gmail.com>
Signed-off-by: Eric Dumazet <edum...@google.com>
Signed-off-by: David S. Miller <da...@davemloft.net>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
net/appletalk/ddp.c | 3 ---
1 file changed, 3 deletions(-)

diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c
index 7d424ac6e760..43e875c84429 100644
--- a/net/appletalk/ddp.c
+++ b/net/appletalk/ddp.c
@@ -1489,8 +1489,6 @@ static int atalk_rcv(struct sk_buff *skb, struct net_device *dev,
goto drop;

/* Queue packet (standard) */
- skb->sk = sock;
-
if (sock_queue_rcv_skb(sock, skb) < 0)
goto drop;

@@ -1644,7 +1642,6 @@ static int atalk_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr
if (!skb)
goto out;

- skb->sk = sk;
skb_reserve(skb, ddp_dl->header_length);
skb_reserve(skb, dev->hard_header_len);
skb->dev = dev;

Luis Henriques
Aug 18, 2014, 6:00:05 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Tejun Heo <t...@kernel.org>

commit 0b462c89e31f7eb6789713437eb551833ee16ff3 upstream.

While a queue is being destroyed, all the blkgs are destroyed and its
->root_blkg pointer is set to NULL. If someone else starts to drain
while the queue is in this state, the following oops happens.

NULL pointer dereference at 0000000000000028
IP: [<ffffffff8144e944>] blk_throtl_drain+0x84/0x230
PGD e4a1067 PUD b773067 PMD 0
Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC
Modules linked in: cfq_iosched(-) [last unloaded: cfq_iosched]
CPU: 1 PID: 537 Comm: bash Not tainted 3.16.0-rc3-work+ #2
Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
task: ffff88000e222250 ti: ffff88000efd4000 task.ti: ffff88000efd4000
RIP: 0010:[<ffffffff8144e944>] [<ffffffff8144e944>] blk_throtl_drain+0x84/0x230
RSP: 0018:ffff88000efd7bf0 EFLAGS: 00010046
RAX: 0000000000000000 RBX: ffff880015091450 RCX: 0000000000000001
RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000
RBP: ffff88000efd7c10 R08: 0000000000000000 R09: 0000000000000001
R10: ffff88000e222250 R11: 0000000000000000 R12: ffff880015091450
R13: ffff880015092e00 R14: ffff880015091d70 R15: ffff88001508fc28
FS: 00007f1332650740(0000) GS:ffff88001fa80000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
CR2: 0000000000000028 CR3: 0000000009446000 CR4: 00000000000006e0
Stack:
ffffffff8144e8f6 ffff880015091450 0000000000000000 ffff880015091d80
ffff88000efd7c28 ffffffff8144ae2f ffff880015091450 ffff88000efd7c58
ffffffff81427641 ffff880015091450 ffffffff82401f00 ffff880015091450
Call Trace:
[<ffffffff8144ae2f>] blkcg_drain_queue+0x1f/0x60
[<ffffffff81427641>] __blk_drain_queue+0x71/0x180
[<ffffffff81429b3e>] blk_queue_bypass_start+0x6e/0xb0
[<ffffffff814498b8>] blkcg_deactivate_policy+0x38/0x120
[<ffffffff8144ec44>] blk_throtl_exit+0x34/0x50
[<ffffffff8144aea5>] blkcg_exit_queue+0x35/0x40
[<ffffffff8142d476>] blk_release_queue+0x26/0xd0
[<ffffffff81454968>] kobject_cleanup+0x38/0x70
[<ffffffff81454848>] kobject_put+0x28/0x60
[<ffffffff81427505>] blk_put_queue+0x15/0x20
[<ffffffff817d07bb>] scsi_device_dev_release_usercontext+0x16b/0x1c0
[<ffffffff810bc339>] execute_in_process_context+0x89/0xa0
[<ffffffff817d064c>] scsi_device_dev_release+0x1c/0x20
[<ffffffff817930e2>] device_release+0x32/0xa0
[<ffffffff81454968>] kobject_cleanup+0x38/0x70
[<ffffffff81454848>] kobject_put+0x28/0x60
[<ffffffff817934d7>] put_device+0x17/0x20
[<ffffffff817d11b9>] __scsi_remove_device+0xa9/0xe0
[<ffffffff817d121b>] scsi_remove_device+0x2b/0x40
[<ffffffff817d1257>] sdev_store_delete+0x27/0x30
[<ffffffff81792ca8>] dev_attr_store+0x18/0x30
[<ffffffff8126f75e>] sysfs_kf_write+0x3e/0x50
[<ffffffff8126ea87>] kernfs_fop_write+0xe7/0x170
[<ffffffff811f5e9f>] vfs_write+0xaf/0x1d0
[<ffffffff811f69bd>] SyS_write+0x4d/0xc0
[<ffffffff81d24692>] system_call_fastpath+0x16/0x1b

776687bce42b ("block, blk-mq: draining can't be skipped even if
bypass_depth was non-zero") made it easier to trigger this bug by
making blk_queue_bypass_start() drain even when it loses the first
bypass test to blk_cleanup_queue(); however, the bug has always been
there even before the commit as blk_queue_bypass_start() could race
against queue destruction, win the initial bypass test but perform the
actual draining after blk_cleanup_queue() already destroyed all blkgs.

Fix it by skipping the call into policy draining if all the blkgs are
already gone.

Signed-off-by: Tejun Heo <t...@kernel.org>
Reported-by: Shirish Pargaonkar <sparg...@suse.com>
Reported-by: Sasha Levin <sasha...@oracle.com>
Reported-by: Jet Chen <jet....@intel.com>
Tested-by: Shirish Pargaonkar <sparg...@suse.com>
Signed-off-by: Jens Axboe <ax...@fb.com>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
block/blk-cgroup.c | 7 +++++++
1 file changed, 7 insertions(+)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 354efcdad847..61f595f4525f 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -856,6 +856,13 @@ void blkcg_drain_queue(struct request_queue *q)
{
lockdep_assert_held(q->queue_lock);

+ /*
+ * @q could be exiting and already have destroyed all blkgs as
+ * indicated by NULL root_blkg. If so, don't confuse policies.
+ */
+ if (!q->root_blkg)
+ return;
+
blk_throtl_drain(q);

Luis Henriques
Aug 18, 2014, 6:00:06 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Linus Torvalds <torv...@linux-foundation.org>

commit 2062afb4f804afef61cbe62a30cac9a46e58e067 upstream.

Michel Dänzer and a couple of other people reported inexplicable random
oopses in the scheduler, and the cause turns out to be gcc mis-compiling
the load_balance() function when debugging is enabled. The gcc bug
apparently goes back to gcc-4.5, but slight optimization changes mean
that it now shows up as a problem in 4.9.0 and 4.9.1.

The instruction scheduling problem causes gcc to schedule a spill
operation to a point before the stack frame has been created, which in turn can
corrupt the spilled value if an interrupt comes in. There may be other
effects of this bug too, but that's the code generation problem seen in
Michel's case.

This is fixed in current gcc HEAD, but the workaround as suggested by
Markus Trippelsdorf is pretty simple: use -fno-var-tracking-assignments
when compiling the kernel, which disables the gcc code that causes the
problem. This can result in slightly worse debug information for
variable accesses, but that is infinitely preferable to actual code
generation problems.

Doing this unconditionally (not just for CONFIG_DEBUG_INFO) also allows
non-debug builds to verify that the debug build would be identical: we
can do

export GCC_COMPARE_DEBUG=1

to make gcc internally verify that the result of the build is
independent of the "-g" flag (it will make the compiler build everything
twice, toggling the debug flag, and compare the results).

Without the "-fno-var-tracking-assignments" option, the build would fail
(even with 4.8.3 that didn't show the actual stack frame bug) with a gcc
compare failure.

See also gcc bugzilla:

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61801

Reported-by: Michel Dänzer <mic...@daenzer.net>
Suggested-by: Markus Trippelsdorf <mar...@trippelsdorf.de>
Cc: Jakub Jelinek <ja...@redhat.com>
Signed-off-by: Linus Torvalds <torv...@linux-foundation.org>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
Makefile | 2 ++
1 file changed, 2 insertions(+)

diff --git a/Makefile b/Makefile
index 305f1d382b99..2f2d7f4dce1d 100644
--- a/Makefile
+++ b/Makefile
@@ -614,6 +614,8 @@ KBUILD_CFLAGS += -fomit-frame-pointer
endif
endif

+KBUILD_CFLAGS += $(call cc-option, -fno-var-tracking-assignments)
+
ifdef CONFIG_DEBUG_INFO
KBUILD_CFLAGS += -g
KBUILD_AFLAGS += -gdwarf-2

Luis Henriques

Aug 18, 2014, 6:00:05 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Kevin Hao <haok...@gmail.com>

commit 1871ee134b73fb4cadab75752a7152ed2813c751 upstream.

The sata on fsl mpc8315e is broken after the commit 8a4aeec8d2d6
("libata/ahci: accommodate tag ordered controllers"). The reason is
that the ata controller on this SoC only implements a queue depth of
16. When issuing commands in tag order, all the commands with tags
16 ~ 31 are unconditionally mapped to tag 0, which causes the sata to
malfunction. It makes no sense to use a 32-entry queue in software
while the hardware has a smaller queue depth. So consider the queue
depth implemented by the hardware when requesting a command tag.
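
As an illustration, a hedged userspace sketch of the bounded
round-robin tag search (find_tag() and the busy array are stand-ins
for the real ata_qc_new() state, not the kernel code):

#include <stdio.h>

#define ATA_MAX_QUEUE    32
#define ATA_TAG_INTERNAL (ATA_MAX_QUEUE - 1)

/* Return the next free tag below max_queue, or -1 if none; 'busy'
 * marks tags in flight, 'last_tag' is where the previous search
 * stopped. */
static int find_tag(const int *busy, int max_queue, int last_tag)
{
        int i, tag;

        for (i = 0, tag = last_tag + 1; i < max_queue; i++, tag++) {
                tag = tag < max_queue ? tag : 0; /* wrap inside hw depth */

                if (tag == ATA_TAG_INTERNAL)     /* reserved tag */
                        continue;
                if (!busy[tag])
                        return tag;
        }
        return -1;
}

int main(void)
{
        int busy[ATA_MAX_QUEUE] = { [0] = 1, [1] = 1 };

        /* a controller exposing only 16 tags never hands out 16..31 */
        printf("next tag = %d\n", find_tag(busy, 16, 15));
        return 0;
}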

Fixes: 8a4aeec8d2d6 ("libata/ahci: accommodate tag ordered controllers")
Signed-off-by: Kevin Hao <haok...@gmail.com>
Acked-by: Dan Williams <dan.j.w...@intel.com>
Signed-off-by: Tejun Heo <t...@kernel.org>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
drivers/ata/libata-core.c | 22 +++++++++++++++++++---
1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index 6b45338919b7..b53d81fc9dba 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -4758,6 +4758,10 @@ void swap_buf_le16(u16 *buf, unsigned int buf_words)
* ata_qc_new - Request an available ATA command, for queueing
* @ap: target port
*
+ * Some ATA host controllers may implement a queue depth which is less
+ * than ATA_MAX_QUEUE. So we shouldn't allocate a tag which is beyond
+ * the hardware limitation.
+ *
* LOCKING:
* None.
*/
@@ -4765,14 +4769,16 @@ void swap_buf_le16(u16 *buf, unsigned int buf_words)
static struct ata_queued_cmd *ata_qc_new(struct ata_port *ap)
{
struct ata_queued_cmd *qc = NULL;
- unsigned int i, tag;
+ unsigned int i, tag, max_queue;
+
+ max_queue = ap->scsi_host->can_queue;

/* no command while frozen */
if (unlikely(ap->pflags & ATA_PFLAG_FROZEN))
return NULL;

- for (i = 0; i < ATA_MAX_QUEUE; i++) {
- tag = (i + ap->last_tag + 1) % ATA_MAX_QUEUE;
+ for (i = 0, tag = ap->last_tag + 1; i < max_queue; i++, tag++) {
+ tag = tag < max_queue ? tag : 0;

/* the last tag is reserved for internal command. */
if (tag == ATA_TAG_INTERNAL)
@@ -6155,6 +6161,16 @@ int ata_host_register(struct ata_host *host, struct scsi_host_template *sht)
{
int i, rc;

+ /*
+ * The max queue supported by hardware must not be greater than
+ * ATA_MAX_QUEUE.
+ */
+ if (sht->can_queue > ATA_MAX_QUEUE) {
+ dev_err(host->dev, "BUG: the hardware max queue is too large\n");
+ WARN_ON(1);
+ return -EINVAL;
+ }
+
/* host must have been started */
if (!(host->flags & ATA_HOST_STARTED)) {
dev_err(host->dev, "BUG: trying to register unstarted host\n");

Luis Henriques

Aug 18, 2014, 6:00:07 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Michael Brown <mbr...@fensystems.co.uk>

commit c7fb93ec51d462ec3540a729ba446663c26a0505 upstream.

The PE/COFF headers currently describe only the initialised-data
portions of the image, and result in no space being allocated for the
uninitialised-data portions. Consequently, the EFI boot stub will end
up overwriting unexpected areas of memory, with unpredictable results.

Fix by including a .bss section in the PE/COFF headers (functionally
equivalent to the init_size field in the bzImage header).
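
To make the size bookkeeping concrete, a small sketch of the fields
involved (the numeric values are made up for illustration):

#include <stdio.h>

int main(void)
{
        unsigned int file_sz = 0x500000; /* initialised data on disk */
        unsigned int init_sz = 0x900000; /* total memory footprint   */
        unsigned int bss_sz  = init_sz - file_sz;

        /* PE optional header fields the patch now fills in */
        printf("SizeOfUninitializedData = %#x\n", bss_sz);
        printf("SizeOfImage             = %#x\n", init_sz);

        /* .bss section header: VMA = file_sz, VirtualSize = bss_sz,
         * SizeOfRawData = 0 and file offset = 0 (nothing on disk) */
        return 0;
}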

Signed-off-by: Michael Brown <mbr...@fensystems.co.uk>
Cc: Thomas Bächler <tho...@archlinux.org>
Cc: Josh Boyer <jwb...@fedoraproject.org>
Signed-off-by: Matt Fleming <matt.f...@intel.com>
[ luis: backported to 3.11: used Michael's backport ]
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
arch/x86/boot/header.S | 26 ++++++++++++++++++++++----
arch/x86/boot/tools/build.c | 37 ++++++++++++++++++++++++++++++-------
2 files changed, 52 insertions(+), 11 deletions(-)

diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index 9ec06a1f6d61..425712462178 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -91,10 +91,9 @@ bs_die:

.section ".bsdata", "a"
bugger_off_msg:
- .ascii "Direct floppy boot is not supported. "
- .ascii "Use a boot loader program instead.\r\n"
+ .ascii "Use a boot loader.\r\n"
.ascii "\n"
- .ascii "Remove disk and press any key to reboot ...\r\n"
+ .ascii "Remove disk and press any key to reboot...\r\n"
.byte 0

#ifdef CONFIG_EFI_STUB
@@ -108,7 +107,7 @@ coff_header:
#else
.word 0x8664 # x86-64
#endif
- .word 3 # nr_sections
+ .word 4 # nr_sections
.long 0 # TimeDateStamp
.long 0 # PointerToSymbolTable
.long 1 # NumberOfSymbols
@@ -250,6 +249,25 @@ section_table:
.word 0 # NumberOfLineNumbers
.long 0x60500020 # Characteristics (section flags)

+ #
+ # The offset & size fields are filled in by build.c.
+ #
+ .ascii ".bss"
+ .byte 0
+ .byte 0
+ .byte 0
+ .byte 0
+ .long 0
+ .long 0x0
+ .long 0 # Size of initialized data
+ # on disk
+ .long 0x0
+ .long 0 # PointerToRelocations
+ .long 0 # PointerToLineNumbers
+ .word 0 # NumberOfRelocations
+ .word 0 # NumberOfLineNumbers
+ .long 0xc8000080 # Characteristics (section flags)
+
#endif /* CONFIG_EFI_STUB */

# Kernel attributes; used by setup. This is part 1 of the
diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c
index c941d6a8887f..687dd281c23e 100644
--- a/arch/x86/boot/tools/build.c
+++ b/arch/x86/boot/tools/build.c
@@ -141,7 +141,7 @@ static void usage(void)

#ifdef CONFIG_EFI_STUB

-static void update_pecoff_section_header(char *section_name, u32 offset, u32 size)
+static void update_pecoff_section_header_fields(char *section_name, u32 vma, u32 size, u32 datasz, u32 offset)
{
unsigned int pe_header;
unsigned short num_sections;
@@ -162,10 +162,10 @@ static void update_pecoff_section_header(char *section_name, u32 offset, u32 siz
put_unaligned_le32(size, section + 0x8);

/* section header vma field */
- put_unaligned_le32(offset, section + 0xc);
+ put_unaligned_le32(vma, section + 0xc);

/* section header 'size of initialised data' field */
- put_unaligned_le32(size, section + 0x10);
+ put_unaligned_le32(datasz, section + 0x10);

/* section header 'file offset' field */
put_unaligned_le32(offset, section + 0x14);
@@ -177,6 +177,11 @@ static void update_pecoff_section_header(char *section_name, u32 offset, u32 siz
}
}

+static void update_pecoff_section_header(char *section_name, u32 offset, u32 size)
+{
+ update_pecoff_section_header_fields(section_name, offset, size, size, offset);
+}
+
static void update_pecoff_setup_and_reloc(unsigned int size)
{
u32 setup_offset = 0x200;
@@ -201,9 +206,6 @@ static void update_pecoff_text(unsigned int text_start, unsigned int file_sz)

pe_header = get_unaligned_le32(&buf[0x3c]);

- /* Size of image */
- put_unaligned_le32(file_sz, &buf[pe_header + 0x50]);
-
/*
* Size of code: Subtract the size of the first sector (512 bytes)
* which includes the header.
@@ -218,6 +220,22 @@ static void update_pecoff_text(unsigned int text_start, unsigned int file_sz)
update_pecoff_section_header(".text", text_start, text_sz);
}

+static void update_pecoff_bss(unsigned int file_sz, unsigned int init_sz)
+{
+ unsigned int pe_header;
+ unsigned int bss_sz = init_sz - file_sz;
+
+ pe_header = get_unaligned_le32(&buf[0x3c]);
+
+ /* Size of uninitialized data */
+ put_unaligned_le32(bss_sz, &buf[pe_header + 0x24]);
+
+ /* Size of image */
+ put_unaligned_le32(init_sz, &buf[pe_header + 0x50]);
+
+ update_pecoff_section_header_fields(".bss", file_sz, bss_sz, 0, 0);
+}
+
#endif /* CONFIG_EFI_STUB */


@@ -269,6 +287,9 @@ int main(int argc, char ** argv)
int fd;
void *kernel;
u32 crc = 0xffffffffUL;
+#ifdef CONFIG_EFI_STUB
+ unsigned int init_sz;
+#endif

/* Defaults for old kernel */
#ifdef CONFIG_X86_32
@@ -339,7 +360,9 @@ int main(int argc, char ** argv)
put_unaligned_le32(sys_size, &buf[0x1f4]);

#ifdef CONFIG_EFI_STUB
- update_pecoff_text(setup_sectors * 512, sz + i + ((sys_size * 16) - sz));
+ update_pecoff_text(setup_sectors * 512, i + (sys_size * 16));
+ init_sz = get_unaligned_le32(&buf[0x260]);
+ update_pecoff_bss(i + (sys_size * 16), init_sz);

#ifdef CONFIG_X86_64 /* Yes, this is really how we defined it :( */
efi_stub_entry -= 0x200;

Luis Henriques

Aug 18, 2014, 6:00:05 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: "David S. Miller" <da...@davemloft.net>

commit e5c460f46ae7ee94831cb55cb980f942aa9e5a85 upstream.

This was found using Dave Jones' trinity tool.

When a 32-bit user process performs a load or a store, the cpu chops
off the top 32 bits of the effective address before translating it.

This is because we run 32-bit tasks with the PSTATE_AM (address
masking) bit set.

We can't run the kernel with that bit set, so when the kernel accesses
userspace no address masking occurs.

Since a 32-bit process will have no mappings in that region we will
properly fault, so we don't try to handle this using access_ok(),
which can safely just be a NOP on sparc64.

Real faults from 32-bit processes should never generate such addresses
so a bug check was added long ago, and it barks in the logs if this
happens.

But it also barks when a kernel user access causes this condition, and
that _can_ happen. For example, a pointer passed into a system call
may be "0xfffffffc", and the kernel then accesses 4 bytes offset from
that pointer.

Just handle such faults normally via the exception entries.
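
A worked example of the arithmetic, as a small userspace sketch (the
address comes from the example above):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t ptr = 0xfffffffcULL;   /* pointer from a 32-bit task */

        /* with PSTATE_AM set (32-bit task), the cpu masks the top bits */
        uint64_t user_view = (uint32_t)(ptr + 4);       /* wraps to 0 */

        /* the kernel runs without address masking, so it sees */
        uint64_t kernel_view = ptr + 4;                 /* 0x100000000 */

        printf("user: %#llx kernel: %#llx\n",
               (unsigned long long)user_view,
               (unsigned long long)kernel_view);
        return 0;
}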

Signed-off-by: David S. Miller <da...@davemloft.net>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
arch/sparc/mm/fault_64.c | 16 +---------------
1 file changed, 1 insertion(+), 15 deletions(-)

diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c
index 1992fa04095f..ea83f82464da 100644
--- a/arch/sparc/mm/fault_64.c
+++ b/arch/sparc/mm/fault_64.c
@@ -280,18 +280,6 @@ static void noinline __kprobes bogus_32bit_fault_tpc(struct pt_regs *regs)
show_regs(regs);
}

-static void noinline __kprobes bogus_32bit_fault_address(struct pt_regs *regs,
- unsigned long addr)
-{
- static int times;
-
- if (times++ < 10)
- printk(KERN_ERR "FAULT[%s:%d]: 32-bit process "
- "reports 64-bit fault address [%lx]\n",
- current->comm, current->pid, addr);
- show_regs(regs);
-}
-
asmlinkage void __kprobes do_sparc64_fault(struct pt_regs *regs)
{
struct mm_struct *mm = current->mm;
@@ -320,10 +308,8 @@ asmlinkage void __kprobes do_sparc64_fault(struct pt_regs *regs)
goto intr_or_no_mm;
}
}
- if (unlikely((address >> 32) != 0)) {
- bogus_32bit_fault_address(regs, address);
+ if (unlikely((address >> 32) != 0))
goto intr_or_no_mm;
- }
}

if (regs->tstate & TSTATE_PRIV) {

Luis Henriques

Aug 18, 2014, 6:00:07 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Mikulas Patocka <mpat...@redhat.com>

commit 694617474e33b8603fc76e090ed7d09376514b1a upstream.

Patch 3e374919b314f20e2a04f641ebc1093d758f66a4 is supposed to fix the
problem where kmem_cache_create incorrectly reports a duplicate cache
name and fails. The problem is described in the header of that patch.

However, the patch doesn't really fix the problem because of these
reasons:

* the logic to test for debugging is reversed. It was intended to perform
the check only if slub debugging is enabled (which implies that caches
with the same parameters are not merged). Therefore, there should be
#if !defined(CONFIG_SLUB) || defined(CONFIG_SLUB_DEBUG_ON)
The current code has the condition reversed and performs the test if
debugging is disabled.

* slub debugging may be enabled or disabled based on kernel command line,
CONFIG_SLUB_DEBUG_ON is just the default setting. Therefore the test
based on definition of CONFIG_SLUB_DEBUG_ON is unreliable.

This patch fixes the problem by removing the test
"!defined(CONFIG_SLUB_DEBUG_ON)". Therefore, duplicate names are never
checked if the SLUB allocator is used.
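
The three variants of the guard can be compared with a tiny
compile-time sketch; CONFIG_SLUB is defined locally here to mimic a
SLUB build with debugging off, and the messages are stand-ins for the
duplicate-name check:

#include <stdio.h>

#define CONFIG_SLUB 1   /* SLUB build, CONFIG_SLUB_DEBUG_ON not set */

int main(void)
{
#if !defined(CONFIG_SLUB) || defined(CONFIG_SLUB_DEBUG_ON)
        puts("intended test: check duplicate names");  /* compiled out */
#endif
#if !defined(CONFIG_SLUB) || !defined(CONFIG_SLUB_DEBUG_ON)
        puts("reversed test: check duplicate names");  /* wrongly compiled in */
#endif
#if !defined(CONFIG_SLUB)
        puts("fixed test: check duplicate names");     /* compiled out */
#endif
        return 0;
}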

Note to stable kernel maintainers: when backporting this patch, please
also backport patch 3e374919b314f20e2a04f641ebc1093d758f66a4.

Acked-by: David Rientjes <rien...@google.com>
Acked-by: Christoph Lameter <c...@linux.com>
Signed-off-by: Mikulas Patocka <mpat...@redhat.com>
Signed-off-by: Pekka Enberg <pen...@kernel.org>
[ luis: backported to 3.11: adjusted context ]
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
mm/slab_common.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/slab_common.c b/mm/slab_common.c
index d43477196b22..954382b3c188 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -55,7 +55,7 @@ static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name,
continue;
}

-#if !defined(CONFIG_SLUB) || !defined(CONFIG_SLUB_DEBUG_ON)
+#if !defined(CONFIG_SLUB)
/*
* For simplicity, we won't check this in the list of memcg
* caches. We have control over memcg naming, and if there

Luis Henriques

Aug 18, 2014, 6:00:08 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Eliad Peller <el...@wizery.com>

commit 8c26d458394be44e135d1c6bd4557e1c4e1a0535 upstream.

tsc can be NULL (mac80211 currently always passes NULL),
resulting in a NULL dereference. Check before copying it.

Signed-off-by: Eliad Peller <eliadx...@intel.com>
Signed-off-by: Emmanuel Grumbach <emmanuel...@intel.com>
Signed-off-by: Johannes Berg <johann...@intel.com>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
net/wireless/trace.h | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index e1534baf2ebb..c4d5da718171 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -1994,7 +1994,8 @@ TRACE_EVENT(cfg80211_michael_mic_failure,
MAC_ASSIGN(addr, addr);
__entry->key_type = key_type;
__entry->key_id = key_id;
- memcpy(__entry->tsc, tsc, 6);
+ if (tsc)
+ memcpy(__entry->tsc, tsc, 6);
),
TP_printk(NETDEV_PR_FMT ", " MAC_PR_FMT ", key type: %d, key id: %d, tsc: %pm",
NETDEV_PR_ARG, MAC_PR_ARG(addr), __entry->key_type,

Luis Henriques

Aug 18, 2014, 6:00:07 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: "H. Peter Anvin" <h...@zytor.com>

commit 197725de65477bc8509b41388157c1a2283542bb upstream.

Make espfix64 a hidden Kconfig option. This fixes the x86-64 UML
build, which broke because init_espfix_bsp() does not exist in UML:
since UML uses its own Kconfig, this option does not appear in the
UML build.

This also makes it possible to make support for 16-bit segments a
configuration option, for the people who want to minimize the size of
the kernel.

Reported-by: Ingo Molnar <mi...@kernel.org>
Signed-off-by: H. Peter Anvin <h...@zytor.com>
Cc: Richard Weinberger <ric...@nod.at>
Link: http://lkml.kernel.org/r/1398816946-3351-1-...@linux.intel.com
[ luis: backported to 3.11: adjusted context ]
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
arch/x86/Kconfig | 4 ++++
arch/x86/kernel/Makefile | 2 +-
arch/x86/kernel/smpboot.c | 2 +-
init/main.c | 2 +-
4 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index b32ebf92b0ce..ba5fdd09c61e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -967,6 +967,10 @@ config VM86
XFree86 to initialize some video cards via BIOS. Disabling this
option saves about 6k.

+config X86_ESPFIX64
+ def_bool y
+ depends on X86_64
+
config TOSHIBA
tristate "Toshiba Laptop support"
depends on X86_32
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index b347fabda159..b714a3537683 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -29,7 +29,7 @@ obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
obj-y += syscall_$(BITS).o
obj-$(CONFIG_X86_64) += vsyscall_64.o
obj-$(CONFIG_X86_64) += vsyscall_emu_64.o
-obj-$(CONFIG_X86_64) += espfix_64.o
+obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o
obj-y += bootflag.o e820.o
obj-y += pci-dma.o quirks.o topology.o kdebugfs.o
obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index a2475d0f3af0..af9296c6c9ed 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -267,7 +267,7 @@ static void notrace start_secondary(void *unused)
/*
* Enable the espfix hack for this CPU
*/
-#ifdef CONFIG_X86_64
+#ifdef CONFIG_X86_ESPFIX64
init_espfix_ap();
#endif

diff --git a/init/main.c b/init/main.c
index 1f93d4080aa8..e4c1b0173c87 100644
--- a/init/main.c
+++ b/init/main.c
@@ -608,7 +608,7 @@ asmlinkage void __init start_kernel(void)
if (efi_enabled(EFI_RUNTIME_SERVICES))
efi_enter_virtual_mode();
#endif
-#ifdef CONFIG_X86_64
+#ifdef CONFIG_X86_ESPFIX64
/* Should be run before the first non-init thread is created */
init_espfix_bsp();
#endif

Luis Henriques

Aug 18, 2014, 6:00:07 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Sowmini Varadhan <sowmini....@oracle.com>

commit a4b70a07ed12a71131cab7adce2ce91c71b37060 upstream.

Nothing cleans up the objects created by
vnet_new(); they are completely leaked.

vnet_exit(), after doing the vio_unregister_driver() to clean
up ports, should call a helper function that iterates over vnet_list
and cleans up those objects. This includes unregister_netdevice()
as well as free_netdev().
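
The teardown idiom, reduced to a userspace sketch (a singly linked
list stands in for vnet_list and its mutex; names are illustrative):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct vnet {
        struct vnet *next;
        char name[16];
};

static struct vnet *vnet_list;

static void vnet_new(const char *name)
{
        struct vnet *vp = calloc(1, sizeof(*vp));

        if (!vp)
                exit(1);
        strncpy(vp->name, name, sizeof(vp->name) - 1);
        vp->next = vnet_list;
        vnet_list = vp;
}

/* pop-and-free until the list is empty, mirroring the kernel's
 * while (!list_empty()) / list_first_entry() / list_del() loop */
static void vnet_cleanup(void)
{
        while (vnet_list) {
                struct vnet *vp = vnet_list;

                vnet_list = vp->next;
                printf("unregister+free %s\n", vp->name);
                free(vp);
        }
}

int main(void)
{
        vnet_new("vnet0");
        vnet_new("vnet1");
        vnet_cleanup();
        return 0;
}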

Signed-off-by: Sowmini Varadhan <sowmini....@oracle.com>
Acked-by: Dave Kleikamp <dave.k...@oracle.com>
Reviewed-by: Karl Volz <karl...@oracle.com>
Signed-off-by: David S. Miller <da...@davemloft.net>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
drivers/net/ethernet/sun/sunvnet.c | 20 +++++++++++++++++++-
1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/sun/sunvnet.c b/drivers/net/ethernet/sun/sunvnet.c
index 3df56840a3b9..398faff8be7a 100644
--- a/drivers/net/ethernet/sun/sunvnet.c
+++ b/drivers/net/ethernet/sun/sunvnet.c
@@ -1083,6 +1083,24 @@ static struct vnet *vnet_find_or_create(const u64 *local_mac)
return vp;
}

+static void vnet_cleanup(void)
+{
+ struct vnet *vp;
+ struct net_device *dev;
+
+ mutex_lock(&vnet_list_mutex);
+ while (!list_empty(&vnet_list)) {
+ vp = list_first_entry(&vnet_list, struct vnet, list);
+ list_del(&vp->list);
+ dev = vp->dev;
+ /* vio_unregister_driver() should have cleaned up port_list */
+ BUG_ON(!list_empty(&vp->port_list));
+ unregister_netdev(dev);
+ free_netdev(dev);
+ }
+ mutex_unlock(&vnet_list_mutex);
+}
+
static const char *local_mac_prop = "local-mac-address";

static struct vnet *vnet_find_parent(struct mdesc_handle *hp,
@@ -1240,7 +1258,6 @@ static int vnet_port_remove(struct vio_dev *vdev)

kfree(port);

- unregister_netdev(vp->dev);
}
return 0;
}
@@ -1268,6 +1285,7 @@ static int __init vnet_init(void)
static void __exit vnet_exit(void)
{
vio_unregister_driver(&vnet_port_driver);
+ vnet_cleanup();
}

module_init(vnet_init);

Luis Henriques

Aug 18, 2014, 6:00:06 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Silesh C V <svel...@mvista.com>

commit aed8adb7688d5744cb484226820163af31d2499a upstream.

Commit 079148b919d0 ("coredump: factor out the setting of PF_DUMPCORE")
cleaned up the setting of PF_DUMPCORE by removing it from all the
linux_binfmt->core_dump() implementations and moving it to
zap_threads(). But this ended up clearing all the previously set
flags, which causes issues during core generation when tsk->flags is
checked again (e.g. for PF_USED_MATH to dump floating-point
registers). Fix this.
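
The difference between the two statements is easy to demonstrate; the
flag values below are made up for illustration (the real PF_*
constants differ):

#include <stdio.h>

#define PF_USED_MATH 0x1        /* illustrative values only */
#define PF_DUMPCORE  0x2

int main(void)
{
        unsigned int flags = PF_USED_MATH;

        unsigned int clobbered = PF_DUMPCORE;         /* '=' drops PF_USED_MATH */
        unsigned int preserved = flags | PF_DUMPCORE; /* '|=' keeps it */

        printf("clobbered=%#x preserved=%#x\n", clobbered, preserved);
        return 0;
}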

Signed-off-by: Silesh C V <svel...@mvista.com>
Acked-by: Oleg Nesterov <ol...@redhat.com>
Cc: Mandeep Singh Baines <m...@chromium.org>
Signed-off-by: Andrew Morton <ak...@linux-foundation.org>
Signed-off-by: Linus Torvalds <torv...@linux-foundation.org>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
fs/coredump.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/coredump.c b/fs/coredump.c
index 729a2ededcb9..b2fa3937a7eb 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -302,7 +302,7 @@ static int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
if (unlikely(nr < 0))
return nr;

- tsk->flags = PF_DUMPCORE;
+ tsk->flags |= PF_DUMPCORE;
if (atomic_read(&mm->mm_users) == nr + 1)
goto done;
/*

Luis Henriques

Aug 18, 2014, 6:00:07 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: willy tarreau <w...@1wt.eu>

commit 74c41b048db1073a04827d7f39e95ac1935524cc upstream.

Stats writers are mvneta_rx() and mvneta_tx(). They don't lock anything
when they update the stats, and as a result, it randomly happens that
the stats freeze on SMP if two updates happen during stats retrieval.
This is very easily reproducible by starting two HTTP servers and binding
each of them to a different CPU, then consulting /proc/net/dev in loops
during transfers; the interface should immediately lock up. This issue
also randomly happens upon link state changes during transfers, because
the stats are collected in this situation, but it takes more attempts to
reproduce it.

The comments in netdevice.h suggest using per_cpu stats instead to get
rid of this issue.

This patch implements this. It merges both rx_stats and tx_stats into
a single "stats" member with a single syncp. Both mvneta_rx() and
mvneta_tx() now update only a single CPU's counters.

In turn, mvneta_get_stats64() does the summing by iterating over all CPUs
to get their respective stats.

With this change, stats are still correct and the lockup is no longer encountered.

Note that this bug has been present since the first import of the mvneta
driver. It might make sense to backport it to some stable trees. If
so, it depends on "d33dc73 net: mvneta: increase the 64-bit rx/tx stats
out of the hot path".
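
A single-threaded userspace sketch of the per-CPU counter pattern (no
memory barriers here; the real u64_stats_sync machinery handles those,
and the names are stand-ins):

#include <stdint.h>
#include <stdio.h>

#define NCPUS 4

struct pcpu_stats {
        unsigned int seq;       /* bumped before and after each update */
        uint64_t rx_packets;
        uint64_t rx_bytes;
};

static struct pcpu_stats stats[NCPUS];

/* writer: only ever runs on its own cpu, so no lock is needed */
static void stats_update(int cpu, uint64_t pkts, uint64_t bytes)
{
        struct pcpu_stats *s = &stats[cpu];

        s->seq++;               /* begin (odd = update in progress) */
        s->rx_packets += pkts;
        s->rx_bytes   += bytes;
        s->seq++;               /* end (even = stable) */
}

/* reader: retry if a writer was active, then sum over all cpus */
static void stats_read(uint64_t *pkts, uint64_t *bytes)
{
        *pkts = *bytes = 0;
        for (int cpu = 0; cpu < NCPUS; cpu++) {
                struct pcpu_stats *s = &stats[cpu];
                unsigned int start;
                uint64_t p, b;

                do {
                        start = s->seq;
                        p = s->rx_packets;
                        b = s->rx_bytes;
                } while (s->seq != start || (start & 1));

                *pkts += p;
                *bytes += b;
        }
}

int main(void)
{
        uint64_t p, b;

        stats_update(0, 3, 1500);
        stats_update(2, 1, 60);
        stats_read(&p, &b);
        printf("rx: %llu packets, %llu bytes\n",
               (unsigned long long)p, (unsigned long long)b);
        return 0;
}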

Cc: Thomas Petazzoni <thomas.p...@free-electrons.com>
Cc: Gregory CLEMENT <gregory...@free-electrons.com>
Reviewed-by: Eric Dumazet <edum...@google.com>
Tested-by: Arnaud Ebalard <ar...@natisbad.org>
Signed-off-by: Willy Tarreau <w...@1wt.eu>
Signed-off-by: David S. Miller <da...@davemloft.net>
[wt: port to 3.10 : u64_stats_init() does not exist in 3.10 and is not needed]
Signed-off-by: Willy Tarreau <w...@1wt.eu>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
drivers/net/ethernet/marvell/mvneta.c | 74 +++++++++++++++++++++++------------
1 file changed, 48 insertions(+), 26 deletions(-)

diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c
index 64bf21be7ff9..18cee4f1590e 100644
--- a/drivers/net/ethernet/marvell/mvneta.c
+++ b/drivers/net/ethernet/marvell/mvneta.c
@@ -221,10 +221,12 @@

#define MVNETA_RX_BUF_SIZE(pkt_size) ((pkt_size) + NET_SKB_PAD)

-struct mvneta_stats {
+struct mvneta_pcpu_stats {
struct u64_stats_sync syncp;
- u64 packets;
- u64 bytes;
+ u64 rx_packets;
+ u64 rx_bytes;
+ u64 tx_packets;
+ u64 tx_bytes;
};

struct mvneta_port {
@@ -250,8 +252,7 @@ struct mvneta_port {
u8 mcast_count[256];
u16 tx_ring_size;
u16 rx_ring_size;
- struct mvneta_stats tx_stats;
- struct mvneta_stats rx_stats;
+ struct mvneta_pcpu_stats *stats;

struct mii_bus *mii_bus;
struct phy_device *phy_dev;
@@ -430,21 +431,29 @@ struct rtnl_link_stats64 *mvneta_get_stats64(struct net_device *dev,
{
struct mvneta_port *pp = netdev_priv(dev);
unsigned int start;
+ int cpu;

- memset(stats, 0, sizeof(struct rtnl_link_stats64));
-
- do {
- start = u64_stats_fetch_begin_bh(&pp->rx_stats.syncp);
- stats->rx_packets = pp->rx_stats.packets;
- stats->rx_bytes = pp->rx_stats.bytes;
- } while (u64_stats_fetch_retry_bh(&pp->rx_stats.syncp, start));
+ for_each_possible_cpu(cpu) {
+ struct mvneta_pcpu_stats *cpu_stats;
+ u64 rx_packets;
+ u64 rx_bytes;
+ u64 tx_packets;
+ u64 tx_bytes;

+ cpu_stats = per_cpu_ptr(pp->stats, cpu);
+ do {
+ start = u64_stats_fetch_begin_bh(&cpu_stats->syncp);
+ rx_packets = cpu_stats->rx_packets;
+ rx_bytes = cpu_stats->rx_bytes;
+ tx_packets = cpu_stats->tx_packets;
+ tx_bytes = cpu_stats->tx_bytes;
+ } while (u64_stats_fetch_retry_bh(&cpu_stats->syncp, start));

- do {
- start = u64_stats_fetch_begin_bh(&pp->tx_stats.syncp);
- stats->tx_packets = pp->tx_stats.packets;
- stats->tx_bytes = pp->tx_stats.bytes;
- } while (u64_stats_fetch_retry_bh(&pp->tx_stats.syncp, start));
+ stats->rx_packets += rx_packets;
+ stats->rx_bytes += rx_bytes;
+ stats->tx_packets += tx_packets;
+ stats->tx_bytes += tx_bytes;
+ }

stats->rx_errors = dev->stats.rx_errors;
stats->rx_dropped = dev->stats.rx_dropped;
@@ -1420,10 +1429,12 @@ static int mvneta_rx(struct mvneta_port *pp, int rx_todo,
}

if (rcvd_pkts) {
- u64_stats_update_begin(&pp->rx_stats.syncp);
- pp->rx_stats.packets += rcvd_pkts;
- pp->rx_stats.bytes += rcvd_bytes;
- u64_stats_update_end(&pp->rx_stats.syncp);
+ struct mvneta_pcpu_stats *stats = this_cpu_ptr(pp->stats);
+
+ u64_stats_update_begin(&stats->syncp);
+ stats->rx_packets += rcvd_pkts;
+ stats->rx_bytes += rcvd_bytes;
+ u64_stats_update_end(&stats->syncp);
}

/* Update rxq management counters */
@@ -1556,11 +1567,12 @@ static int mvneta_tx(struct sk_buff *skb, struct net_device *dev)

out:
if (frags > 0) {
- u64_stats_update_begin(&pp->tx_stats.syncp);
- pp->tx_stats.packets++;
- pp->tx_stats.bytes += skb->len;
- u64_stats_update_end(&pp->tx_stats.syncp);
+ struct mvneta_pcpu_stats *stats = this_cpu_ptr(pp->stats);

+ u64_stats_update_begin(&stats->syncp);
+ stats->tx_packets++;
+ stats->tx_bytes += skb->len;
+ u64_stats_update_end(&stats->syncp);
} else {
dev->stats.tx_dropped++;
dev_kfree_skb_any(skb);
@@ -2768,6 +2780,13 @@ static int mvneta_probe(struct platform_device *pdev)
goto err_clk;
}

+ /* Alloc per-cpu stats */
+ pp->stats = alloc_percpu(struct mvneta_pcpu_stats);
+ if (!pp->stats) {
+ err = -ENOMEM;
+ goto err_unmap;
+ }
+
dt_mac_addr = of_get_mac_address(dn);
if (dt_mac_addr && is_valid_ether_addr(dt_mac_addr)) {
mac_from = "device tree";
@@ -2797,7 +2816,7 @@ static int mvneta_probe(struct platform_device *pdev)
err = mvneta_init(pp, phy_addr);
if (err < 0) {
dev_err(&pdev->dev, "can't init eth hal\n");
- goto err_unmap;
+ goto err_free_stats;
}
mvneta_port_power_up(pp, phy_mode);

@@ -2827,6 +2846,8 @@ static int mvneta_probe(struct platform_device *pdev)

err_deinit:
mvneta_deinit(pp);
+err_free_stats:
+ free_percpu(pp->stats);
err_unmap:
iounmap(pp->base);
err_clk:
@@ -2847,6 +2868,7 @@ static int mvneta_remove(struct platform_device *pdev)
unregister_netdev(dev);
mvneta_deinit(pp);
clk_disable_unprepare(pp->clk);
+ free_percpu(pp->stats);
iounmap(pp->base);
irq_dispose_mapping(dev->irq);
free_netdev(dev);

Luis Henriques

Aug 18, 2014, 6:00:07 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Johannes Berg <johann...@intel.com>

commit 08b9939997df30e42a228e1ecb97f99e9c8ea84e upstream.

This reverts commit 277d916fc2e959c3f106904116bb4f7b1148d47a as it was
at least breaking iwlwifi by setting the IEEE80211_TX_CTL_NO_PS_BUFFER
flag in all kinds of interface modes, not only for AP mode where it is
appropriate.

To avoid reintroducing the original problem, explicitly check for probe
request frames in the multicast buffering code.
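
The classification the revert restores can be sketched as a predicate
(the enum and helpers below are stand-ins for the real frame-control
tests in linux/ieee80211.h):

#include <stdbool.h>
#include <stdio.h>

enum subtype { MGMT_DEAUTH, MGMT_DISASSOC, MGMT_ACTION, MGMT_PROBE_REQ, DATA };

static bool is_mgmt(enum subtype s)     { return s != DATA; }
static bool is_deauth(enum subtype s)   { return s == MGMT_DEAUTH; }
static bool is_disassoc(enum subtype s) { return s == MGMT_DISASSOC; }
static bool is_action(enum subtype s)   { return s == MGMT_ACTION; }

/* a management frame may sit in the PS buffers only if it is deauth,
 * disassoc or action; anything else must bypass them */
static bool bufferable_mmpdu(enum subtype s)
{
        return !is_mgmt(s) || is_deauth(s) || is_disassoc(s) || is_action(s);
}

int main(void)
{
        printf("probe req bufferable? %d\n", bufferable_mmpdu(MGMT_PROBE_REQ));
        printf("deauth    bufferable? %d\n", bufferable_mmpdu(MGMT_DEAUTH));
        return 0;
}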

Fixes: 277d916fc2e9 ("mac80211: move "bufferable MMPDU" check to fix AP mode scan")
Signed-off-by: Johannes Berg <johann...@intel.com>
[ luis: backported to 3.11: based on Johannes' backport for 3.10 ]
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
net/mac80211/tx.c | 26 +++++++++++++-------------
1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 5fd26c6bad2a..cc3be10bab23 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -398,6 +398,9 @@ ieee80211_tx_h_multicast_ps_buf(struct ieee80211_tx_data *tx)
if (ieee80211_has_order(hdr->frame_control))
return TX_CONTINUE;

+ if (ieee80211_is_probe_req(hdr->frame_control))
+ return TX_CONTINUE;
+
if (tx->local->hw.flags & IEEE80211_HW_QUEUE_CONTROL)
info->hw_queue = tx->sdata->vif.cab_queue;

@@ -448,6 +451,7 @@ ieee80211_tx_h_unicast_ps_buf(struct ieee80211_tx_data *tx)
{
struct sta_info *sta = tx->sta;
struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
+ struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)tx->skb->data;
struct ieee80211_local *local = tx->local;

if (unlikely(!sta))
@@ -458,6 +462,15 @@ ieee80211_tx_h_unicast_ps_buf(struct ieee80211_tx_data *tx)
!(info->flags & IEEE80211_TX_CTL_NO_PS_BUFFER))) {
int ac = skb_get_queue_mapping(tx->skb);

+ /* only deauth, disassoc and action are bufferable MMPDUs */
+ if (ieee80211_is_mgmt(hdr->frame_control) &&
+ !ieee80211_is_deauth(hdr->frame_control) &&
+ !ieee80211_is_disassoc(hdr->frame_control) &&
+ !ieee80211_is_action(hdr->frame_control)) {
+ info->flags |= IEEE80211_TX_CTL_NO_PS_BUFFER;
+ return TX_CONTINUE;
+ }
+
ps_dbg(sta->sdata, "STA %pM aid %d: PS buffer for AC %d\n",
sta->sta.addr, sta->sta.aid, ac);
if (tx->local->total_ps_buffered >= TOTAL_MAX_TX_BUFFER)
@@ -515,22 +528,9 @@ ieee80211_tx_h_unicast_ps_buf(struct ieee80211_tx_data *tx)
static ieee80211_tx_result debug_noinline
ieee80211_tx_h_ps_buf(struct ieee80211_tx_data *tx)
{
- struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
- struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)tx->skb->data;
-
if (unlikely(tx->flags & IEEE80211_TX_PS_BUFFERED))
return TX_CONTINUE;

- /* only deauth, disassoc and action are bufferable MMPDUs */
- if (ieee80211_is_mgmt(hdr->frame_control) &&
- !ieee80211_is_deauth(hdr->frame_control) &&
- !ieee80211_is_disassoc(hdr->frame_control) &&
- !ieee80211_is_action(hdr->frame_control)) {
- if (tx->flags & IEEE80211_TX_UNICAST)
- info->flags |= IEEE80211_TX_CTL_NO_PS_BUFFER;
- return TX_CONTINUE;
- }
-
if (tx->flags & IEEE80211_TX_UNICAST)
return ieee80211_tx_h_unicast_ps_buf(tx);
else

Luis Henriques

unread,
Aug 18, 2014, 6:00:07 AM8/18/14
to
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: "David S. Miller" <da...@davemloft.net>

commit 5aa4ecfd0ddb1e6dcd1c886e6c49677550f581aa upstream.

This is to prevent previous stores from overlapping the block stores
done by the memcpy loop.

Based upon a glibc patch by Jose E. Marchesi

Signed-off-by: David S. Miller <da...@davemloft.net>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
arch/sparc/lib/NG2memcpy.S | 1 +
1 file changed, 1 insertion(+)

diff --git a/arch/sparc/lib/NG2memcpy.S b/arch/sparc/lib/NG2memcpy.S
index 2c20ad63ddbf..30eee6e8a81b 100644
--- a/arch/sparc/lib/NG2memcpy.S
+++ b/arch/sparc/lib/NG2memcpy.S
@@ -236,6 +236,7 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */
*/
VISEntryHalf

+ membar #Sync
alignaddr %o1, %g0, %g0

add %o1, (64 - 1), %o4

Luis Henriques

Aug 18, 2014, 6:00:08 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: Andy Lutomirski <lu...@amacapital.net>

commit 7209a75d2009dbf7745e2fd354abf25c3deb3ca3 upstream.

This moves the espfix64 logic into native_iret. To make this work,
it gets rid of the native patch for INTERRUPT_RETURN:
INTERRUPT_RETURN on native kernels is now 'jmp native_iret'.

This changes the 16-bit SS behavior on Xen from OOPSing to leaking
some bits of the Xen hypervisor's RSP (I think).

[ hpa: this is a nonzero cost on native, but probably not enough to
measure. Xen needs to fix this in their own code, probably doing
something equivalent to espfix64. ]

Signed-off-by: Andy Lutomirski <lu...@amacapital.net>
Link: http://lkml.kernel.org/r/7b8f1d8ef6597cb16ae004a43c5698...@amacapital.net
Signed-off-by: H. Peter Anvin <h...@linux.intel.com>
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
arch/x86/include/asm/irqflags.h | 2 +-
arch/x86/kernel/entry_64.S | 28 ++++++++++------------------
arch/x86/kernel/paravirt_patch_64.c | 2 --
3 files changed, 11 insertions(+), 21 deletions(-)

diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index bba3cf88e624..0a8b519226b8 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -129,7 +129,7 @@ static inline notrace unsigned long arch_local_irq_save(void)

#define PARAVIRT_ADJUST_EXCEPTION_FRAME /* */

-#define INTERRUPT_RETURN iretq
+#define INTERRUPT_RETURN jmp native_iret
#define USERGS_SYSRET64 \
swapgs; \
sysretq;
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 01cbb4eb9179..87986bea82b7 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1056,27 +1056,24 @@ restore_args:
RESTORE_ARGS 1,8,1

irq_return:
+ INTERRUPT_RETURN
+
+ENTRY(native_iret)
/*
* Are we returning to a stack segment from the LDT? Note: in
* 64-bit mode SS:RSP on the exception stack is always valid.
*/
#ifdef CONFIG_X86_ESPFIX64
testb $4,(SS-RIP)(%rsp)
- jnz irq_return_ldt
+ jnz native_irq_return_ldt
#endif

-irq_return_iret:
- INTERRUPT_RETURN
- _ASM_EXTABLE(irq_return_iret, bad_iret)
-
-#ifdef CONFIG_PARAVIRT
-ENTRY(native_iret)
+native_irq_return_iret:
iretq
- _ASM_EXTABLE(native_iret, bad_iret)
-#endif
+ _ASM_EXTABLE(native_irq_return_iret, bad_iret)

#ifdef CONFIG_X86_ESPFIX64
-irq_return_ldt:
+native_irq_return_ldt:
pushq_cfi %rax
pushq_cfi %rdi
SWAPGS
@@ -1098,7 +1095,7 @@ irq_return_ldt:
SWAPGS
movq %rax,%rsp
popq_cfi %rax
- jmp irq_return_iret
+ jmp native_irq_return_iret
#endif

.section .fixup,"ax"
@@ -1184,13 +1181,8 @@ __do_double_fault:
cmpl $__KERNEL_CS,CS(%rdi)
jne do_double_fault
movq RIP(%rdi),%rax
- cmpq $irq_return_iret,%rax
-#ifdef CONFIG_PARAVIRT
- je 1f
- cmpq $native_iret,%rax
-#endif
+ cmpq $native_irq_return_iret,%rax
jne do_double_fault /* This shouldn't happen... */
-1:
movq PER_CPU_VAR(kernel_stack),%rax
subq $(6*8-KERNEL_STACK_OFFSET),%rax /* Reset to original stack */
movq %rax,RSP(%rdi)
@@ -1680,7 +1672,7 @@ error_sti:
*/
error_kernelspace:
incl %ebx
- leaq irq_return_iret(%rip),%rcx
+ leaq native_irq_return_iret(%rip),%rcx
cmpq %rcx,RIP+8(%rsp)
je error_swapgs
movl %ecx,%eax /* zero extend */
diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
index 3f08f34f93eb..a1da6737ba5b 100644
--- a/arch/x86/kernel/paravirt_patch_64.c
+++ b/arch/x86/kernel/paravirt_patch_64.c
@@ -6,7 +6,6 @@ DEF_NATIVE(pv_irq_ops, irq_disable, "cli");
DEF_NATIVE(pv_irq_ops, irq_enable, "sti");
DEF_NATIVE(pv_irq_ops, restore_fl, "pushq %rdi; popfq");
DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax");
-DEF_NATIVE(pv_cpu_ops, iret, "iretq");
DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax");
DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax");
DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3");
@@ -50,7 +49,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
PATCH_SITE(pv_irq_ops, save_fl);
PATCH_SITE(pv_irq_ops, irq_enable);
PATCH_SITE(pv_irq_ops, irq_disable);
- PATCH_SITE(pv_cpu_ops, iret);
PATCH_SITE(pv_cpu_ops, irq_enable_sysexit);
PATCH_SITE(pv_cpu_ops, usergs_sysret32);
PATCH_SITE(pv_cpu_ops, usergs_sysret64);

Luis Henriques

Aug 18, 2014, 6:00:07 AM
3.11.10.15 -stable review patch. If anyone has any objections, please let me know.

------------------

From: John Stultz <john....@linaro.org>

commit aac74dc495456412c4130a1167ce4beb6c1f0b38 upstream.

After learning we'll need some sort of deferred printk functionality in
the timekeeping core, Peter suggested we rename the printk_sched function
so it can be reused by the subsystems that need it.

This only changes the function name. No logic changes.

Signed-off-by: John Stultz <john....@linaro.org>
Reviewed-by: Steven Rostedt <ros...@goodmis.org>
Cc: Jan Kara <ja...@suse.cz>
Cc: Peter Zijlstra <pet...@infradead.org>
Cc: Jiri Bohac <jbo...@suse.cz>
Cc: Thomas Gleixner <tg...@linutronix.de>
Cc: Ingo Molnar <mi...@redhat.com>
Signed-off-by: Andrew Morton <ak...@linux-foundation.org>
Signed-off-by: Linus Torvalds <torv...@linux-foundation.org>
[ luis: prereq for 504d58745c9c ("timer: Fix lock inversion between
hrtimer_bases.lock and scheduler locks"); backported to 3.11:
- dropped changes to kernel/sched/deadline.c ]
Signed-off-by: Luis Henriques <luis.he...@canonical.com>
---
include/linux/printk.h | 6 +++---
kernel/printk/printk.c | 2 +-
kernel/sched/core.c | 2 +-
kernel/sched/rt.c | 2 +-
4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/include/linux/printk.h b/include/linux/printk.h
index 22c7052e9372..708b8a84f6c0 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -124,9 +124,9 @@ asmlinkage __printf(1, 2) __cold
int printk(const char *fmt, ...);

/*
- * Special printk facility for scheduler use only, _DO_NOT_USE_ !
+ * Special printk facility for scheduler/timekeeping use only, _DO_NOT_USE_ !
*/
-__printf(1, 2) __cold int printk_sched(const char *fmt, ...);
+__printf(1, 2) __cold int printk_deferred(const char *fmt, ...);

/*
* Please don't use printk_ratelimit(), because it shares ratelimiting state
@@ -161,7 +161,7 @@ int printk(const char *s, ...)
return 0;
}
static inline __printf(1, 2) __cold
-int printk_sched(const char *s, ...)
+int printk_deferred(const char *s, ...)
{
return 0;
}
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 41832ed685a9..385d33583931 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -2463,7 +2463,7 @@ void wake_up_klogd(void)
preempt_enable();
}

-int printk_sched(const char *fmt, ...)
+int printk_deferred(const char *fmt, ...)
{
unsigned long flags;
va_list args;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 68924be0e1e1..65f48e3b7100 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1239,7 +1239,7 @@ out:
* leave kernel.
*/
if (p->mm && printk_ratelimit()) {
- printk_sched("process %d (%s) no longer affine to cpu%d\n",
+ printk_deferred("process %d (%s) no longer affine to cpu%d\n",
task_pid_nr(p), p->comm, cpu);
}
}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 417b1b3fd7e9..8d5724775918 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -827,7 +827,7 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)

if (!once) {
once = true;
- printk_sched("sched: RT throttling activated\n");
+ printk_deferred("sched: RT throttling activated\n");
}
} else {
/*