From 5444e381f5784d32d741864312909d2a6afe428e Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 27 Aug 2013 05:46:32 -0700
Subject: [PATCH 01/47] tcp: TSO packets automatic sizing

[ Upstream commits 6d36824e730f247b602c90e8715a792003e3c5a7,
  02cf4ebd82ff0ac7254b88e466820a290ed8289a, and parts of
  7eec4174ff29cd42f2acfae8112f51c228545d40 ]

After hearing many people over the past years complain about TSO being
bursty or even buggy, we are proud to present automatic sizing of TSO
packets.

One part of the problem is that tcp_tso_should_defer() uses a heuristic
relying on upcoming ACKs instead of a timer. More generally, big TSO
packets make little sense at low rates, as they tend to create micro
bursts on the network, and the general consensus is to reduce the
amount of buffering.

This patch introduces a per-socket sk_pacing_rate that approximates
the current sending rate and allows us to size TSO packets so that we
try to send one packet every ms.

This field could be set by other transports.

The patch has no impact on high-speed flows, where having large TSO
packets makes sense to reach line rate.

For other flows, it improves packet scheduling and ACK clocking.

This patch increases the performance of TCP flows in lossy environments.

A new sysctl (tcp_min_tso_segs) is added to specify the minimal size of
a TSO packet, in segments (the default being 2).
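
For illustration, the knob can be read and tuned like any other TCP
sysctl (a sketch, not captured output; the value shown is the default):

$ sysctl net.ipv4.tcp_min_tso_segs
net.ipv4.tcp_min_tso_segs = 2
$ sysctl -w net.ipv4.tcp_min_tso_segs=8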

A follow-up patch will provide a new packet scheduler (FQ), using
sk_pacing_rate as an input to perform optional per flow pacing.

This explains why we chose to set sk_pacing_rate to twice the current
rate, allowing 'slow start' ramp up.

sk_pacing_rate = 2 * cwnd * mss / srtt
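
As a rough worked example (illustrative numbers, not from this patch):
with cwnd = 10, mss = 1448 bytes and srtt = 100 ms, this gives
2 * 10 * 1448 / 0.1 = 289600 bytes/sec (~2.3 Mbit/s). At such a low
rate the sizing falls back to the tcp_min_tso_segs * mss_now floor,
since rate / (2 * MSEC_PER_SEC) is below two segments.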

v2: Neal Cardwell reported a suspect deferral of the last two segments
on an initial write of 10 MSS; I had to change tcp_tso_should_defer()
to take tp->xmit_size_goal_segs into account.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Van Jacobson <vanj@google.com>
Cc: Tom Herbert <therbert@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt |  9 +++++++++
 include/net/sock.h                     |  2 ++
 include/net/tcp.h                      |  1 +
 net/core/sock.c                        |  1 +
 net/ipv4/sysctl_net_ipv4.c             | 10 ++++++++++
 net/ipv4/tcp.c                         | 28 +++++++++++++++++++++++-----
 net/ipv4/tcp_input.c                   | 34 +++++++++++++++++++++++++++++++++-
 net/ipv4/tcp_output.c                  |  2 +-
 8 files changed, 80 insertions(+), 7 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 1074290..b522883 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -478,6 +478,15 @@ tcp_syn_retries - INTEGER
 tcp_timestamps - BOOLEAN
 	Enable timestamps as defined in RFC1323.
 
+tcp_min_tso_segs - INTEGER
+	Minimal number of segments per TSO frame.
+	Since linux-3.12, TCP does an automatic sizing of TSO frames,
+	depending on flow rate, instead of filling 64Kbytes packets.
+	For specific usages, it's possible to force TCP to build big
+	TSO frames. Note that TCP stack might split too big TSO packets
+	if available window is too small.
+	Default: 2
+
 tcp_tso_win_divisor - INTEGER
 	This allows control over what percentage of the congestion window
 	can be consumed by a single TSO frame.
diff --git a/include/net/sock.h b/include/net/sock.h
index 31d5cfb..04e148f 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -232,6 +232,7 @@ struct cg_proto;
   *	@sk_napi_id: id of the last napi context to receive data for sk
   *	@sk_ll_usec: usecs to busypoll when there is no data
   *	@sk_allocation: allocation mode
+  *	@sk_pacing_rate: Pacing rate (if supported by transport/packet scheduler)
   *	@sk_sndbuf: size of send buffer in bytes
   *	@sk_flags: %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
   *		   %SO_OOBINLINE settings, %SO_TIMESTAMPING settings
@@ -361,6 +362,7 @@ struct sock {
 	kmemcheck_bitfield_end(flags);
 	int			sk_wmem_queued;
 	gfp_t			sk_allocation;
+	u32			sk_pacing_rate; /* bytes per second */
 	netdev_features_t	sk_route_caps;
 	netdev_features_t	sk_route_nocaps;
 	int			sk_gso_type;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index d198005..46cb8a4 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -284,6 +284,7 @@ extern int sysctl_tcp_thin_dupack;
 extern int sysctl_tcp_early_retrans;
 extern int sysctl_tcp_limit_output_bytes;
 extern int sysctl_tcp_challenge_ack_limit;
+extern int sysctl_tcp_min_tso_segs;
 
 extern atomic_long_t tcp_memory_allocated;
 extern struct percpu_counter tcp_sockets_allocated;
diff --git a/net/core/sock.c b/net/core/sock.c
index 2c097c5..8729d91 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2297,6 +2297,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
 	sk->sk_ll_usec		=	sysctl_net_busy_read;
 #endif
 
+	sk->sk_pacing_rate = ~0U;
 	/*
 	 * Before updating sk_refcnt, we must commit prior changes to memory
 	 * (Documentation/RCU/rculist_nulls.txt for details)
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 610e324..6900b8b 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -29,6 +29,7 @@
 static int zero;
 static int one = 1;
 static int four = 4;
+static int gso_max_segs = GSO_MAX_SEGS;
 static int tcp_retr1_max = 255;
 static int ip_local_port_range_min[] = { 1, 1 };
 static int ip_local_port_range_max[] = { 65535, 65535 };
@@ -754,6 +755,15 @@ static struct ctl_table ipv4_table[] = {
 		.extra2		= &four,
 	},
 	{
+		.procname	= "tcp_min_tso_segs",
+		.data		= &sysctl_tcp_min_tso_segs,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &gso_max_segs,
+	},
+	{
 		.procname	= "udp_mem",
 		.data		= &sysctl_udp_mem,
 		.maxlen		= sizeof(sysctl_udp_mem),
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 95544e4..ec586e5 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -283,6 +283,8 @@
 
 int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
 
+int sysctl_tcp_min_tso_segs __read_mostly = 2;
+
 struct percpu_counter tcp_orphan_count;
 EXPORT_SYMBOL_GPL(tcp_orphan_count);
 
@@ -789,12 +791,28 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
 	xmit_size_goal = mss_now;
 
 	if (large_allowed && sk_can_gso(sk)) {
-		xmit_size_goal = ((sk->sk_gso_max_size - 1) -
-				  inet_csk(sk)->icsk_af_ops->net_header_len -
-				  inet_csk(sk)->icsk_ext_hdr_len -
-				  tp->tcp_header_len);
+		u32 gso_size, hlen;
+
+		/* Maybe we should/could use sk->sk_prot->max_header here ? */
+		hlen = inet_csk(sk)->icsk_af_ops->net_header_len +
+		       inet_csk(sk)->icsk_ext_hdr_len +
+		       tp->tcp_header_len;
+
+		/* Goal is to send at least one packet per ms,
+		 * not one big TSO packet every 100 ms.
+		 * This preserves ACK clocking and is consistent
+		 * with tcp_tso_should_defer() heuristic.
+		 */
+		gso_size = sk->sk_pacing_rate / (2 * MSEC_PER_SEC);
+		gso_size = max_t(u32, gso_size,
+				 sysctl_tcp_min_tso_segs * mss_now);
+
+		xmit_size_goal = min_t(u32, gso_size,
+				       sk->sk_gso_max_size - 1 - hlen);
 
-		/* TSQ : try to have two TSO segments in flight */
+		/* TSQ : try to have at least two segments in flight
+		 * (one in NIC TX ring, another in Qdisc)
+		 */
 		xmit_size_goal = min_t(u32, xmit_size_goal,
 				       sysctl_tcp_limit_output_bytes >> 1);
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 3ca2139..2f0e94b 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -688,6 +688,34 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
 	}
 }
 
+/* Set the sk_pacing_rate to allow proper sizing of TSO packets.
+ * Note: TCP stack does not yet implement pacing.
+ * FQ packet scheduler can be used to implement cheap but effective
+ * TCP pacing, to smooth the burst on large writes when packets
+ * in flight is significantly lower than cwnd (or rwin)
+ */
+static void tcp_update_pacing_rate(struct sock *sk)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	u64 rate;
+
+	/* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */
+	rate = (u64)tp->mss_cache * 2 * (HZ << 3);
+
+	rate *= max(tp->snd_cwnd, tp->packets_out);
+
+	/* Correction for small srtt : minimum srtt being 8 (1 jiffy << 3),
+	 * be conservative and assume srtt = 1 (125 us instead of 1.25 ms)
+	 * We probably need usec resolution in the future.
+	 * Note: This also takes care of possible srtt=0 case,
+	 * when tcp_rtt_estimator() was not yet called.
+	 */
+	if (tp->srtt > 8 + 2)
+		do_div(rate, tp->srtt);
+
+	sk->sk_pacing_rate = min_t(u64, rate, ~0U);
+}
+
 /* Calculate rto without backoff.  This is the second half of Van Jacobson's
  * routine referred to above.
  */
@@ -3269,7 +3297,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	u32 ack_seq = TCP_SKB_CB(skb)->seq;
 	u32 ack = TCP_SKB_CB(skb)->ack_seq;
 	bool is_dupack = false;
-	u32 prior_in_flight;
+	u32 prior_in_flight, prior_cwnd = tp->snd_cwnd, prior_rtt = tp->srtt;
 	u32 prior_fackets;
 	int prior_packets = tp->packets_out;
 	const int prior_unsacked = tp->packets_out - tp->sacked_out;
@@ -3375,6 +3403,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 
 	if (icsk->icsk_pending == ICSK_TIME_RETRANS)
 		tcp_schedule_loss_probe(sk);
+	if (tp->srtt != prior_rtt || tp->snd_cwnd != prior_cwnd)
+		tcp_update_pacing_rate(sk);
 	return 1;
 
 no_queue:
@@ -5671,6 +5701,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 		} else
 			tcp_init_metrics(sk);
 
+		tcp_update_pacing_rate(sk);
+
 		/* Prevent spurious tcp_cwnd_restart() on first data packet */
 		tp->lsndtime = tcp_time_stamp;
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 170737a..7b263c3 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1628,7 +1628,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
 
 	/* If a full-sized TSO skb can be sent, do it. */
 	if (limit >= min_t(unsigned int, sk->sk_gso_max_size,
-			   sk->sk_gso_max_segs * tp->mss_cache))
+			   tp->xmit_size_goal_segs * tp->mss_cache))
 		goto send_now;
 
 	/* Middle in queue won't get any more data, full sendable already? */
-- 
1.7.11.7


From 1b6c7d9979e1db1d42bd0545452a9d204c019582 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 27 Sep 2013 03:28:54 -0700
Subject: [PATCH 02/47] tcp: TSQ can use a dynamic limit

[ Upstream commit c9eeec26e32e087359160406f96e0949b3cc6f10 ]

When TCP Small Queues was added, we used a sysctl to limit the amount
of packets queued on Qdisc/device queues for a given TCP flow.

The problem is that this limit is either too big for low rates or too
small for high rates.

Now that the TCP stack has rate estimation in sk->sk_pacing_rate and
TSO auto-sizing, it can better control the number of packets in
Qdisc/device queues.

The new limit is two packets, or at least 1 to 2 ms worth of packets.
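
To see where "1 to 2 ms" comes from, take the new computation from the
diff below, limit = max(skb->truesize, sk->sk_pacing_rate >> 10), with
illustrative numbers: for a flow running at an actual 1 Gbit/s,
sk_pacing_rate is about 250 MB/s (twice the real rate), so
sk_pacing_rate >> 10 is roughly 244 KB, which the real 125 MB/s drains
in about 2 ms.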

Low-rate flows benefit from this patch by having an even smaller
number of packets in queues, allowing for faster recovery and better
RTT estimation.

High-rate flows benefit from this patch by being allowed more than 2
packets in flight, as we had reports that this was a limiting factor in
reaching line rate. [ In particular if TX completion is delayed because
of coalescing parameters ]

Example for a single flow on a 10Gbps link controlled by FQ/pacing

14 packets in flight instead of 2

$ tc -s -d qd
qdisc fq 8001: dev eth0 root refcnt 32 limit 10000p flow_limit 100p
buckets 1024 quantum 3028 initial_quantum 15140
 Sent 1168459366606 bytes 771822841 pkt (dropped 0, overlimits 0
requeues 6822476)
 rate 9346Mbit 771713pps backlog 953820b 14p requeues 6822476
  2047 flow, 2046 inactive, 1 throttled, delay 15673 ns
  2372 gc, 0 highprio, 0 retrans, 9739249 throttled, 0 flows_plimit

Note that sk_pacing_rate is currently set to twice the actual rate, but
this might be refined in the future when a flow is in congestion
avoidance.

Additional change: skb->destructor should be set to tcp_wfree().

A future patch (for linux 3.13+) might remove tcp_limit_output_bytes.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Wei Liu <wei.liu2@citrix.com>
Cc: Cong Wang <xiyou.wangcong@gmail.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 7b263c3..fe897ed 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -892,8 +892,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 
 	skb_orphan(skb);
 	skb->sk = sk;
-	skb->destructor = (sysctl_tcp_limit_output_bytes > 0) ?
-			  tcp_wfree : sock_wfree;
+	skb->destructor = tcp_wfree;
 	atomic_add(skb->truesize, &sk->sk_wmem_alloc);
 
 	/* Build TCP header and checksum it. */
@@ -1837,7 +1836,6 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 	while ((skb = tcp_send_head(sk))) {
 		unsigned int limit;
 
-
 		tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
 		BUG_ON(!tso_segs);
 
@@ -1866,13 +1864,20 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 				break;
 		}
 
-		/* TSQ : sk_wmem_alloc accounts skb truesize,
-		 * including skb overhead. But thats OK.
+		/* TCP Small Queues :
+		 * Control number of packets in qdisc/devices to two packets / or ~1 ms.
+		 * This allows for :
+		 *  - better RTT estimation and ACK scheduling
+		 *  - faster recovery
+		 *  - high rates
 		 */
-		if (atomic_read(&sk->sk_wmem_alloc) >= sysctl_tcp_limit_output_bytes) {
+		limit = max(skb->truesize, sk->sk_pacing_rate >> 10);
+
+		if (atomic_read(&sk->sk_wmem_alloc) > limit) {
 			set_bit(TSQ_THROTTLED, &tp->tsq_flags);
 			break;
 		}
+
 		limit = mss_now;
 		if (tso_segs > 1 && !tcp_urg_mode(tp))
 			limit = tcp_mss_split_point(sk, skb, mss_now,
-- 
1.7.11.7


From 4f25abff83e2780265eaa17d437b7659ea543bd5 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 15 Oct 2013 11:54:30 -0700
Subject: [PATCH 03/47] tcp: must unclone packets before mangling them

[ Upstream commit c52e2421f7368fd36cbe330d2cf41b10452e39a9 ]

The TCP stack should make sure it owns skbs before mangling them.

We had various crashes using bnx2x, and it turned out gso_size was
cleared right before the bnx2x driver was populating the TC descriptor
of the _previous_ packet send. The TCP stack can sometimes retransmit
packets that are still in a Qdisc.

Of course we could make the bnx2x driver more robust (using
ACCESS_ONCE(shinfo->gso_size), for example), but the bug is in the TCP
stack.

We have identified two points where skb_unclone() was needed.
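
A minimal sketch of why (illustrative only, not code from this patch):
skb_clone() shares the data buffer, and therefore skb_shinfo(), between
the original and the clone, so

	struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC);

	/* clone may now sit in a Qdisc or a driver TX ring ... */
	skb_shinfo(skb)->gso_size = 0;	/* the clone sees this write too */

skb_unclone() gives the caller a private copy of the header (and
shinfo) before such writes.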

This patch adds a WARN_ON_ONCE() to warn us if we missed another
fix of this kind.

Kudos to Neal for finding the root cause of this bug. It's visible
using a small MSS.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index fe897ed..28c0d6a 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -981,6 +981,9 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
 static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb,
 				 unsigned int mss_now)
 {
+	/* Make sure we own this skb before messing gso_size/gso_segs */
+	WARN_ON_ONCE(skb_cloned(skb));
+
 	if (skb->len <= mss_now || !sk_can_gso(sk) ||
 	    skb->ip_summed == CHECKSUM_NONE) {
 		/* Avoid the costly divide in the normal
@@ -1062,9 +1065,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
 	if (nsize < 0)
 		nsize = 0;
 
-	if (skb_cloned(skb) &&
-	    skb_is_nonlinear(skb) &&
-	    pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
+	if (skb_unclone(skb, GFP_ATOMIC))
 		return -ENOMEM;
 
 	/* Get a new skb... force flag on. */
@@ -2339,6 +2340,8 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 		int oldpcount = tcp_skb_pcount(skb);
 
 		if (unlikely(oldpcount > 1)) {
+			if (skb_unclone(skb, GFP_ATOMIC))
+				return -ENOMEM;
 			tcp_init_tso_segs(sk, skb, cur_mss);
 			tcp_adjust_pcount(sk, skb, oldpcount - tcp_skb_pcount(skb));
 		}
-- 
1.7.11.7


From 8731e25f7527ca851045eb0715d998d1ac07aadb Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 4 Oct 2013 10:31:41 -0700
Subject: [PATCH 04/47] tcp: do not forget FIN in tcp_shifted_skb()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

[ Upstream commit 5e8a402f831dbe7ee831340a91439e46f0d38acd ]

Yuchung found the following problem:

 There are bugs in the SACK processing code, merging part in
 tcp_shift_skb_data(), that incorrectly reset or ignore the sacked
 skbs' FIN flag. When a receiver first SACKs the FIN sequence, and
 later throws away the ofo queue (e.g., on SACK reneging), the sender
 will stop retransmitting the FIN flag and hang forever.

The following packetdrill test can be used to reproduce the bug.

$ cat sack-merge-bug.pkt
`sysctl -q net.ipv4.tcp_fack=0`

// Establish a connection and send 10 MSS.
0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+.000 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+.000 bind(3, ..., ...) = 0
+.000 listen(3, 1) = 0

+.050 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
+.000 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 6>
+.001 < . 1:1(0) ack 1 win 1024
+.000 accept(3, ..., ...) = 4

+.100 write(4, ..., 12000) = 12000
+.000 shutdown(4, SHUT_WR) = 0
+.000 > . 1:10001(10000) ack 1
+.050 < . 1:1(0) ack 2001 win 257
+.000 > FP. 10001:12001(2000) ack 1
+.050 < . 1:1(0) ack 2001 win 257 <sack 10001:11001,nop,nop>
+.050 < . 1:1(0) ack 2001 win 257 <sack 10001:12002,nop,nop>
// SACK reneg
+.050 < . 1:1(0) ack 12001 win 257
+0 %{ print "unacked: ",tcpi_unacked }%
+5 %{ print "" }%

First, a typo inverted the left/right operands of one OR operation;
second, the code forgot to advance end_seq if the merged skb carried a
FIN.

Bug was added in 2.6.29 by commit 832d11c5cd076ab
("tcp: Try to restore large SKBs while SACK processing")

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Cc: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Acked-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 2f0e94b..61e2360 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1279,7 +1279,10 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
 		tp->lost_cnt_hint -= tcp_skb_pcount(prev);
 	}
 
-	TCP_SKB_CB(skb)->tcp_flags |= TCP_SKB_CB(prev)->tcp_flags;
+	TCP_SKB_CB(prev)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
+	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
+		TCP_SKB_CB(prev)->end_seq++;
+
 	if (skb == tcp_highest_sack(sk))
 		tcp_advance_highest_sack(sk, skb);
 
-- 
1.7.11.7


From bfc0a00d669a4fa0835c417f01c50c18996d1e60 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Sat, 12 Oct 2013 10:16:27 -0700
Subject: [PATCH 05/47] tcp: fix incorrect ca_state in tail loss probe

[ Upstream commit 031afe4990a7c9dbff41a3a742c44d3e740ea0a1 ]

On receiving an ACK that covers the loss probe sequence, TLP
immediately sets the congestion state to Open, even though some packets
are not recovered and retransmissions are on the way. Later ACKs may
trigger a WARN_ON check in step D of tcp_fastretrans_alert(), e.g.,
https://bugzilla.redhat.com/show_bug.cgi?id=989251

The fix is to follow a procedure similar to recovery by calling
tcp_try_keep_open(). The sender switches to the Open state if no
packets were retransmitted. Otherwise it goes to Disorder and lets
subsequent ACKs move the state to Recovery or Open.

Reported-By: Michael Sterrett <michael@sterretts.net>
Tested-By: Dormando <dormando@rydia.net>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 61e2360..723951a 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3284,7 +3284,7 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
 			tcp_init_cwnd_reduction(sk, true);
 			tcp_set_ca_state(sk, TCP_CA_CWR);
 			tcp_end_cwnd_reduction(sk);
-			tcp_set_ca_state(sk, TCP_CA_Open);
+			tcp_try_keep_open(sk);
 			NET_INC_STATS_BH(sock_net(sk),
 					 LINUX_MIB_TCPLOSSPROBERECOVERY);
 		}
-- 
1.7.11.7


From 05c9fdfad860abd64136d8ccd88dbf84e40bd5f5 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 1 Oct 2013 21:04:11 -0700
Subject: [PATCH 06/47] net: do not call sock_put() on TIMEWAIT sockets

[ Upstream commit 80ad1d61e72d626e30ebe8529a0455e660ca4693 ]

commit 3ab5aee7fe84 ("net: Convert TCP & DCCP hash tables to use RCU /
hlist_nulls") incorrectly used sock_put() on TIMEWAIT sockets.

We should instead use inet_twsk_put().

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/inet_hashtables.c  | 2 +-
 net/ipv6/inet6_hashtables.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 7bd8983..96da9c7 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -287,7 +287,7 @@ begintw:
 			if (unlikely(!INET_TW_MATCH(sk, net, acookie,
 						    saddr, daddr, ports,
 						    dif))) {
-				sock_put(sk);
+				inet_twsk_put(inet_twsk(sk));
 				goto begintw;
 			}
 			goto out;
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 32b4a16..066640e 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -116,7 +116,7 @@ begintw:
 			}
 			if (unlikely(!INET6_TW_MATCH(sk, net, saddr, daddr,
 						     ports, dif))) {
-				sock_put(sk);
+				inet_twsk_put(inet_twsk(sk));
 				goto begintw;
 			}
 			goto out;
-- 
1.7.11.7


From bc7fd34d31c17b0e4c100013e77277a2ed7e15cf Mon Sep 17 00:00:00 2001
From: Matthias Schiffer <mschiffer@universe-factory.net>
Date: Fri, 27 Sep 2013 18:03:39 +0200
Subject: [PATCH 07/47] batman-adv: set up network coding packet handlers
 during module init

[ Upstream commit 6c519bad7b19a2c14a075b400edabaa630330123 ]

batman-adv saves its table of packet handlers as global state, so
handlers must be set up only once (and setting them up a second time
will fail).

The recently-added network coding support tries to set up its handler
each time a new softif is registered, which obviously fails when more
than one softif is used (and as a consequence, the softif creation
fails).

Fix this by splitting up batadv_nc_init into batadv_nc_init (which is called
only once) and batadv_nc_mesh_init (which is called for each softif); in
addition batadv_nc_free is renamed to batadv_nc_mesh_free to keep naming
consistent.

Signed-off-by: Matthias Schiffer <mschiffer@universe-factory.net>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
Signed-off-by: Antonio Quartulli <antonio@meshcoding.com>
---
 net/batman-adv/main.c           |  5 +++--
 net/batman-adv/network-coding.c | 28 ++++++++++++++++++----------
 net/batman-adv/network-coding.h | 14 ++++++++++----
 3 files changed, 31 insertions(+), 16 deletions(-)

diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c
index 08125f3..c8e0671 100644
--- a/net/batman-adv/main.c
+++ b/net/batman-adv/main.c
@@ -61,6 +61,7 @@ static int __init batadv_init(void)
 	batadv_recv_handler_init();
 
 	batadv_iv_init();
+	batadv_nc_init();
 
 	batadv_event_workqueue = create_singlethread_workqueue("bat_events");
 
@@ -138,7 +139,7 @@ int batadv_mesh_init(struct net_device *soft_iface)
 	if (ret < 0)
 		goto err;
 
-	ret = batadv_nc_init(bat_priv);
+	ret = batadv_nc_mesh_init(bat_priv);
 	if (ret < 0)
 		goto err;
 
@@ -163,7 +164,7 @@ void batadv_mesh_free(struct net_device *soft_iface)
 	batadv_vis_quit(bat_priv);
 
 	batadv_gw_node_purge(bat_priv);
-	batadv_nc_free(bat_priv);
+	batadv_nc_mesh_free(bat_priv);
 	batadv_dat_free(bat_priv);
 	batadv_bla_free(bat_priv);
 
diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c
index a487d46..4ecc0b6 100644
--- a/net/batman-adv/network-coding.c
+++ b/net/batman-adv/network-coding.c
@@ -35,6 +35,20 @@ static int batadv_nc_recv_coded_packet(struct sk_buff *skb,
 				       struct batadv_hard_iface *recv_if);
 
 /**
+ * batadv_nc_init - one-time initialization for network coding
+ */
+int __init batadv_nc_init(void)
+{
+	int ret;
+
+	/* Register our packet type */
+	ret = batadv_recv_handler_register(BATADV_CODED,
+					   batadv_nc_recv_coded_packet);
+
+	return ret;
+}
+
+/**
  * batadv_nc_start_timer - initialise the nc periodic worker
  * @bat_priv: the bat priv with all the soft interface information
  */
@@ -45,10 +59,10 @@ static void batadv_nc_start_timer(struct batadv_priv *bat_priv)
 }
 
 /**
- * batadv_nc_init - initialise coding hash table and start house keeping
+ * batadv_nc_mesh_init - initialise coding hash table and start house keeping
  * @bat_priv: the bat priv with all the soft interface information
  */
-int batadv_nc_init(struct batadv_priv *bat_priv)
+int batadv_nc_mesh_init(struct batadv_priv *bat_priv)
 {
 	bat_priv->nc.timestamp_fwd_flush = jiffies;
 	bat_priv->nc.timestamp_sniffed_purge = jiffies;
@@ -70,11 +84,6 @@ int batadv_nc_init(struct batadv_priv *bat_priv)
 	batadv_hash_set_lock_class(bat_priv->nc.coding_hash,
 				   &batadv_nc_decoding_hash_lock_class_key);
 
-	/* Register our packet type */
-	if (batadv_recv_handler_register(BATADV_CODED,
-					 batadv_nc_recv_coded_packet) < 0)
-		goto err;
-
 	INIT_DELAYED_WORK(&bat_priv->nc.work, batadv_nc_worker);
 	batadv_nc_start_timer(bat_priv);
 
@@ -1721,12 +1730,11 @@ free_nc_packet:
 }
 
 /**
- * batadv_nc_free - clean up network coding memory
+ * batadv_nc_mesh_free - clean up network coding memory
  * @bat_priv: the bat priv with all the soft interface information
  */
-void batadv_nc_free(struct batadv_priv *bat_priv)
+void batadv_nc_mesh_free(struct batadv_priv *bat_priv)
 {
-	batadv_recv_handler_unregister(BATADV_CODED);
 	cancel_delayed_work_sync(&bat_priv->nc.work);
 
 	batadv_nc_purge_paths(bat_priv, bat_priv->nc.coding_hash, NULL);
diff --git a/net/batman-adv/network-coding.h b/net/batman-adv/network-coding.h
index 85a4ec8..ddfa618 100644
--- a/net/batman-adv/network-coding.h
+++ b/net/batman-adv/network-coding.h
@@ -22,8 +22,9 @@
 
 #ifdef CONFIG_BATMAN_ADV_NC
 
-int batadv_nc_init(struct batadv_priv *bat_priv);
-void batadv_nc_free(struct batadv_priv *bat_priv);
+int batadv_nc_init(void);
+int batadv_nc_mesh_init(struct batadv_priv *bat_priv);
+void batadv_nc_mesh_free(struct batadv_priv *bat_priv);
 void batadv_nc_update_nc_node(struct batadv_priv *bat_priv,
 			      struct batadv_orig_node *orig_node,
 			      struct batadv_orig_node *orig_neigh_node,
@@ -46,12 +47,17 @@ int batadv_nc_init_debugfs(struct batadv_priv *bat_priv);
 
 #else /* ifdef CONFIG_BATMAN_ADV_NC */
 
-static inline int batadv_nc_init(struct batadv_priv *bat_priv)
+static inline int batadv_nc_init(void)
 {
 	return 0;
 }
 
-static inline void batadv_nc_free(struct batadv_priv *bat_priv)
+static inline int batadv_nc_mesh_init(struct batadv_priv *bat_priv)
+{
+	return 0;
+}
+
+static inline void batadv_nc_mesh_free(struct batadv_priv *bat_priv)
 {
 	return;
 }
-- 
1.7.11.7


From 8be4005ed947924104df5850944a20b7f6570137 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Cachereul?= <f.cachereul@alphalink.fr>
Date: Wed, 2 Oct 2013 10:16:02 +0200
Subject: [PATCH 08/47] l2tp: fix kernel panic when using IPv4-mapped IPv6
 addresses
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

[ Upstream commit e18503f41f9b12132c95d7c31ca6ee5155e44e5c ]

IPv4-mapped addresses cause a kernel panic. The patch just checks
whether the IPv6 address is an IPv4-mapped address; if so, it uses the
IPv4 API instead of the IPv6 one.
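
A condensed sketch of the detection (fields as used in the diff below;
::ffff:a.b.c.d is the IPv4-mapped form):

	struct ipv6_pinfo *np = inet6_sk(sk);

	/* if both ends are really IPv4, remember it and use the IPv4
	 * transmit and checksum paths for this tunnel from now on
	 */
	tunnel->v4mapped = ipv6_addr_v4mapped(&np->saddr) &&
			   ipv6_addr_v4mapped(&np->daddr);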

[  940.026915] general protection fault: 0000 [#1]
[  940.026915] Modules linked in: l2tp_ppp l2tp_netlink l2tp_core pppox ppp_generic slhc loop psmouse
[  940.026915] CPU: 0 PID: 3184 Comm: memcheck-amd64- Not tainted 3.11.0+ #1
[  940.026915] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2007
[  940.026915] task: ffff880007130e20 ti: ffff88000737e000 task.ti: ffff88000737e000
[  940.026915] RIP: 0010:[<ffffffff81333780>]  [<ffffffff81333780>] ip6_xmit+0x276/0x326
[  940.026915] RSP: 0018:ffff88000737fd28  EFLAGS: 00010286
[  940.026915] RAX: c748521a75ceff48 RBX: ffff880000c30800 RCX: 0000000000000000
[  940.026915] RDX: ffff88000075cc4e RSI: 0000000000000028 RDI: ffff8800060e5a40
[  940.026915] RBP: ffff8800060e5a40 R08: 0000000000000000 R09: ffff88000075cc90
[  940.026915] R10: 0000000000000000 R11: 0000000000000000 R12: ffff88000737fda0
[  940.026915] R13: 0000000000000000 R14: 0000000000002000 R15: ffff880005d3b580
[  940.026915] FS:  00007f163dc5e800(0000) GS:ffffffff81623000(0000) knlGS:0000000000000000
[  940.026915] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  940.026915] CR2: 00000004032dc940 CR3: 0000000005c25000 CR4: 00000000000006f0
[  940.026915] Stack:
[  940.026915]  ffff88000075cc4e ffffffff81694e90 ffff880000c30b38 0000000000000020
[  940.026915]  11000000523c4bac ffff88000737fdb4 0000000000000000 ffff880000c30800
[  940.026915]  ffff880005d3b580 ffff880000c30b38 ffff8800060e5a40 0000000000000020
[  940.026915] Call Trace:
[  940.026915]  [<ffffffff81356cc3>] ? inet6_csk_xmit+0xa4/0xc4
[  940.026915]  [<ffffffffa0038535>] ? l2tp_xmit_skb+0x503/0x55a [l2tp_core]
[  940.026915]  [<ffffffff812b8d3b>] ? pskb_expand_head+0x161/0x214
[  940.026915]  [<ffffffffa003e91d>] ? pppol2tp_xmit+0xf2/0x143 [l2tp_ppp]
[  940.026915]  [<ffffffffa00292e0>] ? ppp_channel_push+0x36/0x8b [ppp_generic]
[  940.026915]  [<ffffffffa00293fe>] ? ppp_write+0xaf/0xc5 [ppp_generic]
[  940.026915]  [<ffffffff8110ead4>] ? vfs_write+0xa2/0x106
[  940.026915]  [<ffffffff8110edd6>] ? SyS_write+0x56/0x8a
[  940.026915]  [<ffffffff81378ac0>] ? system_call_fastpath+0x16/0x1b
[  940.026915] Code: 00 49 8b 8f d8 00 00 00 66 83 7c 11 02 00 74 60 49
8b 47 58 48 83 e0 fe 48 8b 80 18 01 00 00 48 85 c0 74 13 48 8b 80 78 02
00 00 <48> ff 40 28 41 8b 57 68 48 01 50 30 48 8b 54 24 08 49 c7 c1 51
[  940.026915] RIP  [<ffffffff81333780>] ip6_xmit+0x276/0x326
[  940.026915]  RSP <ffff88000737fd28>
[  940.057945] ---[ end trace be8aba9a61c8b7f3 ]---
[  940.058583] Kernel panic - not syncing: Fatal exception in interrupt

Signed-off-by: François CACHEREUL <f.cachereul@alphalink.fr>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/l2tp/l2tp_core.c | 27 +++++++++++++++++++++++----
 net/l2tp/l2tp_core.h |  3 +++
 2 files changed, 26 insertions(+), 4 deletions(-)

diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
index feae495..aedaa2c 100644
--- a/net/l2tp/l2tp_core.c
+++ b/net/l2tp/l2tp_core.c
@@ -496,6 +496,7 @@ out:
 static inline int l2tp_verify_udp_checksum(struct sock *sk,
 					   struct sk_buff *skb)
 {
+	struct l2tp_tunnel *tunnel = (struct l2tp_tunnel *)sk->sk_user_data;
 	struct udphdr *uh = udp_hdr(skb);
 	u16 ulen = ntohs(uh->len);
 	__wsum psum;
@@ -504,7 +505,7 @@ static inline int l2tp_verify_udp_checksum(struct sock *sk,
 		return 0;
 
 #if IS_ENABLED(CONFIG_IPV6)
-	if (sk->sk_family == PF_INET6) {
+	if (sk->sk_family == PF_INET6 && !tunnel->v4mapped) {
 		if (!uh->check) {
 			LIMIT_NETDEBUG(KERN_INFO "L2TP: IPv6: checksum is 0\n");
 			return 1;
@@ -1128,7 +1129,7 @@ static int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb,
 	/* Queue the packet to IP for output */
 	skb->local_df = 1;
 #if IS_ENABLED(CONFIG_IPV6)
-	if (skb->sk->sk_family == PF_INET6)
+	if (skb->sk->sk_family == PF_INET6 && !tunnel->v4mapped)
 		error = inet6_csk_xmit(skb, NULL);
 	else
 #endif
@@ -1255,7 +1256,7 @@ int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb, int hdr_len
 
 		/* Calculate UDP checksum if configured to do so */
 #if IS_ENABLED(CONFIG_IPV6)
-		if (sk->sk_family == PF_INET6)
+		if (sk->sk_family == PF_INET6 && !tunnel->v4mapped)
 			l2tp_xmit_ipv6_csum(sk, skb, udp_len);
 		else
 #endif
@@ -1704,6 +1705,24 @@ int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32
 	if (cfg != NULL)
 		tunnel->debug = cfg->debug;
 
+#if IS_ENABLED(CONFIG_IPV6)
+	if (sk->sk_family == PF_INET6) {
+		struct ipv6_pinfo *np = inet6_sk(sk);
+
+		if (ipv6_addr_v4mapped(&np->saddr) &&
+		    ipv6_addr_v4mapped(&np->daddr)) {
+			struct inet_sock *inet = inet_sk(sk);
+
+			tunnel->v4mapped = true;
+			inet->inet_saddr = np->saddr.s6_addr32[3];
+			inet->inet_rcv_saddr = np->rcv_saddr.s6_addr32[3];
+			inet->inet_daddr = np->daddr.s6_addr32[3];
+		} else {
+			tunnel->v4mapped = false;
+		}
+	}
+#endif
+
 	/* Mark socket as an encapsulation socket. See net/ipv4/udp.c */
 	tunnel->encap = encap;
 	if (encap == L2TP_ENCAPTYPE_UDP) {
@@ -1712,7 +1731,7 @@ int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32
 		udp_sk(sk)->encap_rcv = l2tp_udp_encap_recv;
 		udp_sk(sk)->encap_destroy = l2tp_udp_encap_destroy;
 #if IS_ENABLED(CONFIG_IPV6)
-		if (sk->sk_family == PF_INET6)
+		if (sk->sk_family == PF_INET6 && !tunnel->v4mapped)
 			udpv6_encap_enable();
 		else
 #endif
diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h
index 66a559b..6f251cb 100644
--- a/net/l2tp/l2tp_core.h
+++ b/net/l2tp/l2tp_core.h
@@ -194,6 +194,9 @@ struct l2tp_tunnel {
 	struct sock		*sock;		/* Parent socket */
 	int			fd;		/* Parent fd, if tunnel socket
 						 * was created by userspace */
+#if IS_ENABLED(CONFIG_IPV6)
+	bool			v4mapped;
+#endif
 
 	struct work_struct	del_work;
 
-- 
1.7.11.7


From 0ec2b01190b1a2ba020241ab89730bf7e7d77b9c Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 8 Oct 2013 15:44:26 -0400
Subject: [PATCH 09/47] l2tp: Fix build warning with ipv6 disabled.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

[ Upstream commit 8d8a51e26a6d415e1470759f2cf5f3ee3ee86196 ]

net/l2tp/l2tp_core.c: In function ‘l2tp_verify_udp_checksum’:
net/l2tp/l2tp_core.c:499:22: warning: unused variable ‘tunnel’ [-Wunused-variable]

Create a helper "l2tp_tunnel()" to facilitate this, and as a side
effect get rid of a bunch of unnecessary void pointer casts.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/l2tp/l2tp_core.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
index aedaa2c..b076e83 100644
--- a/net/l2tp/l2tp_core.c
+++ b/net/l2tp/l2tp_core.c
@@ -115,6 +115,11 @@ struct l2tp_net {
 static void l2tp_session_set_header_len(struct l2tp_session *session, int version);
 static void l2tp_tunnel_free(struct l2tp_tunnel *tunnel);
 
+static inline struct l2tp_tunnel *l2tp_tunnel(struct sock *sk)
+{
+	return sk->sk_user_data;
+}
+
 static inline struct l2tp_net *l2tp_pernet(struct net *net)
 {
 	BUG_ON(!net);
@@ -496,7 +501,6 @@ out:
 static inline int l2tp_verify_udp_checksum(struct sock *sk,
 					   struct sk_buff *skb)
 {
-	struct l2tp_tunnel *tunnel = (struct l2tp_tunnel *)sk->sk_user_data;
 	struct udphdr *uh = udp_hdr(skb);
 	u16 ulen = ntohs(uh->len);
 	__wsum psum;
@@ -505,7 +509,7 @@ static inline int l2tp_verify_udp_checksum(struct sock *sk,
 		return 0;
 
 #if IS_ENABLED(CONFIG_IPV6)
-	if (sk->sk_family == PF_INET6 && !tunnel->v4mapped) {
+	if (sk->sk_family == PF_INET6 && !l2tp_tunnel(sk)->v4mapped) {
 		if (!uh->check) {
 			LIMIT_NETDEBUG(KERN_INFO "L2TP: IPv6: checksum is 0\n");
 			return 1;
@@ -1305,10 +1309,9 @@ EXPORT_SYMBOL_GPL(l2tp_xmit_skb);
  */
 static void l2tp_tunnel_destruct(struct sock *sk)
 {
-	struct l2tp_tunnel *tunnel;
+	struct l2tp_tunnel *tunnel = l2tp_tunnel(sk);
 	struct l2tp_net *pn;
 
-	tunnel = sk->sk_user_data;
 	if (tunnel == NULL)
 		goto end;
 
@@ -1676,7 +1679,7 @@ int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32
 	}
 
 	/* Check if this socket has already been prepped */
-	tunnel = (struct l2tp_tunnel *)sk->sk_user_data;
+	tunnel = l2tp_tunnel(sk);
 	if (tunnel != NULL) {
 		/* This socket has already been prepped */
 		err = -EBUSY;
-- 
1.7.11.7


From 35e64a9e465a85ffacd373439c1caa757e407656 Mon Sep 17 00:00:00 2001
From: Sebastian Hesselbarth <sebastian.hesselbarth@gmail.com>
Date: Wed, 2 Oct 2013 12:57:20 +0200
Subject: [PATCH 10/47] net: mv643xx_eth: update statistics timer from timer
 context only

[ Upstream commit 041b4ddb84989f06ff1df0ca869b950f1ee3cb1c ]

Each port driver installs a periodic timer to update port statistics
by calling mib_counters_update. As mib_counters_update is also called
from non-timer context, we should not reschedule the timer there, but
rather move the rescheduling to timer-only context.

Signed-off-by: Sebastian Hesselbarth <sebastian.hesselbarth@gmail.com>
Acked-by: Jason Cooper <jason@lakedaemon.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/marvell/mv643xx_eth.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/marvell/mv643xx_eth.c b/drivers/net/ethernet/marvell/mv643xx_eth.c
index c35db73..51c138b 100644
--- a/drivers/net/ethernet/marvell/mv643xx_eth.c
+++ b/drivers/net/ethernet/marvell/mv643xx_eth.c
@@ -1131,15 +1131,13 @@ static void mib_counters_update(struct mv643xx_eth_private *mp)
 	p->rx_discard += rdlp(mp, RX_DISCARD_FRAME_CNT);
 	p->rx_overrun += rdlp(mp, RX_OVERRUN_FRAME_CNT);
 	spin_unlock_bh(&mp->mib_counters_lock);
-
-	mod_timer(&mp->mib_counters_timer, jiffies + 30 * HZ);
 }
 
 static void mib_counters_timer_wrapper(unsigned long _mp)
 {
 	struct mv643xx_eth_private *mp = (void *)_mp;
-
 	mib_counters_update(mp);
+	mod_timer(&mp->mib_counters_timer, jiffies + 30 * HZ);
 }
 
 
-- 
1.7.11.7


From b6b20d9c54b23ba35c5807e45ff7d9579503bffa Mon Sep 17 00:00:00 2001
From: Sebastian Hesselbarth <sebastian.hesselbarth@gmail.com>
Date: Wed, 2 Oct 2013 12:57:21 +0200
Subject: [PATCH 11/47] net: mv643xx_eth: fix orphaned statistics timer crash

[ Upstream commit f564412c935111c583b787bcc18157377b208e2e ]

The periodic statistics timer gets started at port _probe() time, but
is only stopped in _stop(). In a modular environment, this can cause
the timer to access already-deallocated memory if the module is
unloaded without the eth device ever having been started. To fix this,
we add the timer right before the port is started, instead of at
_probe() time.

Signed-off-by: Sebastian Hesselbarth <sebastian.hesselbarth@gmail.com>
Acked-by: Jason Cooper <jason@lakedaemon.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/marvell/mv643xx_eth.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/marvell/mv643xx_eth.c b/drivers/net/ethernet/marvell/mv643xx_eth.c
index 51c138b..39334d4 100644
--- a/drivers/net/ethernet/marvell/mv643xx_eth.c
+++ b/drivers/net/ethernet/marvell/mv643xx_eth.c
@@ -2235,6 +2235,7 @@ static int mv643xx_eth_open(struct net_device *dev)
 		mp->int_mask |= INT_TX_END_0 << i;
 	}
 
+	add_timer(&mp->mib_counters_timer);
 	port_start(mp);
 
 	wrlp(mp, INT_MASK_EXT, INT_EXT_LINK_PHY | INT_EXT_TX);
@@ -2914,7 +2915,6 @@ static int mv643xx_eth_probe(struct platform_device *pdev)
 	mp->mib_counters_timer.data = (unsigned long)mp;
 	mp->mib_counters_timer.function = mib_counters_timer_wrapper;
 	mp->mib_counters_timer.expires = jiffies + 30 * HZ;
-	add_timer(&mp->mib_counters_timer);
 
 	spin_lock_init(&mp->mib_counters_lock);
 
-- 
1.7.11.7


From b8baf1c21a214c1b836eef390c9d6e153293fef9 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Thu, 3 Oct 2013 00:27:20 +0300
Subject: [PATCH 12/47] net: heap overflow in __audit_sockaddr()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

[ Upstream commit 1661bf364ae9c506bc8795fef70d1532931be1e8 ]

We need to cap ->msg_namelen or it leads to a buffer overflow when we
do the memcpy() in __audit_sockaddr(). It requires CAP_AUDIT_CONTROL
to exploit this bug.

The call tree is:
___sys_recvmsg()
  move_addr_to_user()
    audit_sockaddr()
      __audit_sockaddr()
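
The overflow itself, sketched (simplified from kernel/auditsc.c):
__audit_sockaddr() stores the address in a buffer of
sizeof(struct sockaddr_storage) bytes and then memcpy()s ->msg_namelen
bytes into it, so the new cap

	if (kmsg->msg_namelen > sizeof(struct sockaddr_storage))
		return -EINVAL;

is applied as soon as the msghdr is copied from userspace.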

Reported-by: Jüri Aedla <juri.aedla@gmail.com>
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/compat.c |  2 ++
 net/socket.c | 24 ++++++++++++++++++++----
 2 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/net/compat.c b/net/compat.c
index f0a1ba6..8903258 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -71,6 +71,8 @@ int get_compat_msghdr(struct msghdr *kmsg, struct compat_msghdr __user *umsg)
 	    __get_user(kmsg->msg_controllen, &umsg->msg_controllen) ||
 	    __get_user(kmsg->msg_flags, &umsg->msg_flags))
 		return -EFAULT;
+	if (kmsg->msg_namelen > sizeof(struct sockaddr_storage))
+		return -EINVAL;
 	kmsg->msg_name = compat_ptr(tmp1);
 	kmsg->msg_iov = compat_ptr(tmp2);
 	kmsg->msg_control = compat_ptr(tmp3);
diff --git a/net/socket.c b/net/socket.c
index b2d7c62..4b94643 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -1973,6 +1973,16 @@ struct used_address {
 	unsigned int name_len;
 };
 
+static int copy_msghdr_from_user(struct msghdr *kmsg,
+				 struct msghdr __user *umsg)
+{
+	if (copy_from_user(kmsg, umsg, sizeof(struct msghdr)))
+		return -EFAULT;
+	if (kmsg->msg_namelen > sizeof(struct sockaddr_storage))
+		return -EINVAL;
+	return 0;
+}
+
 static int ___sys_sendmsg(struct socket *sock, struct msghdr __user *msg,
 			 struct msghdr *msg_sys, unsigned int flags,
 			 struct used_address *used_address)
@@ -1991,8 +2001,11 @@ static int ___sys_sendmsg(struct socket *sock, struct msghdr __user *msg,
 	if (MSG_CMSG_COMPAT & flags) {
 		if (get_compat_msghdr(msg_sys, msg_compat))
 			return -EFAULT;
-	} else if (copy_from_user(msg_sys, msg, sizeof(struct msghdr)))
-		return -EFAULT;
+	} else {
+		err = copy_msghdr_from_user(msg_sys, msg);
+		if (err)
+			return err;
+	}
 
 	if (msg_sys->msg_iovlen > UIO_FASTIOV) {
 		err = -EMSGSIZE;
@@ -2200,8 +2213,11 @@ static int ___sys_recvmsg(struct socket *sock, struct msghdr __user *msg,
 	if (MSG_CMSG_COMPAT & flags) {
 		if (get_compat_msghdr(msg_sys, msg_compat))
 			return -EFAULT;
-	} else if (copy_from_user(msg_sys, msg, sizeof(struct msghdr)))
-		return -EFAULT;
+	} else {
+		err = copy_msghdr_from_user(msg_sys, msg);
+		if (err)
+			return err;
+	}
 
 	if (msg_sys->msg_iovlen > UIO_FASTIOV) {
 		err = -EMSGSIZE;
-- 
1.7.11.7


From 6e24497ef79e18f5b1ddce66712d55093a6cf3e9 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Tue, 22 Oct 2013 10:59:18 -0400
Subject: [PATCH 13/47] sit: amend "allow to use rtnl ops on fb tunnel"

Amend backport to 3.11.y of

 [ Upstream commit 205983c43700ac3a81e7625273a3fa83cd2759b5 ]

The discussion thread in the upstream commit mentions that in
backports to stable-* branches, the line

  - unregister_netdevice_queue(sitn->fb_tunnel_dev, &list);

must be omitted if that branch does not have commit 5e6700b3bf98
("sit: add support of x-netns"). This line has correctly been omitted
in the backport to 3.10, which indeed does not have that commit.

However, it was also removed in the backport to 3.11.y, which does
have that commit.

This causes the following steps to hit a BUG at net/core/dev.c:5039:

  `modprobe sit; rmmod sit`

The BUG output demonstrates that a device is being unregistered twice.
The simple fix is to apply the one line from the upstream commit that
was dropped in the backport to 3.11 (3783100374653e2e7fbdf68c710f5).
This brings the logic in line with the upstream linux, net and net-next
branches.

Signed-off-by: Willem de Bruijn <willemb@google.com>
Acked-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Reviewed-by: Veaceslav Falico <vfalico@redhat.com>
---
 net/ipv6/sit.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 86f639b..a51ad07 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -1708,7 +1708,6 @@ static void __net_exit sit_exit_net(struct net *net)
 
 	rtnl_lock();
 	sit_destroy_tunnels(sitn, &list);
-	unregister_netdevice_queue(sitn->fb_tunnel_dev, &list);
 	unregister_netdevice_many(&list);
 	rtnl_unlock();
 }
-- 
1.7.11.7


From 6c7e3c3382670fe98debedf2ddaff8abf2944bb4 Mon Sep 17 00:00:00 2001
From: Mathias Krause <minipli@googlemail.com>
Date: Mon, 30 Sep 2013 22:03:06 +0200
Subject: [PATCH 14/47] proc connector: fix info leaks

[ Upstream commit e727ca82e0e9616ab4844301e6bae60ca7327682 ]

Initialize event_data for all possible message types to prevent leaking
kernel stack contents to userland (up to 20 bytes). Also set the flags
member of the connector message to 0 to prevent leaking two more stack
bytes this way.
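
The pattern applied throughout the file, sketched here with the exit
event (field names as in cn_proc.c):

	msg = (struct cn_msg *)buffer;
	ev = (struct proc_event *)msg->data;
	/* event_data is a union: zero all of it first, so padding and
	 * the unused union members never reach userland
	 */
	memset(&ev->event_data, 0, sizeof(ev->event_data));
	ev->event_data.exit.process_pid = task->pid;
	msg->flags = 0; /* otherwise two more stack bytes would leak */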

Cc: stable@vger.kernel.org  # v2.6.15+
Signed-off-by: Mathias Krause <minipli@googlemail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/connector/cn_proc.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/drivers/connector/cn_proc.c b/drivers/connector/cn_proc.c
index 08ae128..c73fc2b 100644
--- a/drivers/connector/cn_proc.c
+++ b/drivers/connector/cn_proc.c
@@ -65,6 +65,7 @@ void proc_fork_connector(struct task_struct *task)
 
 	msg = (struct cn_msg *)buffer;
 	ev = (struct proc_event *)msg->data;
+	memset(&ev->event_data, 0, sizeof(ev->event_data));
 	get_seq(&msg->seq, &ev->cpu);
 	ktime_get_ts(&ts); /* get high res monotonic timestamp */
 	put_unaligned(timespec_to_ns(&ts), (__u64 *)&ev->timestamp_ns);
@@ -80,6 +81,7 @@ void proc_fork_connector(struct task_struct *task)
 	memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id));
 	msg->ack = 0; /* not used */
 	msg->len = sizeof(*ev);
+	msg->flags = 0; /* not used */
 	/*  If cn_netlink_send() failed, the data is not sent */
 	cn_netlink_send(msg, CN_IDX_PROC, GFP_KERNEL);
 }
@@ -96,6 +98,7 @@ void proc_exec_connector(struct task_struct *task)
 
 	msg = (struct cn_msg *)buffer;
 	ev = (struct proc_event *)msg->data;
+	memset(&ev->event_data, 0, sizeof(ev->event_data));
 	get_seq(&msg->seq, &ev->cpu);
 	ktime_get_ts(&ts); /* get high res monotonic timestamp */
 	put_unaligned(timespec_to_ns(&ts), (__u64 *)&ev->timestamp_ns);
@@ -106,6 +109,7 @@ void proc_exec_connector(struct task_struct *task)
 	memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id));
 	msg->ack = 0; /* not used */
 	msg->len = sizeof(*ev);
+	msg->flags = 0; /* not used */
 	cn_netlink_send(msg, CN_IDX_PROC, GFP_KERNEL);
 }
 
@@ -122,6 +126,7 @@ void proc_id_connector(struct task_struct *task, int which_id)
 
 	msg = (struct cn_msg *)buffer;
 	ev = (struct proc_event *)msg->data;
+	memset(&ev->event_data, 0, sizeof(ev->event_data));
 	ev->what = which_id;
 	ev->event_data.id.process_pid = task->pid;
 	ev->event_data.id.process_tgid = task->tgid;
@@ -145,6 +150,7 @@ void proc_id_connector(struct task_struct *task, int which_id)
 	memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id));
 	msg->ack = 0; /* not used */
 	msg->len = sizeof(*ev);
+	msg->flags = 0; /* not used */
 	cn_netlink_send(msg, CN_IDX_PROC, GFP_KERNEL);
 }
 
@@ -160,6 +166,7 @@ void proc_sid_connector(struct task_struct *task)
 
 	msg = (struct cn_msg *)buffer;
 	ev = (struct proc_event *)msg->data;
+	memset(&ev->event_data, 0, sizeof(ev->event_data));
 	get_seq(&msg->seq, &ev->cpu);
 	ktime_get_ts(&ts); /* get high res monotonic timestamp */
 	put_unaligned(timespec_to_ns(&ts), (__u64 *)&ev->timestamp_ns);
@@ -170,6 +177,7 @@ void proc_sid_connector(struct task_struct *task)
 	memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id));
 	msg->ack = 0; /* not used */
 	msg->len = sizeof(*ev);
+	msg->flags = 0; /* not used */
 	cn_netlink_send(msg, CN_IDX_PROC, GFP_KERNEL);
 }
 
@@ -185,6 +193,7 @@ void proc_ptrace_connector(struct task_struct *task, int ptrace_id)
 
 	msg = (struct cn_msg *)buffer;
 	ev = (struct proc_event *)msg->data;
+	memset(&ev->event_data, 0, sizeof(ev->event_data));
 	get_seq(&msg->seq, &ev->cpu);
 	ktime_get_ts(&ts); /* get high res monotonic timestamp */
 	put_unaligned(timespec_to_ns(&ts), (__u64 *)&ev->timestamp_ns);
@@ -203,6 +212,7 @@ void proc_ptrace_connector(struct task_struct *task, int ptrace_id)
 	memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id));
 	msg->ack = 0; /* not used */
 	msg->len = sizeof(*ev);
+	msg->flags = 0; /* not used */
 	cn_netlink_send(msg, CN_IDX_PROC, GFP_KERNEL);
 }
 
@@ -218,6 +228,7 @@ void proc_comm_connector(struct task_struct *task)
 
 	msg = (struct cn_msg *)buffer;
 	ev = (struct proc_event *)msg->data;
+	memset(&ev->event_data, 0, sizeof(ev->event_data));
 	get_seq(&msg->seq, &ev->cpu);
 	ktime_get_ts(&ts); /* get high res monotonic timestamp */
 	put_unaligned(timespec_to_ns(&ts), (__u64 *)&ev->timestamp_ns);
@@ -229,6 +240,7 @@ void proc_comm_connector(struct task_struct *task)
 	memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id));
 	msg->ack = 0; /* not used */
 	msg->len = sizeof(*ev);
+	msg->flags = 0; /* not used */
 	cn_netlink_send(msg, CN_IDX_PROC, GFP_KERNEL);
 }
 
@@ -244,6 +256,7 @@ void proc_coredump_connector(struct task_struct *task)
 
 	msg = (struct cn_msg *)buffer;
 	ev = (struct proc_event *)msg->data;
+	memset(&ev->event_data, 0, sizeof(ev->event_data));
 	get_seq(&msg->seq, &ev->cpu);
 	ktime_get_ts(&ts); /* get high res monotonic timestamp */
 	put_unaligned(timespec_to_ns(&ts), (__u64 *)&ev->timestamp_ns);
@@ -254,6 +267,7 @@ void proc_coredump_connector(struct task_struct *task)
 	memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id));
 	msg->ack = 0; /* not used */
 	msg->len = sizeof(*ev);
+	msg->flags = 0; /* not used */
 	cn_netlink_send(msg, CN_IDX_PROC, GFP_KERNEL);
 }
 
@@ -269,6 +283,7 @@ void proc_exit_connector(struct task_struct *task)
 
 	msg = (struct cn_msg *)buffer;
 	ev = (struct proc_event *)msg->data;
+	memset(&ev->event_data, 0, sizeof(ev->event_data));
 	get_seq(&msg->seq, &ev->cpu);
 	ktime_get_ts(&ts); /* get high res monotonic timestamp */
 	put_unaligned(timespec_to_ns(&ts), (__u64 *)&ev->timestamp_ns);
@@ -281,6 +296,7 @@ void proc_exit_connector(struct task_struct *task)
 	memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id));
 	msg->ack = 0; /* not used */
 	msg->len = sizeof(*ev);
+	msg->flags = 0; /* not used */
 	cn_netlink_send(msg, CN_IDX_PROC, GFP_KERNEL);
 }
 
@@ -304,6 +320,7 @@ static void cn_proc_ack(int err, int rcvd_seq, int rcvd_ack)
 
 	msg = (struct cn_msg *)buffer;
 	ev = (struct proc_event *)msg->data;
+	memset(&ev->event_data, 0, sizeof(ev->event_data));
 	msg->seq = rcvd_seq;
 	ktime_get_ts(&ts); /* get high res monotonic timestamp */
 	put_unaligned(timespec_to_ns(&ts), (__u64 *)&ev->timestamp_ns);
@@ -313,6 +330,7 @@ static void cn_proc_ack(int err, int rcvd_seq, int rcvd_ack)
 	memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id));
 	msg->ack = rcvd_ack + 1;
 	msg->len = sizeof(*ev);
+	msg->flags = 0; /* not used */
 	cn_netlink_send(msg, CN_IDX_PROC, GFP_KERNEL);
 }
 
-- 
1.7.11.7


From f3d398e2465b3b74987a3a2fc42ea3e8c83d2166 Mon Sep 17 00:00:00 2001
From: Jiri Benc <jbenc@redhat.com>
Date: Fri, 4 Oct 2013 17:04:48 +0200
Subject: [PATCH 15/47] ipv4: fix ineffective source address selection

[ Upstream commit 0a7e22609067ff524fc7bbd45c6951dd08561667 ]

When sending out multicast messages, the source address in
inet->mc_addr is ignored and rewritten with an autoselected one. This
is caused by a typo in commit 813b3b5db831 ("ipv4: Use caller's
on-stack flowi as-is in output route lookups").

Signed-off-by: Jiri Benc <jbenc@redhat.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index a9a54a2..2de16d9 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2074,7 +2074,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
 							      RT_SCOPE_LINK);
 			goto make_route;
 		}
-		if (fl4->saddr) {
+		if (!fl4->saddr) {
 			if (ipv4_is_multicast(fl4->daddr))
 				fl4->saddr = inet_select_addr(dev_out, 0,
 							      fl4->flowi4_scope);
-- 
1.7.11.7


From 8fd516716afeb4631cf790a2be7ca30d0a664b01 Mon Sep 17 00:00:00 2001
From: Marc Kleine-Budde <mkl@pengutronix.de>
Date: Sat, 5 Oct 2013 21:25:17 +0200
Subject: [PATCH 16/47] can: dev: fix nlmsg size calculation in can_get_size()

[ Upstream commit fe119a05f8ca481623a8d02efcc984332e612528 ]

This patch fixes the calculation of the nlmsg size by adding the
missing nla_total_size() calls.
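
The distinction matters because a netlink attribute costs more than its
payload: nla_total_size() also accounts for the attribute header and
alignment padding. A sketch of the accounting (macros as defined in the
netlink headers):

	/* what the buggy code reserved */
	size += sizeof(struct can_bittiming);
	/* what an attribute actually consumes, i.e. what
	 * nla_total_size(sizeof(struct can_bittiming)) returns
	 */
	size += NLA_ALIGN(NLA_HDRLEN + sizeof(struct can_bittiming));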

Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/can/dev.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/net/can/dev.c b/drivers/net/can/dev.c
index f9cba41..1870c47 100644
--- a/drivers/net/can/dev.c
+++ b/drivers/net/can/dev.c
@@ -705,14 +705,14 @@ static size_t can_get_size(const struct net_device *dev)
 	size_t size;
 
 	size = nla_total_size(sizeof(u32));   /* IFLA_CAN_STATE */
-	size += sizeof(struct can_ctrlmode);  /* IFLA_CAN_CTRLMODE */
+	size += nla_total_size(sizeof(struct can_ctrlmode));  /* IFLA_CAN_CTRLMODE */
 	size += nla_total_size(sizeof(u32));  /* IFLA_CAN_RESTART_MS */
-	size += sizeof(struct can_bittiming); /* IFLA_CAN_BITTIMING */
-	size += sizeof(struct can_clock);     /* IFLA_CAN_CLOCK */
+	size += nla_total_size(sizeof(struct can_bittiming)); /* IFLA_CAN_BITTIMING */
+	size += nla_total_size(sizeof(struct can_clock));     /* IFLA_CAN_CLOCK */
 	if (priv->do_get_berr_counter)        /* IFLA_CAN_BERR_COUNTER */
-		size += sizeof(struct can_berr_counter);
+		size += nla_total_size(sizeof(struct can_berr_counter));
 	if (priv->bittiming_const)	      /* IFLA_CAN_BITTIMING_CONST */
-		size += sizeof(struct can_bittiming_const);
+		size += nla_total_size(sizeof(struct can_bittiming_const));
 
 	return size;
 }
-- 
1.7.11.7


From 1b3231ca7e26084580145c904dd10a60cac35c63 Mon Sep 17 00:00:00 2001
From: Fabio Estevam <fabio.estevam@freescale.com>
Date: Sat, 5 Oct 2013 17:56:59 -0300
Subject: [PATCH 17/47] net: secure_seq: Fix warning when CONFIG_IPV6 and
 CONFIG_INET are not selected

[ Upstream commit cb03db9d0e964568407fb08ea46cc2b6b7f67587 ]

net_secret() is only used when CONFIG_IPV6 or CONFIG_INET is selected.

Building a defconfig with both of these symbols unselected (using the
ARM at91sam9rl_defconfig, for example) leads to the following build
warning:

$ make at91sam9rl_defconfig
#
# configuration written to .config
#

$ make net/core/secure_seq.o
scripts/kconfig/conf --silentoldconfig Kconfig
  CHK     include/config/kernel.release
  CHK     include/generated/uapi/linux/version.h
  CHK     include/generated/utsrelease.h
make[1]: `include/generated/mach-types.h' is up to date.
  CALL    scripts/checksyscalls.sh
  CC      net/core/secure_seq.o
net/core/secure_seq.c:17:13: warning: 'net_secret_init' defined but not used [-Wunused-function]

Fix this warning by protecting the definition of net_secret() with these
symbols.

Reported-by: Olof Johansson <olof@lixom.net>
Signed-off-by: Fabio Estevam <fabio.estevam@freescale.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/secure_seq.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c
index 3f1ec15..8d9d05e 100644
--- a/net/core/secure_seq.c
+++ b/net/core/secure_seq.c
@@ -10,6 +10,7 @@
 
 #include <net/secure_seq.h>
 
+#if IS_ENABLED(CONFIG_IPV6) || IS_ENABLED(CONFIG_INET)
 #define NET_SECRET_SIZE (MD5_MESSAGE_BYTES / 4)
 
 static u32 net_secret[NET_SECRET_SIZE] ____cacheline_aligned;
@@ -29,6 +30,7 @@ static void net_secret_init(void)
 		cmpxchg(&net_secret[--i], 0, tmp);
 	}
 }
+#endif
 
 #ifdef CONFIG_INET
 static u32 seq_scale(u32 seq)
-- 
1.7.11.7


From 538680b534f30fe6531099f87267bb676c935351 Mon Sep 17 00:00:00 2001
From: Paul Durrant <paul.durrant@citrix.com>
Date: Tue, 8 Oct 2013 14:56:44 +0100
Subject: [PATCH 18/47] xen-netback: Don't destroy the netdev until the vif is
 shut down

[ upstream commit id: 279f438e36c0a70b23b86d2090aeec50155034a9 ]

Without this patch, if a frontend cycles through the states Closing
and Closed (which Windows frontends need to do), then the netdev will
be destroyed, and re-invocation of hotplug scripts is required to
restore state before the frontend can move to Connected. Thus, when
udev is not in use, the backend gets stuck in InitWait.

With this patch, the netdev is left alone whilst the backend is
still online and is only de-registered and freed just prior to
destroying the vif (which is also nicely symmetrical with the
netdev allocation and registration being done during probe) so
no re-invocation of hotplug scripts is required.
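
A rough sketch of the resulting lifecycle, using the entry points touched
by this patch (pairing as implied by the hunks below, not literal driver
code):

    xenvif_alloc()       probe: netdev allocated and registered
    xenvif_disconnect()  frontend Closing: carrier off, irqs unbound,
                         rings unmapped; netdev left registered
    xenvif_free()        frontend Closed / backend remove: netdev
                         unregistered and freed, module ref dropped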

Signed-off-by: Paul Durrant <paul.durrant@citrix.com>
Cc: David Vrabel <david.vrabel@citrix.com>
Cc: Wei Liu <wei.liu2@citrix.com>
Cc: Ian Campbell <ian.campbell@citrix.com>
---
 drivers/net/xen-netback/common.h    |  1 +
 drivers/net/xen-netback/interface.c | 23 +++++++++--------------
 drivers/net/xen-netback/xenbus.c    | 17 ++++++++++++-----
 3 files changed, 22 insertions(+), 19 deletions(-)

diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
index 8a4d77e..4d9a5e7 100644
--- a/drivers/net/xen-netback/common.h
+++ b/drivers/net/xen-netback/common.h
@@ -120,6 +120,7 @@ int xenvif_connect(struct xenvif *vif, unsigned long tx_ring_ref,
 		   unsigned long rx_ring_ref, unsigned int tx_evtchn,
 		   unsigned int rx_evtchn);
 void xenvif_disconnect(struct xenvif *vif);
+void xenvif_free(struct xenvif *vif);
 
 void xenvif_get(struct xenvif *vif);
 void xenvif_put(struct xenvif *vif);
diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
index 087d2db..73336c1 100644
--- a/drivers/net/xen-netback/interface.c
+++ b/drivers/net/xen-netback/interface.c
@@ -326,6 +326,9 @@ struct xenvif *xenvif_alloc(struct device *parent, domid_t domid,
 	}
 
 	netdev_dbg(dev, "Successfully created xenvif\n");
+
+	__module_get(THIS_MODULE);
+
 	return vif;
 }
 
@@ -413,12 +416,6 @@ void xenvif_carrier_off(struct xenvif *vif)
 
 void xenvif_disconnect(struct xenvif *vif)
 {
-	/* Disconnect funtion might get called by generic framework
-	 * even before vif connects, so we need to check if we really
-	 * need to do a module_put.
-	 */
-	int need_module_put = 0;
-
 	if (netif_carrier_ok(vif->dev))
 		xenvif_carrier_off(vif);
 
@@ -432,18 +429,16 @@ void xenvif_disconnect(struct xenvif *vif)
 			unbind_from_irqhandler(vif->tx_irq, vif);
 			unbind_from_irqhandler(vif->rx_irq, vif);
 		}
-		/* vif->irq is valid, we had a module_get in
-		 * xenvif_connect.
-		 */
-		need_module_put = 1;
 	}
 
-	unregister_netdev(vif->dev);
-
 	xen_netbk_unmap_frontend_rings(vif);
+}
+
+void xenvif_free(struct xenvif *vif)
+{
+	unregister_netdev(vif->dev);
 
 	free_netdev(vif->dev);
 
-	if (need_module_put)
-		module_put(THIS_MODULE);
+	module_put(THIS_MODULE);
 }
diff --git a/drivers/net/xen-netback/xenbus.c b/drivers/net/xen-netback/xenbus.c
index 1fe48fe3..a53782e 100644
--- a/drivers/net/xen-netback/xenbus.c
+++ b/drivers/net/xen-netback/xenbus.c
@@ -42,7 +42,7 @@ static int netback_remove(struct xenbus_device *dev)
 	if (be->vif) {
 		kobject_uevent(&dev->dev.kobj, KOBJ_OFFLINE);
 		xenbus_rm(XBT_NIL, dev->nodename, "hotplug-status");
-		xenvif_disconnect(be->vif);
+		xenvif_free(be->vif);
 		be->vif = NULL;
 	}
 	kfree(be);
@@ -213,9 +213,18 @@ static void disconnect_backend(struct xenbus_device *dev)
 {
 	struct backend_info *be = dev_get_drvdata(&dev->dev);
 
+	if (be->vif)
+		xenvif_disconnect(be->vif);
+}
+
+static void destroy_backend(struct xenbus_device *dev)
+{
+	struct backend_info *be = dev_get_drvdata(&dev->dev);
+
 	if (be->vif) {
+		kobject_uevent(&dev->dev.kobj, KOBJ_OFFLINE);
 		xenbus_rm(XBT_NIL, dev->nodename, "hotplug-status");
-		xenvif_disconnect(be->vif);
+		xenvif_free(be->vif);
 		be->vif = NULL;
 	}
 }
@@ -246,14 +255,11 @@ static void frontend_changed(struct xenbus_device *dev,
 	case XenbusStateConnected:
 		if (dev->state == XenbusStateConnected)
 			break;
-		backend_create_xenvif(be);
 		if (be->vif)
 			connect(be);
 		break;
 
 	case XenbusStateClosing:
-		if (be->vif)
-			kobject_uevent(&dev->dev.kobj, KOBJ_OFFLINE);
 		disconnect_backend(dev);
 		xenbus_switch_state(dev, XenbusStateClosing);
 		break;
@@ -262,6 +268,7 @@ static void frontend_changed(struct xenbus_device *dev,
 		xenbus_switch_state(dev, XenbusStateClosed);
 		if (xenbus_dev_is_online(dev))
 			break;
+		destroy_backend(dev);
 		/* fall through if not online */
 	case XenbusStateUnknown:
 		device_unregister(&dev->dev);
-- 
1.7.11.7


From 29bb21656d747e62d55b9e1929b23eadcd6be324 Mon Sep 17 00:00:00 2001
From: Amir Vadai <amirv@mellanox.com>
Date: Mon, 7 Oct 2013 13:38:12 +0200
Subject: [PATCH 19/47] net/mlx4_en: Rename name of mlx4_en_rx_alloc members

[ Upstream commit 70fbe0794393829d9acd686428d87c27b6f6984b ]

Add a page prefix to the page-related members, renaming @size and
@offset into @page_size and @page_offset.

CC: Eric Dumazet <edumazet@google.com>
Signed-off-by: Amir Vadai <amirv@mellanox.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx4/en_rx.c   | 40 ++++++++++++++++------------
 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h |  4 +--
 2 files changed, 25 insertions(+), 19 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index dec455c..066fc27 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -70,14 +70,15 @@ static int mlx4_alloc_pages(struct mlx4_en_priv *priv,
 		put_page(page);
 		return -ENOMEM;
 	}
-	page_alloc->size = PAGE_SIZE << order;
+	page_alloc->page_size = PAGE_SIZE << order;
 	page_alloc->page = page;
 	page_alloc->dma = dma;
-	page_alloc->offset = frag_info->frag_align;
+	page_alloc->page_offset = frag_info->frag_align;
 	/* Not doing get_page() for each frag is a big win
 	 * on asymetric workloads.
 	 */
-	atomic_set(&page->_count, page_alloc->size / frag_info->frag_stride);
+	atomic_set(&page->_count,
+		   page_alloc->page_size / frag_info->frag_stride);
 	return 0;
 }
 
@@ -96,16 +97,19 @@ static int mlx4_en_alloc_frags(struct mlx4_en_priv *priv,
 	for (i = 0; i < priv->num_frags; i++) {
 		frag_info = &priv->frag_info[i];
 		page_alloc[i] = ring_alloc[i];
-		page_alloc[i].offset += frag_info->frag_stride;
-		if (page_alloc[i].offset + frag_info->frag_stride <= ring_alloc[i].size)
+		page_alloc[i].page_offset += frag_info->frag_stride;
+
+		if (page_alloc[i].page_offset + frag_info->frag_stride <=
+		    ring_alloc[i].page_size)
 			continue;
+
 		if (mlx4_alloc_pages(priv, &page_alloc[i], frag_info, gfp))
 			goto out;
 	}
 
 	for (i = 0; i < priv->num_frags; i++) {
 		frags[i] = ring_alloc[i];
-		dma = ring_alloc[i].dma + ring_alloc[i].offset;
+		dma = ring_alloc[i].dma + ring_alloc[i].page_offset;
 		ring_alloc[i] = page_alloc[i];
 		rx_desc->data[i].addr = cpu_to_be64(dma);
 	}
@@ -117,7 +121,7 @@ out:
 		frag_info = &priv->frag_info[i];
 		if (page_alloc[i].page != ring_alloc[i].page) {
 			dma_unmap_page(priv->ddev, page_alloc[i].dma,
-				page_alloc[i].size, PCI_DMA_FROMDEVICE);
+				page_alloc[i].page_size, PCI_DMA_FROMDEVICE);
 			page = page_alloc[i].page;
 			atomic_set(&page->_count, 1);
 			put_page(page);
@@ -132,9 +136,10 @@ static void mlx4_en_free_frag(struct mlx4_en_priv *priv,
 {
 	const struct mlx4_en_frag_info *frag_info = &priv->frag_info[i];
 
-	if (frags[i].offset + frag_info->frag_stride > frags[i].size)
-		dma_unmap_page(priv->ddev, frags[i].dma, frags[i].size,
-					 PCI_DMA_FROMDEVICE);
+	if (frags[i].page_offset + frag_info->frag_stride >
+	    frags[i].page_size)
+		dma_unmap_page(priv->ddev, frags[i].dma, frags[i].page_size,
+			       PCI_DMA_FROMDEVICE);
 
 	if (frags[i].page)
 		put_page(frags[i].page);
@@ -161,7 +166,7 @@ out:
 
 		page_alloc = &ring->page_alloc[i];
 		dma_unmap_page(priv->ddev, page_alloc->dma,
-			       page_alloc->size, PCI_DMA_FROMDEVICE);
+			       page_alloc->page_size, PCI_DMA_FROMDEVICE);
 		page = page_alloc->page;
 		atomic_set(&page->_count, 1);
 		put_page(page);
@@ -184,10 +189,11 @@ static void mlx4_en_destroy_allocator(struct mlx4_en_priv *priv,
 		       i, page_count(page_alloc->page));
 
 		dma_unmap_page(priv->ddev, page_alloc->dma,
-				page_alloc->size, PCI_DMA_FROMDEVICE);
-		while (page_alloc->offset + frag_info->frag_stride < page_alloc->size) {
+				page_alloc->page_size, PCI_DMA_FROMDEVICE);
+		while (page_alloc->page_offset + frag_info->frag_stride <
+		       page_alloc->page_size) {
 			put_page(page_alloc->page);
-			page_alloc->offset += frag_info->frag_stride;
+			page_alloc->page_offset += frag_info->frag_stride;
 		}
 		page_alloc->page = NULL;
 	}
@@ -478,7 +484,7 @@ static int mlx4_en_complete_rx_desc(struct mlx4_en_priv *priv,
 		/* Save page reference in skb */
 		__skb_frag_set_page(&skb_frags_rx[nr], frags[nr].page);
 		skb_frag_size_set(&skb_frags_rx[nr], frag_info->frag_size);
-		skb_frags_rx[nr].page_offset = frags[nr].offset;
+		skb_frags_rx[nr].page_offset = frags[nr].page_offset;
 		skb->truesize += frag_info->frag_stride;
 		frags[nr].page = NULL;
 	}
@@ -517,7 +523,7 @@ static struct sk_buff *mlx4_en_rx_skb(struct mlx4_en_priv *priv,
 
 	/* Get pointer to first fragment so we could copy the headers into the
 	 * (linear part of the) skb */
-	va = page_address(frags[0].page) + frags[0].offset;
+	va = page_address(frags[0].page) + frags[0].page_offset;
 
 	if (length <= SMALL_PACKET_SIZE) {
 		/* We are copying all relevant data to the skb - temporarily
@@ -645,7 +651,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
 			dma_sync_single_for_cpu(priv->ddev, dma, sizeof(*ethh),
 						DMA_FROM_DEVICE);
 			ethh = (struct ethhdr *)(page_address(frags[0].page) +
-						 frags[0].offset);
+						 frags[0].page_offset);
 
 			if (is_multicast_ether_addr(ethh->h_dest)) {
 				struct mlx4_mac_entry *entry;
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
index 5e0aa56..bf06e36 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
@@ -237,8 +237,8 @@ struct mlx4_en_tx_desc {
 struct mlx4_en_rx_alloc {
 	struct page	*page;
 	dma_addr_t	dma;
-	u32		offset;
-	u32		size;
+	u32		page_offset;
+	u32		page_size;
 };
 
 struct mlx4_en_tx_ring {
-- 
1.7.11.7


From 4bd2cc99115d31513bfe3c2bd7bcfe67fc081ae8 Mon Sep 17 00:00:00 2001
From: Amir Vadai <amirv@mellanox.com>
Date: Mon, 7 Oct 2013 13:38:13 +0200
Subject: [PATCH 20/47] net/mlx4_en: Fix pages never dma unmapped on rx

[ Upstream commit 021f1107ffdae7a82af6c53f4c52654062e365c6 ]

This patch fixes a bug introduced by commit 51151a16 (mlx4: allow
order-0 memory allocations in RX path).

dma_unmap_page is never reached because the condition to detect the last
fragment in the page is wrong: offset + frag_stride can't be greater than
size. We need to make sure no additional frag will fit in the page, so
compare offset + frag_stride + next_frag_size instead.
next_frag_size is the same as the current one, since a page is shared
only with frags of the same size.
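
A worked example, assuming illustrative values of page_size = 8192 and
frag_stride = 4096: for the last frag in the page, page_offset = 4096.
The old test (4096 + 4096 > 8192) is false, so dma_unmap_page() never
runs and the mapping leaks; the new test (4096 + 2 * 4096 = 12288 > 8192)
is true exactly when no further frag fits in the page.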

CC: Eric Dumazet <edumazet@google.com>
Signed-off-by: Amir Vadai <amirv@mellanox.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx4/en_rx.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index 066fc27..afe2efa 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -135,9 +135,10 @@ static void mlx4_en_free_frag(struct mlx4_en_priv *priv,
 			      int i)
 {
 	const struct mlx4_en_frag_info *frag_info = &priv->frag_info[i];
+	u32 next_frag_end = frags[i].page_offset + 2 * frag_info->frag_stride;
 
-	if (frags[i].page_offset + frag_info->frag_stride >
-	    frags[i].page_size)
+
+	if (next_frag_end > frags[i].page_size)
 		dma_unmap_page(priv->ddev, frags[i].dma, frags[i].page_size,
 			       PCI_DMA_FROMDEVICE);
 
-- 
1.7.11.7


From af64f33fff313187ca01ddb7db09b537a89208dd Mon Sep 17 00:00:00 2001
From: Marc Kleine-Budde <mkl@pengutronix.de>
Date: Mon, 7 Oct 2013 23:19:58 +0200
Subject: [PATCH 21/47] net: vlan: fix nlmsg size calculation in
 vlan_get_size()

[ Upstream commit c33a39c575068c2ea9bffb22fd6de2df19c74b89 ]

This patch fixes the calculation of the nlmsg size by adding the missing
nla_total_size().
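
For reference, nla_total_size(payload) also reserves room for the
attribute header and alignment padding, effectively
NLA_ALIGN(NLA_HDRLEN + payload); for a u32 payload that is
NLA_ALIGN(4 + 4) = 8 bytes, whereas a bare sizeof(u32) would reserve
only 4 and leave the message under-sized.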

Cc: Patrick McHardy <kaber@trash.net>
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/8021q/vlan_netlink.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/8021q/vlan_netlink.c b/net/8021q/vlan_netlink.c
index 3091297..c7e634a 100644
--- a/net/8021q/vlan_netlink.c
+++ b/net/8021q/vlan_netlink.c
@@ -171,7 +171,7 @@ static size_t vlan_get_size(const struct net_device *dev)
 
 	return nla_total_size(2) +	/* IFLA_VLAN_PROTOCOL */
 	       nla_total_size(2) +	/* IFLA_VLAN_ID */
-	       sizeof(struct ifla_vlan_flags) + /* IFLA_VLAN_FLAGS */
+	       nla_total_size(sizeof(struct ifla_vlan_flags)) + /* IFLA_VLAN_FLAGS */
 	       vlan_qos_map_size(vlan->nr_ingress_mappings) +
 	       vlan_qos_map_size(vlan->nr_egress_mappings);
 }
-- 
1.7.11.7


From 74869292aeb07213144e34b0e21e23f7e3c9f61f Mon Sep 17 00:00:00 2001
From: Vlad Yasevich <vyasevic@redhat.com>
Date: Thu, 10 Oct 2013 15:57:59 -0400
Subject: [PATCH 22/47] bridge: update mdb expiration timer upon reports.

[ Upstream commit f144febd93d5ee534fdf23505ab091b2b9088edc ]

commit 9f00b2e7cf241fa389733d41b615efdaa2cb0f5b
	bridge: only expire the mdb entry when query is received
changed the mdb expiration timer to be armed only when QUERY is
received.  Howerver, this causes issues in an environment where
the multicast server socket comes and goes very fast while a client
is trying to send traffic to it.

The root cause is a race where a sequence of LEAVE followed by REPORT
messages can race against QUERY messages generated in response to LEAVE.
The QUERY ends up starting the expiration timer, and that timer can
potentially expire after the new REPORT message has been received,
signaling the new join operation.  This leads to a significant drop in
multicast traffic and a possible complete stall.

The solution is to have REPORT messages update the expiration timer
on entries that already exist.
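
This relies on the usual timer re-arm semantics: mod_timer() updates the
expiry of a pending timer and activates an inactive one, so a single call
per REPORT is enough; a minimal sketch (names as in the hunk below):

    mod_timer(&mp->timer, jiffies + br->multicast_membership_interval);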

CC: Cong Wang <xiyou.wangcong@gmail.com>
CC: Herbert Xu <herbert@gondor.apana.org.au>
CC: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: Vlad Yasevich <vyasevic@redhat.com>
Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_multicast.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index bbcb435..0e3fea7 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -610,6 +610,9 @@ rehash:
 		break;
 
 	default:
+		/* If we have an existing entry, update its expire timer */
+		mod_timer(&mp->timer,
+			  jiffies + br->multicast_membership_interval);
 		goto out;
 	}
 
@@ -679,8 +682,12 @@ static int br_multicast_add_group(struct net_bridge *br,
 	for (pp = &mp->ports;
 	     (p = mlock_dereference(*pp, br)) != NULL;
 	     pp = &p->next) {
-		if (p->port == port)
+		if (p->port == port) {
+			/* We already have a portgroup, update the timer.  */
+			mod_timer(&p->timer,
+				  jiffies + br->multicast_membership_interval);
 			goto out;
+		}
 		if ((unsigned long)p->port < (unsigned long)port)
 			break;
 	}
-- 
1.7.11.7


From d9f02cfe59400677feea276d4b27981f6d91825a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Linus=20L=C3=BCssing?= <linus.luessing@web.de>
Date: Sun, 20 Oct 2013 00:58:57 +0200
Subject: [PATCH 23/47] Revert "bridge: only expire the mdb entry when query
 is received"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

[ Upstream commit 454594f3b93a49ef568cd190c5af31376b105a7b ]

While this commit was a good attempt to fix issues occurring when no
multicast querier is present, it still has two more issues:

1) There are cases where mdb entries do not expire even if there is a
querier present. The bridge will unnecessarily continue flooding
multicast packets on the affected ports.

2) Never removing an mdb entry could be exploited for a Denial of
Service by an attacker on the local link, slowly but steadily eating up
all memory.

Actually, this commit became obsolete with
"bridge: disable snooping if there is no querier" (b00589af3b)
which included fixes for a few more cases.

Therefore reverting the following commits (the commit stated in the
commit message plus three of its follow up fixes):

====================
Revert "bridge: update mdb expiration timer upon reports."
This reverts commit f144febd93d5ee534fdf23505ab091b2b9088edc.
Revert "bridge: do not call setup_timer() multiple times"
This reverts commit 1faabf2aab1fdaa1ace4e8c829d1b9cf7bfec2f1.
Revert "bridge: fix some kernel warning in multicast timer"
This reverts commit c7e8e8a8f7a70b343ca1e0f90a31e35ab2d16de1.
Revert "bridge: only expire the mdb entry when query is received"
This reverts commit 9f00b2e7cf241fa389733d41b615efdaa2cb0f5b.
====================

CC: Cong Wang <amwang@redhat.com>
Signed-off-by: Linus Lüssing <linus.luessing@web.de>
Reviewed-by: Vlad Yasevich <vyasevich@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_mdb.c       |  2 +-
 net/bridge/br_multicast.c | 47 +++++++++++++++++++++++++++--------------------
 net/bridge/br_private.h   |  1 -
 3 files changed, 28 insertions(+), 22 deletions(-)

diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index 6319c43..de3a0e7 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -451,7 +451,7 @@ static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry)
 		call_rcu_bh(&p->rcu, br_multicast_free_pg);
 		err = 0;
 
-		if (!mp->ports && !mp->mglist && mp->timer_armed &&
+		if (!mp->ports && !mp->mglist &&
 		    netif_running(br->dev))
 			mod_timer(&mp->timer, jiffies);
 		break;
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 0e3fea7..fbad619 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -271,7 +271,7 @@ static void br_multicast_del_pg(struct net_bridge *br,
 		del_timer(&p->timer);
 		call_rcu_bh(&p->rcu, br_multicast_free_pg);
 
-		if (!mp->ports && !mp->mglist && mp->timer_armed &&
+		if (!mp->ports && !mp->mglist &&
 		    netif_running(br->dev))
 			mod_timer(&mp->timer, jiffies);
 
@@ -610,9 +610,6 @@ rehash:
 		break;
 
 	default:
-		/* If we have an existing entry, update its expire timer */
-		mod_timer(&mp->timer,
-			  jiffies + br->multicast_membership_interval);
 		goto out;
 	}
 
@@ -622,7 +619,6 @@ rehash:
 
 	mp->br = br;
 	mp->addr = *group;
-
 	setup_timer(&mp->timer, br_multicast_group_expired,
 		    (unsigned long)mp);
 
@@ -662,6 +658,7 @@ static int br_multicast_add_group(struct net_bridge *br,
 	struct net_bridge_mdb_entry *mp;
 	struct net_bridge_port_group *p;
 	struct net_bridge_port_group __rcu **pp;
+	unsigned long now = jiffies;
 	int err;
 
 	spin_lock(&br->multicast_lock);
@@ -676,18 +673,15 @@ static int br_multicast_add_group(struct net_bridge *br,
 
 	if (!port) {
 		mp->mglist = true;
+		mod_timer(&mp->timer, now + br->multicast_membership_interval);
 		goto out;
 	}
 
 	for (pp = &mp->ports;
 	     (p = mlock_dereference(*pp, br)) != NULL;
 	     pp = &p->next) {
-		if (p->port == port) {
-			/* We already have a portgroup, update the timer.  */
-			mod_timer(&p->timer,
-				  jiffies + br->multicast_membership_interval);
-			goto out;
-		}
+		if (p->port == port)
+			goto found;
 		if ((unsigned long)p->port < (unsigned long)port)
 			break;
 	}
@@ -698,6 +692,8 @@ static int br_multicast_add_group(struct net_bridge *br,
 	rcu_assign_pointer(*pp, p);
 	br_mdb_notify(br->dev, port, group, RTM_NEWMDB);
 
+found:
+	mod_timer(&p->timer, now + br->multicast_membership_interval);
 out:
 	err = 0;
 
@@ -1197,9 +1193,6 @@ static int br_ip4_multicast_query(struct net_bridge *br,
 	if (!mp)
 		goto out;
 
-	mod_timer(&mp->timer, now + br->multicast_membership_interval);
-	mp->timer_armed = true;
-
 	max_delay *= br->multicast_last_member_count;
 
 	if (mp->mglist &&
@@ -1276,9 +1269,6 @@ static int br_ip6_multicast_query(struct net_bridge *br,
 	if (!mp)
 		goto out;
 
-	mod_timer(&mp->timer, now + br->multicast_membership_interval);
-	mp->timer_armed = true;
-
 	max_delay *= br->multicast_last_member_count;
 	if (mp->mglist &&
 	    (timer_pending(&mp->timer) ?
@@ -1364,7 +1354,7 @@ static void br_multicast_leave_group(struct net_bridge *br,
 			call_rcu_bh(&p->rcu, br_multicast_free_pg);
 			br_mdb_notify(br->dev, port, group, RTM_DELMDB);
 
-			if (!mp->ports && !mp->mglist && mp->timer_armed &&
+			if (!mp->ports && !mp->mglist &&
 			    netif_running(br->dev))
 				mod_timer(&mp->timer, jiffies);
 		}
@@ -1376,12 +1366,30 @@ static void br_multicast_leave_group(struct net_bridge *br,
 		     br->multicast_last_member_interval;
 
 	if (!port) {
-		if (mp->mglist && mp->timer_armed &&
+		if (mp->mglist &&
 		    (timer_pending(&mp->timer) ?
 		     time_after(mp->timer.expires, time) :
 		     try_to_del_timer_sync(&mp->timer) >= 0)) {
 			mod_timer(&mp->timer, time);
 		}
+
+		goto out;
+	}
+
+	for (p = mlock_dereference(mp->ports, br);
+	     p != NULL;
+	     p = mlock_dereference(p->next, br)) {
+		if (p->port != port)
+			continue;
+
+		if (!hlist_unhashed(&p->mglist) &&
+		    (timer_pending(&p->timer) ?
+		     time_after(p->timer.expires, time) :
+		     try_to_del_timer_sync(&p->timer) >= 0)) {
+			mod_timer(&p->timer, time);
+		}
+
+		break;
 	}
 out:
 	spin_unlock(&br->multicast_lock);
@@ -1798,7 +1806,6 @@ void br_multicast_stop(struct net_bridge *br)
 		hlist_for_each_entry_safe(mp, n, &mdb->mhash[i],
 					  hlist[ver]) {
 			del_timer(&mp->timer);
-			mp->timer_armed = false;
 			call_rcu_bh(&mp->rcu, br_multicast_free_group);
 		}
 	}
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index cde1eb1..aa05bd8 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -126,7 +126,6 @@ struct net_bridge_mdb_entry
 	struct timer_list		timer;
 	struct br_ip			addr;
 	bool				mglist;
-	bool				timer_armed;
 };
 
 struct net_bridge_mdb_htable
-- 
1.7.11.7


From 40420baad983147cd23e6de95c958c96b96be727 Mon Sep 17 00:00:00 2001
From: Christophe Gouault <christophe.gouault@6wind.com>
Date: Tue, 8 Oct 2013 17:21:22 +0200
Subject: [PATCH 24/47] vti: get rid of nf mark rule in prerouting

[ Upstream commit 7263a5187f9e9de45fcb51349cf0e031142c19a1 ]

This patch fixes and improves the use of vti interfaces (while
slightly changing the way they are configured).

Currently:

- it is necessary to identify and mark inbound IPsec
  packets destined to each vti interface, via netfilter rules in
  the mangle table at prerouting hook.

- the vti module cannot retrieve the right tunnel in input since
  commit b9959fd3: vti tunnels all have an i_key, but the tunnel lookup
  is done with flag TUNNEL_NO_KEY, so there is no chance to retrieve them.

- the i_key is used by the outbound processing as a mark to lookup
  for the right SP and SA bundle.

This patch uses the o_key to store the vti mark (instead of i_key) and
enables:

- to avoid the need for previously marking the inbound skbuffs via a
  netfilter rule.
- to properly retrieve the right tunnel in input, only based on the IPsec
  packet outer addresses.
- to properly perform an inbound policy check (using the tunnel o_key
  as a mark).
- to properly perform an outbound SPD and SAD lookup (using the tunnel
  o_key as a mark).
- to keep the current mark of the skbuff. The skbuff mark is neither
  used nor changed by the vti interface. Only the vti interface o_key
  is used.

SAs have a wildcard mark.
SPs have a mark equal to the vti interface o_key.

The vti interface must be created as follows (i_key = 0, o_key = mark):

   ip link add vti1 mode vti local 1.1.1.1 remote 2.2.2.2 okey 1

The SPs attached to vti1 must be created as follows (mark = vti1 o_key):

   ip xfrm policy add dir out mark 1 tmpl src 1.1.1.1 dst 2.2.2.2 \
      proto esp mode tunnel
   ip xfrm policy add dir in  mark 1 tmpl src 2.2.2.2 dst 1.1.1.1 \
      proto esp mode tunnel

The SAs are created with the default wildcard mark. There is no
distinction between global vs. vti SAs. Just their addresses will
possibly link them to a vti interface:

   ip xfrm state add src 1.1.1.1 dst 2.2.2.2 proto esp spi 1000 mode tunnel \
                 enc "cbc(aes)" "azertyuiopqsdfgh"

   ip xfrm state add src 2.2.2.2 dst 1.1.1.1 proto esp spi 2000 mode tunnel \
                 enc "cbc(aes)" "sqbdhgqsdjqjsdfh"

To avoid matching "global" (not vti) SPs in vti interfaces, global SPs
should not use the default wildcard mark, but explicitly match mark 0.

To avoid a double SPD lookup in input and output (in global and vti SPDs),
the NOPOLICY and NOXFRM options should be set on the vti interfaces:

   echo 1 > /proc/sys/net/ipv4/conf/vti1/disable_policy
   echo 1 > /proc/sys/net/ipv4/conf/vti1/disable_xfrm

The outgoing traffic is steered to vti1 by a route via the vti interface:

   ip route add 192.168.0.0/16 dev vti1

The incoming IPsec traffic is steered to vti1 because its outer addresses
match the vti1 tunnel configuration.

Signed-off-by: Christophe Gouault <christophe.gouault@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_vti.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 17cc0ff..0656041 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -285,8 +285,17 @@ static int vti_rcv(struct sk_buff *skb)
 	tunnel = vti_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr);
 	if (tunnel != NULL) {
 		struct pcpu_tstats *tstats;
+		u32 oldmark = skb->mark;
+		int ret;
 
-		if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
+
+		/* temporarily mark the skb with the tunnel o_key, to
+		 * only match policies with this mark.
+		 */
+		skb->mark = be32_to_cpu(tunnel->parms.o_key);
+		ret = xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb);
+		skb->mark = oldmark;
+		if (!ret)
 			return -1;
 
 		tstats = this_cpu_ptr(tunnel->dev->tstats);
@@ -295,7 +304,6 @@ static int vti_rcv(struct sk_buff *skb)
 		tstats->rx_bytes += skb->len;
 		u64_stats_update_end(&tstats->syncp);
 
-		skb->mark = 0;
 		secpath_reset(skb);
 		skb->dev = tunnel->dev;
 		return 1;
@@ -327,7 +335,7 @@ static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 
 	memset(&fl4, 0, sizeof(fl4));
 	flowi4_init_output(&fl4, tunnel->parms.link,
-			   be32_to_cpu(tunnel->parms.i_key), RT_TOS(tos),
+			   be32_to_cpu(tunnel->parms.o_key), RT_TOS(tos),
 			   RT_SCOPE_UNIVERSE,
 			   IPPROTO_IPIP, 0,
 			   dst, tiph->saddr, 0, 0);
-- 
1.7.11.7


From d74d8a563ec79425464d7a8aeaa1796724fea7bc Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 10 Oct 2013 06:30:09 -0700
Subject: [PATCH 25/47] l2tp: must disable bh before calling l2tp_xmit_skb()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

[ Upstream commit 455cc32bf128e114455d11ad919321ab89a2c312 ]

François Cachereul made a very nice bug report and suspected that the
bh_lock_sock() / bh_unlock_sock() pair used in l2tp_xmit_skb() from
process context was not safe.

This problem was added by commit 6af88da14ee284aaad6e4326da09a89191ab6165
("l2tp: Fix locking in l2tp_core.c").

l2tp_eth_dev_xmit() runs from BH context, so we must disable BH
from other l2tp_xmit_skb() users.
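
bh_lock_sock() is a plain spin_lock() on sk->sk_lock.slock, so if a
softirq on the same CPU contends for it while process context holds it,
the CPU locks up (the soft lockups below). A minimal sketch of the
pattern applied to the two process-context callers:

    local_bh_disable();	/* no softirq can preempt us on this CPU now */
    l2tp_xmit_skb(session, skb, session->hdr_len);	/* takes bh_lock_sock() */
    local_bh_enable();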

[  452.060011] BUG: soft lockup - CPU#1 stuck for 23s! [accel-pppd:6662]
[  452.061757] Modules linked in: l2tp_ppp l2tp_netlink l2tp_core pppoe pppox
ppp_generic slhc ipv6 ext3 mbcache jbd virtio_balloon xfs exportfs dm_mod
virtio_blk ata_generic virtio_net floppy ata_piix libata virtio_pci virtio_ring virtio [last unloaded: scsi_wait_scan]
[  452.064012] CPU 1
[  452.080015] BUG: soft lockup - CPU#2 stuck for 23s! [accel-pppd:6643]
[  452.080015] CPU 2
[  452.080015]
[  452.080015] Pid: 6643, comm: accel-pppd Not tainted 3.2.46.mini #1 Bochs Bochs
[  452.080015] RIP: 0010:[<ffffffff81059f6c>]  [<ffffffff81059f6c>] do_raw_spin_lock+0x17/0x1f
[  452.080015] RSP: 0018:ffff88007125fc18  EFLAGS: 00000293
[  452.080015] RAX: 000000000000aba9 RBX: ffffffff811d0703 RCX: 0000000000000000
[  452.080015] RDX: 00000000000000ab RSI: ffff8800711f6896 RDI: ffff8800745c8110
[  452.080015] RBP: ffff88007125fc18 R08: 0000000000000020 R09: 0000000000000000
[  452.080015] R10: 0000000000000000 R11: 0000000000000280 R12: 0000000000000286
[  452.080015] R13: 0000000000000020 R14: 0000000000000240 R15: 0000000000000000
[  452.080015] FS:  00007fdc0cc24700(0000) GS:ffff8800b6f00000(0000) knlGS:0000000000000000
[  452.080015] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  452.080015] CR2: 00007fdb054899b8 CR3: 0000000074404000 CR4: 00000000000006a0
[  452.080015] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  452.080015] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[  452.080015] Process accel-pppd (pid: 6643, threadinfo ffff88007125e000, task ffff8800b27e6dd0)
[  452.080015] Stack:
[  452.080015]  ffff88007125fc28 ffffffff81256559 ffff88007125fc98 ffffffffa01b2bd1
[  452.080015]  ffff88007125fc58 000000000000000c 00000000029490d0 0000009c71dbe25e
[  452.080015]  000000000000005c 000000080000000e 0000000000000000 ffff880071170600
[  452.080015] Call Trace:
[  452.080015]  [<ffffffff81256559>] _raw_spin_lock+0xe/0x10
[  452.080015]  [<ffffffffa01b2bd1>] l2tp_xmit_skb+0x189/0x4ac [l2tp_core]
[  452.080015]  [<ffffffffa01c2d36>] pppol2tp_sendmsg+0x15e/0x19c [l2tp_ppp]
[  452.080015]  [<ffffffff811c7872>] __sock_sendmsg_nosec+0x22/0x24
[  452.080015]  [<ffffffff811c83bd>] sock_sendmsg+0xa1/0xb6
[  452.080015]  [<ffffffff81254e88>] ? __schedule+0x5c1/0x616
[  452.080015]  [<ffffffff8103c7c6>] ? __dequeue_signal+0xb7/0x10c
[  452.080015]  [<ffffffff810bbd21>] ? fget_light+0x75/0x89
[  452.080015]  [<ffffffff811c8444>] ? sockfd_lookup_light+0x20/0x56
[  452.080015]  [<ffffffff811c9b34>] sys_sendto+0x10c/0x13b
[  452.080015]  [<ffffffff8125cac2>] system_call_fastpath+0x16/0x1b
[  452.080015] Code: 81 48 89 e5 72 0c 31 c0 48 81 ff 45 66 25 81 0f 92 c0 5d c3 55 b8 00 01 00 00 48 89 e5 f0 66 0f c1 07 0f b6 d4 38 d0 74 06 f3 90 <8a> 07 eb f6 5d c3 90 90 55 48 89 e5 9c 58 0f 1f 44 00 00 5d c3
[  452.080015] Call Trace:
[  452.080015]  [<ffffffff81256559>] _raw_spin_lock+0xe/0x10
[  452.080015]  [<ffffffffa01b2bd1>] l2tp_xmit_skb+0x189/0x4ac [l2tp_core]
[  452.080015]  [<ffffffffa01c2d36>] pppol2tp_sendmsg+0x15e/0x19c [l2tp_ppp]
[  452.080015]  [<ffffffff811c7872>] __sock_sendmsg_nosec+0x22/0x24
[  452.080015]  [<ffffffff811c83bd>] sock_sendmsg+0xa1/0xb6
[  452.080015]  [<ffffffff81254e88>] ? __schedule+0x5c1/0x616
[  452.080015]  [<ffffffff8103c7c6>] ? __dequeue_signal+0xb7/0x10c
[  452.080015]  [<ffffffff810bbd21>] ? fget_light+0x75/0x89
[  452.080015]  [<ffffffff811c8444>] ? sockfd_lookup_light+0x20/0x56
[  452.080015]  [<ffffffff811c9b34>] sys_sendto+0x10c/0x13b
[  452.080015]  [<ffffffff8125cac2>] system_call_fastpath+0x16/0x1b
[  452.064012]
[  452.064012] Pid: 6662, comm: accel-pppd Not tainted 3.2.46.mini #1 Bochs Bochs
[  452.064012] RIP: 0010:[<ffffffff81059f6e>]  [<ffffffff81059f6e>] do_raw_spin_lock+0x19/0x1f
[  452.064012] RSP: 0018:ffff8800b6e83ba0  EFLAGS: 00000297
[  452.064012] RAX: 000000000000aaa9 RBX: ffff8800b6e83b40 RCX: 0000000000000002
[  452.064012] RDX: 00000000000000aa RSI: 000000000000000a RDI: ffff8800745c8110
[  452.064012] RBP: ffff8800b6e83ba0 R08: 000000000000c802 R09: 000000000000001c
[  452.064012] R10: ffff880071096c4e R11: 0000000000000006 R12: ffff8800b6e83b18
[  452.064012] R13: ffffffff8125d51e R14: ffff8800b6e83ba0 R15: ffff880072a589c0
[  452.064012] FS:  00007fdc0b81e700(0000) GS:ffff8800b6e80000(0000) knlGS:0000000000000000
[  452.064012] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  452.064012] CR2: 0000000000625208 CR3: 0000000074404000 CR4: 00000000000006a0
[  452.064012] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  452.064012] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[  452.064012] Process accel-pppd (pid: 6662, threadinfo ffff88007129a000, task ffff8800744f7410)
[  452.064012] Stack:
[  452.064012]  ffff8800b6e83bb0 ffffffff81256559 ffff8800b6e83bc0 ffffffff8121c64a
[  452.064012]  ffff8800b6e83bf0 ffffffff8121ec7a ffff880072a589c0 ffff880071096c62
[  452.064012]  0000000000000011 ffffffff81430024 ffff8800b6e83c80 ffffffff8121f276
[  452.064012] Call Trace:
[  452.064012]  <IRQ>
[  452.064012]  [<ffffffff81256559>] _raw_spin_lock+0xe/0x10
[  452.064012]  [<ffffffff8121c64a>] spin_lock+0x9/0xb
[  452.064012]  [<ffffffff8121ec7a>] udp_queue_rcv_skb+0x186/0x269
[  452.064012]  [<ffffffff8121f276>] __udp4_lib_rcv+0x297/0x4ae
[  452.064012]  [<ffffffff8121c178>] ? raw_rcv+0xe9/0xf0
[  452.064012]  [<ffffffff8121f4a7>] udp_rcv+0x1a/0x1c
[  452.064012]  [<ffffffff811fe385>] ip_local_deliver_finish+0x12b/0x1a5
[  452.064012]  [<ffffffff811fe54e>] ip_local_deliver+0x53/0x84
[  452.064012]  [<ffffffff811fe1d0>] ip_rcv_finish+0x2bc/0x2f3
[  452.064012]  [<ffffffff811fe78f>] ip_rcv+0x210/0x269
[  452.064012]  [<ffffffff8101911e>] ? kvm_clock_get_cycles+0x9/0xb
[  452.064012]  [<ffffffff811d88cd>] __netif_receive_skb+0x3a5/0x3f7
[  452.064012]  [<ffffffff811d8eba>] netif_receive_skb+0x57/0x5e
[  452.064012]  [<ffffffff811cf30f>] ? __netdev_alloc_skb+0x1f/0x3b
[  452.064012]  [<ffffffffa0049126>] virtnet_poll+0x4ba/0x5a4 [virtio_net]
[  452.064012]  [<ffffffff811d9417>] net_rx_action+0x73/0x184
[  452.064012]  [<ffffffffa01b2cc2>] ? l2tp_xmit_skb+0x27a/0x4ac [l2tp_core]
[  452.064012]  [<ffffffff810343b9>] __do_softirq+0xc3/0x1a8
[  452.064012]  [<ffffffff81013b56>] ? ack_APIC_irq+0x10/0x12
[  452.064012]  [<ffffffff81256559>] ? _raw_spin_lock+0xe/0x10
[  452.064012]  [<ffffffff8125e0ac>] call_softirq+0x1c/0x26
[  452.064012]  [<ffffffff81003587>] do_softirq+0x45/0x82
[  452.064012]  [<ffffffff81034667>] irq_exit+0x42/0x9c
[  452.064012]  [<ffffffff8125e146>] do_IRQ+0x8e/0xa5
[  452.064012]  [<ffffffff8125676e>] common_interrupt+0x6e/0x6e
[  452.064012]  <EOI>
[  452.064012]  [<ffffffff810b82a1>] ? kfree+0x8a/0xa3
[  452.064012]  [<ffffffffa01b2cc2>] ? l2tp_xmit_skb+0x27a/0x4ac [l2tp_core]
[  452.064012]  [<ffffffffa01b2c25>] ? l2tp_xmit_skb+0x1dd/0x4ac [l2tp_core]
[  452.064012]  [<ffffffffa01c2d36>] pppol2tp_sendmsg+0x15e/0x19c [l2tp_ppp]
[  452.064012]  [<ffffffff811c7872>] __sock_sendmsg_nosec+0x22/0x24
[  452.064012]  [<ffffffff811c83bd>] sock_sendmsg+0xa1/0xb6
[  452.064012]  [<ffffffff81254e88>] ? __schedule+0x5c1/0x616
[  452.064012]  [<ffffffff8103c7c6>] ? __dequeue_signal+0xb7/0x10c
[  452.064012]  [<ffffffff810bbd21>] ? fget_light+0x75/0x89
[  452.064012]  [<ffffffff811c8444>] ? sockfd_lookup_light+0x20/0x56
[  452.064012]  [<ffffffff811c9b34>] sys_sendto+0x10c/0x13b
[  452.064012]  [<ffffffff8125cac2>] system_call_fastpath+0x16/0x1b
[  452.064012] Code: 89 e5 72 0c 31 c0 48 81 ff 45 66 25 81 0f 92 c0 5d c3 55 b8 00 01 00 00 48 89 e5 f0 66 0f c1 07 0f b6 d4 38 d0 74 06 f3 90 8a 07 <eb> f6 5d c3 90 90 55 48 89 e5 9c 58 0f 1f 44 00 00 5d c3 55 48
[  452.064012] Call Trace:
[  452.064012]  <IRQ>  [<ffffffff81256559>] _raw_spin_lock+0xe/0x10
[  452.064012]  [<ffffffff8121c64a>] spin_lock+0x9/0xb
[  452.064012]  [<ffffffff8121ec7a>] udp_queue_rcv_skb+0x186/0x269
[  452.064012]  [<ffffffff8121f276>] __udp4_lib_rcv+0x297/0x4ae
[  452.064012]  [<ffffffff8121c178>] ? raw_rcv+0xe9/0xf0
[  452.064012]  [<ffffffff8121f4a7>] udp_rcv+0x1a/0x1c
[  452.064012]  [<ffffffff811fe385>] ip_local_deliver_finish+0x12b/0x1a5
[  452.064012]  [<ffffffff811fe54e>] ip_local_deliver+0x53/0x84
[  452.064012]  [<ffffffff811fe1d0>] ip_rcv_finish+0x2bc/0x2f3
[  452.064012]  [<ffffffff811fe78f>] ip_rcv+0x210/0x269
[  452.064012]  [<ffffffff8101911e>] ? kvm_clock_get_cycles+0x9/0xb
[  452.064012]  [<ffffffff811d88cd>] __netif_receive_skb+0x3a5/0x3f7
[  452.064012]  [<ffffffff811d8eba>] netif_receive_skb+0x57/0x5e
[  452.064012]  [<ffffffff811cf30f>] ? __netdev_alloc_skb+0x1f/0x3b
[  452.064012]  [<ffffffffa0049126>] virtnet_poll+0x4ba/0x5a4 [virtio_net]
[  452.064012]  [<ffffffff811d9417>] net_rx_action+0x73/0x184
[  452.064012]  [<ffffffffa01b2cc2>] ? l2tp_xmit_skb+0x27a/0x4ac [l2tp_core]
[  452.064012]  [<ffffffff810343b9>] __do_softirq+0xc3/0x1a8
[  452.064012]  [<ffffffff81013b56>] ? ack_APIC_irq+0x10/0x12
[  452.064012]  [<ffffffff81256559>] ? _raw_spin_lock+0xe/0x10
[  452.064012]  [<ffffffff8125e0ac>] call_softirq+0x1c/0x26
[  452.064012]  [<ffffffff81003587>] do_softirq+0x45/0x82
[  452.064012]  [<ffffffff81034667>] irq_exit+0x42/0x9c
[  452.064012]  [<ffffffff8125e146>] do_IRQ+0x8e/0xa5
[  452.064012]  [<ffffffff8125676e>] common_interrupt+0x6e/0x6e
[  452.064012]  <EOI>  [<ffffffff810b82a1>] ? kfree+0x8a/0xa3
[  452.064012]  [<ffffffffa01b2cc2>] ? l2tp_xmit_skb+0x27a/0x4ac [l2tp_core]
[  452.064012]  [<ffffffffa01b2c25>] ? l2tp_xmit_skb+0x1dd/0x4ac [l2tp_core]
[  452.064012]  [<ffffffffa01c2d36>] pppol2tp_sendmsg+0x15e/0x19c [l2tp_ppp]
[  452.064012]  [<ffffffff811c7872>] __sock_sendmsg_nosec+0x22/0x24
[  452.064012]  [<ffffffff811c83bd>] sock_sendmsg+0xa1/0xb6
[  452.064012]  [<ffffffff81254e88>] ? __schedule+0x5c1/0x616
[  452.064012]  [<ffffffff8103c7c6>] ? __dequeue_signal+0xb7/0x10c
[  452.064012]  [<ffffffff810bbd21>] ? fget_light+0x75/0x89
[  452.064012]  [<ffffffff811c8444>] ? sockfd_lookup_light+0x20/0x56
[  452.064012]  [<ffffffff811c9b34>] sys_sendto+0x10c/0x13b
[  452.064012]  [<ffffffff8125cac2>] system_call_fastpath+0x16/0x1b

Reported-by: François Cachereul <f.cachereul@alphalink.fr>
Tested-by: François Cachereul <f.cachereul@alphalink.fr>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: James Chapman <jchapman@katalix.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/l2tp/l2tp_ppp.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
index 5ebee2d..8c46b27 100644
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -353,7 +353,9 @@ static int pppol2tp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msgh
 		goto error_put_sess_tun;
 	}
 
+	local_bh_disable();
 	l2tp_xmit_skb(session, skb, session->hdr_len);
+	local_bh_enable();
 
 	sock_put(ps->tunnel_sock);
 	sock_put(sk);
@@ -422,7 +424,9 @@ static int pppol2tp_xmit(struct ppp_channel *chan, struct sk_buff *skb)
 	skb->data[0] = ppph[0];
 	skb->data[1] = ppph[1];
 
+	local_bh_disable();
 	l2tp_xmit_skb(session, skb, session->hdr_len);
+	local_bh_enable();
 
 	sock_put(sk_tun);
 	sock_put(sk);
-- 
1.7.11.7


From 5bf1c228293765ff84e4121cf2f92395403b7e33 Mon Sep 17 00:00:00 2001
From: stephen hemminger <stephen@networkplumber.org>
Date: Sun, 6 Oct 2013 15:15:33 -0700
Subject: [PATCH 26/47] netem: update backlog after drop
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

[ Upstream commit 638a52b801e40ed276ceb69b73579ad99365361a ]

When a packet is dropped from netem's rb-tree, the backlog statistic
should also be updated.
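
The invariant is that sch->qstats.backlog counts the bytes currently
queued: the enqueue path adds qdisc_pkt_len(skb), so every path that
frees a queued skb must subtract it again; a minimal sketch of the drop
side (as in the hunk below):

    sch->qstats.backlog -= qdisc_pkt_len(skb);	/* keep byte count in sync */
    kfree_skb(skb);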

Reported-by: Сергеев Сергей <adron@yapic.net>
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_netem.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 82f6016..7dc79940 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -523,6 +523,7 @@ static unsigned int netem_drop(struct Qdisc *sch)
 			skb->next = NULL;
 			skb->prev = NULL;
 			len = qdisc_pkt_len(skb);
+			sch->qstats.backlog -= len;
 			kfree_skb(skb);
 		}
 	}
-- 
1.7.11.7


From ddc30868db0e31c0c2ab4691131a050f9136f3bf Mon Sep 17 00:00:00 2001
From: stephen hemminger <stephen@networkplumber.org>
Date: Sun, 6 Oct 2013 15:16:49 -0700
Subject: [PATCH 27/47] netem: free skb's in tree on reset
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

[ Upstream commit ff704050f2fc0f3382b5a70bba56a51a3feca79d ]

Netem can leak memory because packets get stored in the red-black
tree, which is not cleared on reset.

Reported-by: Сергеев Сергей <adron@yapic.net>
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_netem.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 7dc79940..3626010 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -358,6 +358,21 @@ static psched_time_t packet_len_2_sched_time(unsigned int len, struct netem_sche
 	return PSCHED_NS2TICKS(ticks);
 }
 
+static void tfifo_reset(struct Qdisc *sch)
+{
+	struct netem_sched_data *q = qdisc_priv(sch);
+	struct rb_node *p;
+
+	while ((p = rb_first(&q->t_root))) {
+		struct sk_buff *skb = netem_rb_to_skb(p);
+
+		rb_erase(p, &q->t_root);
+		skb->next = NULL;
+		skb->prev = NULL;
+		kfree_skb(skb);
+	}
+}
+
 static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
 {
 	struct netem_sched_data *q = qdisc_priv(sch);
@@ -613,6 +628,7 @@ static void netem_reset(struct Qdisc *sch)
 	struct netem_sched_data *q = qdisc_priv(sch);
 
 	qdisc_reset_queue(sch);
+	tfifo_reset(sch);
 	if (q->qdisc)
 		qdisc_reset(q->qdisc);
 	qdisc_watchdog_cancel(&q->watchdog);
-- 
1.7.11.7


From c871c477136615360e283471acdb33df95d70470 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Salva=20Peir=C3=B3?= <speiro@ai2.upv.es>
Date: Fri, 11 Oct 2013 12:50:03 +0300
Subject: [PATCH 28/47] farsync: fix info leak in ioctl

[ Upstream commit 96b340406724d87e4621284ebac5e059d67b2194 ]

The fst_get_iface() code fails to initialize the two padding bytes of
struct sync_serial_settings after the ->loopback member. Add an explicit
memset(0) before filling the structure to avoid the info leak.
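
The two bytes are compiler-inserted trailing padding; the structure is
declared (in linux/hdlc/ioctl.h) along these lines, ending in an
unsigned short, so on common ABIs it is padded from 10 to 12 bytes:

    typedef struct {
        unsigned int   clock_rate;
        unsigned int   clock_type;
        unsigned short loopback;	/* 2 padding bytes follow */
    } sync_serial_settings;

Assigning the three members leaves the padding holding stale kernel
stack data, which the later copy to userland exposes; the memset()
clears it first.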

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/wan/farsync.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/wan/farsync.c b/drivers/net/wan/farsync.c
index 3f0c4f2..bcfff0d 100644
--- a/drivers/net/wan/farsync.c
+++ b/drivers/net/wan/farsync.c
@@ -1972,6 +1972,7 @@ fst_get_iface(struct fst_card_info *card, struct fst_port_info *port,
 	}
 
 	i = port->index;
+	memset(&sync, 0, sizeof(sync));
 	sync.clock_rate = FST_RDL(card, portConfig[i].lineSpeed);
 	/* Lucky card and linux use same encoding here */
 	sync.clock_type = FST_RDB(card, portConfig[i].internalClock) ==
-- 
1.7.11.7


From e69ccba66791d0edd0d596520de268369aaab610 Mon Sep 17 00:00:00 2001
From: Mathias Krause <minipli@googlemail.com>
Date: Mon, 30 Sep 2013 22:05:40 +0200
Subject: [PATCH 29/47] unix_diag: fix info leak

[ Upstream commit 6865d1e834be84ddd5808d93d5035b492346c64a ]

When filling the netlink message we fail to wipe the pad field and
therefore leak one byte of heap memory to userland. Fix this by
setting pad to 0.

Signed-off-by: Mathias Krause <minipli@googlemail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/unix/diag.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/unix/diag.c b/net/unix/diag.c
index d591091..86fa0f3 100644
--- a/net/unix/diag.c
+++ b/net/unix/diag.c
@@ -124,6 +124,7 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, struct unix_diag_r
 	rep->udiag_family = AF_UNIX;
 	rep->udiag_type = sk->sk_type;
 	rep->udiag_state = sk->sk_state;
+	rep->pad = 0;
 	rep->udiag_ino = sk_ino;
 	sock_diag_save_cookie(sk, rep->udiag_cookie);
 
-- 
1.7.11.7


From 00fa721e6873ccbb36fc008558bb7d23e9e3c21f Mon Sep 17 00:00:00 2001
From: Mathias Krause <minipli@googlemail.com>
Date: Mon, 30 Sep 2013 22:03:07 +0200
Subject: [PATCH 30/47] connector: use nlmsg_len() to check message length

[ Upstream commit 162b2bedc084d2d908a04c93383ba02348b648b0 ]

The current code tests that the length of the whole netlink message is
at least large enough to fit a cn_msg. This is wrong, as nlmsg_len
includes the length of the netlink message header. Use nlmsg_len()
instead to fix this "off-by-NLMSG_HDRLEN" size check.

Cc: stable@vger.kernel.org  # v2.6.14+
Signed-off-by: Mathias Krause <minipli@googlemail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/connector/connector.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/connector/connector.c b/drivers/connector/connector.c
index 6ecfa75..0daa11e 100644
--- a/drivers/connector/connector.c
+++ b/drivers/connector/connector.c
@@ -157,17 +157,18 @@ static int cn_call_callback(struct sk_buff *skb)
 static void cn_rx_skb(struct sk_buff *__skb)
 {
 	struct nlmsghdr *nlh;
-	int err;
 	struct sk_buff *skb;
+	int len, err;
 
 	skb = skb_get(__skb);
 
 	if (skb->len >= NLMSG_HDRLEN) {
 		nlh = nlmsg_hdr(skb);
+		len = nlmsg_len(nlh);
 
-		if (nlh->nlmsg_len < sizeof(struct cn_msg) ||
+		if (len < (int)sizeof(struct cn_msg) ||
 		    skb->len < nlh->nlmsg_len ||
-		    nlh->nlmsg_len > CONNECTOR_MAX_MSG_SIZE) {
+		    len > CONNECTOR_MAX_MSG_SIZE) {
 			kfree_skb(skb);
 			return;
 		}
-- 
1.7.11.7


From d99d51100021c9f8b335fc1931880618eaa448e3 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 12 Oct 2013 14:08:34 -0700
Subject: [PATCH 31/47] bnx2x: record rx queue for LRO packets

[ Upstream commit 60e66fee56b2256dcb1dc2ea1b2ddcb6e273857d ]

RPS support is kind of broken on bnx2x, because only non-LRO packets
get proper rx queue information. This triggers reorders, as it seems
bnx2x likes to generate a non-LRO packet for segments including the TCP
PUSH flag (this might be pure coincidence, but all the reorders I've
seen involve segments with a PUSH):

11:13:34.335847 IP A > B: . 415808:447136(31328) ack 1 win 457 <nop,nop,timestamp 3789336 3985797>
11:13:34.335992 IP A > B: . 447136:448560(1424) ack 1 win 457 <nop,nop,timestamp 3789336 3985797>
11:13:34.336391 IP A > B: . 448560:479888(31328) ack 1 win 457 <nop,nop,timestamp 3789337 3985797>
11:13:34.336425 IP A > B: P 511216:512640(1424) ack 1 win 457 <nop,nop,timestamp 3789337 3985798>
11:13:34.336423 IP A > B: . 479888:511216(31328) ack 1 win 457 <nop,nop,timestamp 3789337 3985798>
11:13:34.336924 IP A > B: . 512640:543968(31328) ack 1 win 457 <nop,nop,timestamp 3789337 3985798>
11:13:34.336963 IP A > B: . 543968:575296(31328) ack 1 win 457 <nop,nop,timestamp 3789337 3985798>

We must call skb_record_rx_queue() to properly give the receive queue
information to RPS (and more generally to TX queue selection on the
forward path).
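
For reference, skb_record_rx_queue(skb, rx_queue) stores the queue index
in skb->queue_mapping (offset by one, zero meaning "not recorded"); a
minimal sketch of the ordering, recording before handing the skb to GRO
as in the hunk below:

    skb_record_rx_queue(skb, fp->rx_queue);	/* queue_mapping = rx_queue + 1 */
    napi_gro_receive(&fp->napi, skb);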

Similar fix is needed for skb_mark_napi_id(), but will be handled
in a separate patch to ease stable backports.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Willem de Bruijn <willemb@google.com>
Cc: Eilon Greenstein <eilong@broadcom.com>
Acked-by: Dmitry Kravkov <dmitry@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
index 0cc2611..4b0877e 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
@@ -676,6 +676,7 @@ static void bnx2x_gro_receive(struct bnx2x *bp, struct bnx2x_fastpath *fp,
 		}
 	}
 #endif
+	skb_record_rx_queue(skb, fp->rx_queue);
 	napi_gro_receive(&fp->napi, skb);
 }
 
-- 
1.7.11.7


From 3f1db36c01909701d0e34cd2413a1127e144bcc3 Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Tue, 15 Oct 2013 11:18:58 +0800
Subject: [PATCH 32/47] virtio-net: don't respond to cpu hotplug notifier if
 we're not ready

[ Upstream commit 3ab098df35f8b98b6553edc2e40234af512ba877 ]

We're trying to re-configure the affinity unconditionally in the cpu hotplug
callback. This may lead to issues when resuming from s3/s4 since

- virt queues haven't been allocated at that time.
- it's unnecessary, since the thaw method will re-configure the affinity.

Fix this issue by checking config_enable and doing nothing if we're not ready.

The bug was introduced by commit 8de4b2f3ae90c8fc0f17eeaab87d5a951b66ee17
(virtio-net: reset virtqueue affinity when doing cpu hotplug).

Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Wanlong Gao <gaowanlong@cn.fujitsu.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Wanlong Gao <gaowanlong@cn.fujitsu.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/virtio_net.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 3d2a90a..43a71d9 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -1094,6 +1094,11 @@ static int virtnet_cpu_callback(struct notifier_block *nfb,
 {
 	struct virtnet_info *vi = container_of(nfb, struct virtnet_info, nb);
 
+	mutex_lock(&vi->config_lock);
+
+	if (!vi->config_enable)
+		goto done;
+
 	switch(action & ~CPU_TASKS_FROZEN) {
 	case CPU_ONLINE:
 	case CPU_DOWN_FAILED:
@@ -1106,6 +1111,9 @@ static int virtnet_cpu_callback(struct notifier_block *nfb,
 	default:
 		break;
 	}
+
+done:
+	mutex_unlock(&vi->config_lock);
 	return NOTIFY_OK;
 }
 
-- 
1.7.11.7


From 24ef3b7cfd16ce5ac263deebfecb661d1c784670 Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Tue, 15 Oct 2013 11:18:59 +0800
Subject: [PATCH 33/47] virtio-net: refill only when device is up during
 setting queues

[ Upstream commit 35ed159bfd96a7547ec277ed8b550c7cbd9841b6 ]

We used to schedule the refill work unconditionally after changing the
number of queues. This may lead to an issue if the device is not up.
Since we only try to cancel the work in ndo_stop(), the refill work may
still run after the device has been removed. Fix this by only scheduling
the work when the device is up.

The bug was introduced by commit 9b9cd8024a2882e896c65222aa421d461354e3f2
(virtio-net: fix the race between channels setting and refill).

Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/virtio_net.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 43a71d9..1d01534 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -916,7 +916,9 @@ static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
 		return -EINVAL;
 	} else {
 		vi->curr_queue_pairs = queue_pairs;
-		schedule_delayed_work(&vi->refill, 0);
+		/* virtnet_open() will refill when device is going to up. */
+		if (dev->flags & IFF_UP)
+			schedule_delayed_work(&vi->refill, 0);
 	}
 
 	return 0;
@@ -1714,7 +1716,9 @@ static int virtnet_restore(struct virtio_device *vdev)
 	vi->config_enable = true;
 	mutex_unlock(&vi->config_lock);
 
+	rtnl_lock();
 	virtnet_set_queues(vi, vi->curr_queue_pairs);
+	rtnl_unlock();
 
 	return 0;
 }
-- 
1.7.11.7


From d616bd8bf902f82ea742462a29bf4080aaa8f497 Mon Sep 17 00:00:00 2001
From: Vlad Yasevich <vyasevic@redhat.com>
Date: Tue, 15 Oct 2013 14:57:45 -0400
Subject: [PATCH 34/47] bridge: Correctly clamp MAX forward_delay when
 enabling STP

[ Upstream commit 4b6c7879d84ad06a2ac5b964808ed599187a188d ]

Commit be4f154d5ef0ca147ab6bcd38857a774133f5450
	bridge: Clamp forward_delay when enabling STP
had a typo when attempting to clamp maximum forward delay.

It is possible to set bridge_forward_delay higher than the
permitted maximum when STP is off.  When turning STP on, the
higher-than-allowed delay has to be clamped down to the max value.
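
The intended logic is a plain clamp; equivalently, a sketch using the
kernel's clamp() helper (not what the patch does, which keeps the
original if/else structure):

    __br_set_forward_delay(br, clamp(br->bridge_forward_delay,
                                     (unsigned long)BR_MIN_FORWARD_DELAY,
                                     (unsigned long)BR_MAX_FORWARD_DELAY));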

CC: Herbert Xu <herbert@gondor.apana.org.au>
CC: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: Vlad Yasevich <vyasevic@redhat.com>
Reviewed-by: Veaceslav Falico <vfalico@redhat.com>
Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_stp_if.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c
index 108084a..656a6f3 100644
--- a/net/bridge/br_stp_if.c
+++ b/net/bridge/br_stp_if.c
@@ -134,7 +134,7 @@ static void br_stp_start(struct net_bridge *br)
 
 	if (br->bridge_forward_delay < BR_MIN_FORWARD_DELAY)
 		__br_set_forward_delay(br, BR_MIN_FORWARD_DELAY);
-	else if (br->bridge_forward_delay < BR_MAX_FORWARD_DELAY)
+	else if (br->bridge_forward_delay > BR_MAX_FORWARD_DELAY)
 		__br_set_forward_delay(br, BR_MAX_FORWARD_DELAY);
 
 	if (r == 0) {
-- 
1.7.11.7


From 803490b7c577add0b976aa08e4bbfdd95f505270 Mon Sep 17 00:00:00 2001
From: Vlad Yasevich <vyasevich@gmail.com>
Date: Tue, 15 Oct 2013 22:01:29 -0400
Subject: [PATCH 35/47] net: dst: provide accessor function to dst->xfrm

[ Upstream commit e87b3998d795123b4139bc3f25490dd236f68212 ]

dst->xfrm is conditionally defined.  Provide an accessor function that
is always available.

Signed-off-by: Vlad Yasevich <vyasevich@gmail.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dst.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/include/net/dst.h b/include/net/dst.h
index 1f8fd10..e0c97f5 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -477,10 +477,22 @@ static inline struct dst_entry *xfrm_lookup(struct net *net,
 {
 	return dst_orig;
 } 
+
+static inline struct xfrm_state *dst_xfrm(const struct dst_entry *dst)
+{
+	return NULL;
+}
+
 #else
 extern struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig,
 				     const struct flowi *fl, struct sock *sk,
 				     int flags);
+
+/* skb attached with this dst needs transformation if dst->xfrm is valid */
+static inline struct xfrm_state *dst_xfrm(const struct dst_entry *dst)
+{
+	return dst->xfrm;
+}
 #endif
 
 #endif /* _NET_DST_H */
-- 
1.7.11.7


From 371a65903ccb75fc71fd42b30a310a28c42e54a3 Mon Sep 17 00:00:00 2001
From: Fan Du <fan.du@windriver.com>
Date: Tue, 15 Oct 2013 22:01:30 -0400
Subject: [PATCH 36/47] sctp: Use software crc32 checksum when xfrm transform
 will happen.

[ Upstream commit 27127a82561a2a3ed955ce207048e1b066a80a2a ]

igb/ixgbe have hardware sctp checksum support. When this feature is enabled
and IPsec is also armed to protect sctp traffic, ugly things happen, as
xfrm_output checks CHECKSUM_PARTIAL and does the checksum operation itself
(summing everything up and packing the 16-bit result into the checksum
field). The result is failed establishment of sctp communication.

Cc: Neil Horman <nhorman@tuxdriver.com>
Cc: Steffen Klassert <steffen.klassert@secunet.com>
Signed-off-by: Fan Du <fan.du@windriver.com>
Signed-off-by: Vlad Yasevich <vyasevich@gmail.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/output.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/sctp/output.c b/net/sctp/output.c
index a46d1eb..a06a9b6 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -542,7 +542,8 @@ int sctp_packet_transmit(struct sctp_packet *packet)
 	 * by CRC32-C as described in <draft-ietf-tsvwg-sctpcsum-02.txt>.
 	 */
 	if (!sctp_checksum_disable) {
-		if (!(dst->dev->features & NETIF_F_SCTP_CSUM)) {
+		if (!(dst->dev->features & NETIF_F_SCTP_CSUM) ||
+		    (dst_xfrm(dst) != NULL)) {
 			__u32 crc32 = sctp_start_cksum((__u8 *)sh, cksum_buf_len);
 
 			/* 3) Put the resultant value into the checksum field in the
-- 
1.7.11.7


From 9067790bb296fb5818894222d7e85407238e9843 Mon Sep 17 00:00:00 2001
From: Vlad Yasevich <vyasevich@gmail.com>
Date: Tue, 15 Oct 2013 22:01:31 -0400
Subject: [PATCH 37/47] sctp: Perform software checksum if packet has to be
 fragmented.

[ Upstream commit d2dbbba77e95dff4b4f901fee236fef6d9552072 ]

IP/IPv6 fragmentation knows how to compute only the TCP/UDP checksum.
This causes problems if SCTP packets have to be fragmented and
ip_summed has been set to PARTIAL due to checksum offload support.
This condition can happen when retransmitting after MTU discovery,
or when INIT or other control chunks are larger than the MTU.
Check for the rare fragmentation condition in SCTP and use software
checksum calculation in this case.

CC: Fan Du <fan.du@windriver.com>
Signed-off-by: Vlad Yasevich <vyasevich@gmail.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/output.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/sctp/output.c b/net/sctp/output.c
index a06a9b6..013a07d 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -543,7 +543,7 @@ int sctp_packet_transmit(struct sctp_packet *packet)
 	 */
 	if (!sctp_checksum_disable) {
 		if (!(dst->dev->features & NETIF_F_SCTP_CSUM) ||
-		    (dst_xfrm(dst) != NULL)) {
+		    (dst_xfrm(dst) != NULL) || packet->ipfragok) {
 			__u32 crc32 = sctp_start_cksum((__u8 *)sh, cksum_buf_len);
 
 			/* 3) Put the resultant value into the checksum field in the
-- 
1.7.11.7
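
Taken together with the previous patch, the cumulative software-checksum
condition in net/sctp/output.c now reads, in sketch form:

/* Fall back to software CRC32-C when offload is absent, an IPsec
 * transform is pending, or the packet will be IP-fragmented. */
if (!(dst->dev->features & NETIF_F_SCTP_CSUM) ||
    (dst_xfrm(dst) != NULL) || packet->ipfragok) {
	/* ... software CRC32-C as above ... */
}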


From 22e825ed8144360271614511563166f37fef9f90 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Salva=20Peir=C3=B3?= <speiro@ai2.upv.es>
Date: Wed, 16 Oct 2013 12:46:50 +0200
Subject: [PATCH 38/47] wanxl: fix info leak in ioctl
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

[ Upstream commit 2b13d06c9584b4eb773f1e80bbaedab9a1c344e1 ]

The wanxl_ioctl() code fails to initialize the two padding bytes of
struct sync_serial_settings after the ->loopback member. Add an explicit
memset(0) before filling the structure to avoid the info leak.

Signed-off-by: Salva Peiró <speiro@ai2.upv.es>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/wan/wanxl.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/wan/wanxl.c b/drivers/net/wan/wanxl.c
index 6a24a5a..4c0a697 100644
--- a/drivers/net/wan/wanxl.c
+++ b/drivers/net/wan/wanxl.c
@@ -355,6 +355,7 @@ static int wanxl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 			ifr->ifr_settings.size = size; /* data size wanted */
 			return -ENOBUFS;
 		}
+		memset(&line, 0, sizeof(line));
 		line.clock_type = get_status(port)->clocking;
 		line.clock_rate = 0;
 		line.loopback = 0;
-- 
1.7.11.7
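
The fix follows the standard pattern for kernel stack info leaks; a
hedged sketch of the resulting code shape (the copy-out destination is
assumed from the usual IF_GET_IFACE ioctl handling in this driver):

struct sync_serial_settings line;

/* Named-member assignments alone leave the structure's padding
 * bytes holding stale kernel stack contents, which copy_to_user()
 * would then leak to userspace. */
memset(&line, 0, sizeof(line));
line.clock_type = get_status(port)->clocking;
line.clock_rate = 0;
line.loopback = 0;

if (copy_to_user(ifr->ifr_settings.ifs_ifsu.sync, &line, sizeof(line)))
	return -EFAULT;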


From b16dd2cff7a4eb3881f43371d71ed242332877dc Mon Sep 17 00:00:00 2001
From: Vasundhara Volam <vasundhara.volam@emulex.com>
Date: Thu, 17 Oct 2013 11:47:14 +0530
Subject: [PATCH 39/47] be2net: pass if_id for v1 and V2 versions of TX_CREATE
 cmd

[ Upstream commit 0fb88d61bc60779dde88b0fc268da17eb81d0412 ]

if_id is a required field for all TX_CREATE cmd versions > 0.
This fixes a driver initialization failure caused by recent SH-R firmware
(versions > 10.0.639.0) failing the TX_CREATE cmd when the if_id field is
not passed.

Signed-off-by: Sathya Perla <sathya.perla@emulex.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/emulex/benet/be_cmds.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/emulex/benet/be_cmds.c b/drivers/net/ethernet/emulex/benet/be_cmds.c
index 8ec5d74..13ac104 100644
--- a/drivers/net/ethernet/emulex/benet/be_cmds.c
+++ b/drivers/net/ethernet/emulex/benet/be_cmds.c
@@ -1150,7 +1150,6 @@ int be_cmd_txq_create(struct be_adapter *adapter, struct be_tx_obj *txo)
 
 	if (lancer_chip(adapter)) {
 		req->hdr.version = 1;
-		req->if_id = cpu_to_le16(adapter->if_handle);
 	} else if (BEx_chip(adapter)) {
 		if (adapter->function_caps & BE_FUNCTION_CAPS_SUPER_NIC)
 			req->hdr.version = 2;
@@ -1158,6 +1157,8 @@ int be_cmd_txq_create(struct be_adapter *adapter, struct be_tx_obj *txo)
 		req->hdr.version = 2;
 	}
 
+	if (req->hdr.version > 0)
+		req->if_id = cpu_to_le16(adapter->if_handle);
 	req->num_pages = PAGES_4K_SPANNED(q_mem->va, q_mem->size);
 	req->ulp_num = BE_ULP1_NUM;
 	req->type = BE_ETH_TX_RING_TYPE_STANDARD;
-- 
1.7.11.7


From 9829aac8208e7a31e4e42e7d2e7e165593c05202 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dborkman@redhat.com>
Date: Thu, 17 Oct 2013 22:51:31 +0200
Subject: [PATCH 40/47] net: unix: inherit SOCK_PASS{CRED, SEC} flags from
 socket to fix race

[ Upstream commit 90c6bd34f884cd9cee21f1d152baf6c18bcac949 ]

In the case of credentials passing in unix stream sockets (dgram
sockets seem not affected), we get a rather rare race after
commit 16e5726 ("af_unix: dont send SCM_CREDENTIALS by default").

We have a stream server on the receiver side that requests credential
passing from senders (e.g. nc -U). Since we need to set SO_PASSCRED to
1 on each spawned/accepted socket on the server side first (as it is
not inherited), it can happen that we get interrupted in the time
between accept() and setsockopt(), the sender is scheduled and
continues passing data to our receiver. At that time SO_PASSCRED is
set on neither the sender nor the receiver side, hence in cmsg's
SCM_CREDENTIALS we eventually get pid:0, uid:65534, gid:65534
(== overflow{u,g}id) instead of what we would actually like to see.

On the sender side, here nc -U, the tests in maybe_add_creds()
invoked through unix_stream_sendmsg() would fail, as at that exact
time, as mentioned, the sender neither has SO_PASSCRED on its side
nor sees it on the server side, and we have a valid 'other' socket
in place. Thus, the sender believes this just looks like a normal
connection, neither needing nor requesting SO_PASSCRED at that time.

As reverting 16e5726 is not an option due to the significant
performance regression reported when creds are always passed, one
trade-off to prevent the race is to set SO_PASSCRED on the listener
socket and have the spawned socket inherit these flags on the server
side in accept(). It also seems logical to do so if we tell the
listener socket to pass those flags onwards, and this fixes the race.

Before, strace:

recvmsg(4, {msg_name(0)=NULL, msg_iov(1)=[{"blub\n", 4096}],
        msg_controllen=32, {cmsg_len=28, cmsg_level=SOL_SOCKET,
        cmsg_type=SCM_CREDENTIALS{pid=0, uid=65534, gid=65534}},
        msg_flags=0}, 0) = 5

After, strace:

recvmsg(4, {msg_name(0)=NULL, msg_iov(1)=[{"blub\n", 4096}],
        msg_controllen=32, {cmsg_len=28, cmsg_level=SOL_SOCKET,
        cmsg_type=SCM_CREDENTIALS{pid=11580, uid=1000, gid=1000}},
        msg_flags=0}, 0) = 5

Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/unix/af_unix.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index c4ce243..e64bbcf 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1246,6 +1246,15 @@ static int unix_socketpair(struct socket *socka, struct socket *sockb)
 	return 0;
 }
 
+static void unix_sock_inherit_flags(const struct socket *old,
+				    struct socket *new)
+{
+	if (test_bit(SOCK_PASSCRED, &old->flags))
+		set_bit(SOCK_PASSCRED, &new->flags);
+	if (test_bit(SOCK_PASSSEC, &old->flags))
+		set_bit(SOCK_PASSSEC, &new->flags);
+}
+
 static int unix_accept(struct socket *sock, struct socket *newsock, int flags)
 {
 	struct sock *sk = sock->sk;
@@ -1280,6 +1289,7 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags)
 	/* attach accepted sock to socket */
 	unix_state_lock(tsk);
 	newsock->state = SS_CONNECTED;
+	unix_sock_inherit_flags(sock, newsock);
 	sock_graft(tsk, newsock);
 	unix_state_unlock(tsk);
 	return 0;
-- 
1.7.11.7
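
A hedged userspace sketch of the server-side pattern the commit message
describes; the helper name and socket setup are illustrative, and error
handling is omitted:

#include <string.h>
#include <sys/socket.h>
#include <sys/un.h>

static int make_passcred_listener(const char *path)
{
	struct sockaddr_un sun;
	int one = 1;
	int lfd = socket(AF_UNIX, SOCK_STREAM, 0);

	memset(&sun, 0, sizeof(sun));
	sun.sun_family = AF_UNIX;
	strncpy(sun.sun_path, path, sizeof(sun.sun_path) - 1);
	bind(lfd, (struct sockaddr *)&sun, sizeof(sun));
	listen(lfd, 16);

	/* With this patch, SO_PASSCRED set on the listener is inherited
	 * by every accept()ed socket, closing the accept()/setsockopt()
	 * window described above. */
	setsockopt(lfd, SOL_SOCKET, SO_PASSCRED, &one, sizeof(one));
	return lfd;
}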


From 7b48750febb4c3387db39fd0b547936c53ba7364 Mon Sep 17 00:00:00 2001
From: Seif Mazareeb <seif@marvell.com>
Date: Thu, 17 Oct 2013 20:33:21 -0700
Subject: [PATCH 41/47] net: fix cipso packet validation when !NETLABEL

[ Upstream commit f2e5ddcc0d12f9c4c7b254358ad245c9dddce13b ]

When CONFIG_NETLABEL is disabled, the cipso_v4_validate() function can loop
forever in its main loop if opt[opt_iter + 1] == 0. This causes a kernel
crash on an SMP system, since the CPU executing this function stalls and
does not respond to IPIs.

This problem can be reproduced by running the IP Stack Integrity Checker
(http://isic.sourceforge.net) using the following command on a Linux machine
connected to the DUT:

"icmpsic -s rand -d <DUT IP address> -r 123456"
then wait 1-2 min.

Signed-off-by: Seif Mazareeb <seif@marvell.com>
Acked-by: Paul Moore <paul@paul-moore.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/cipso_ipv4.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/include/net/cipso_ipv4.h b/include/net/cipso_ipv4.h
index a7a683e..a8c2ef6 100644
--- a/include/net/cipso_ipv4.h
+++ b/include/net/cipso_ipv4.h
@@ -290,6 +290,7 @@ static inline int cipso_v4_validate(const struct sk_buff *skb,
 	unsigned char err_offset = 0;
 	u8 opt_len = opt[1];
 	u8 opt_iter;
+	u8 tag_len;
 
 	if (opt_len < 8) {
 		err_offset = 1;
@@ -302,11 +303,12 @@ static inline int cipso_v4_validate(const struct sk_buff *skb,
 	}
 
 	for (opt_iter = 6; opt_iter < opt_len;) {
-		if (opt[opt_iter + 1] > (opt_len - opt_iter)) {
+		tag_len = opt[opt_iter + 1];
+		if ((tag_len == 0) || (opt[opt_iter + 1] > (opt_len - opt_iter))) {
 			err_offset = opt_iter + 1;
 			goto out;
 		}
-		opt_iter += opt[opt_iter + 1];
+		opt_iter += tag_len;
 	}
 
 out:
-- 
1.7.11.7
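
A concrete (illustrative) byte layout that triggered the hang:

/* Minimal malformed CIPSO option for the old loop:
 * opt[0] = 134 (CIPSO option type), opt[1] = 10 (opt_len),
 * opt[2..5] = DOI, opt[6] = tag type, opt[7] = 0 (tag length!).
 * Old code: opt_iter starts at 6 and opt_iter += opt[7] adds 0,
 * so opt_iter never advances. New code: tag_len == 0 sets
 * err_offset = 7 and bails out. */
unsigned char opt[10] = { 134, 10, 0, 0, 0, 1, 1, 0, 0, 0 };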


From 27e33640a8905b1aeefe9998242551caf24e84a6 Mon Sep 17 00:00:00 2001
From: Hannes Frederic Sowa <hannes@stressinduktion.org>
Date: Tue, 22 Oct 2013 00:07:47 +0200
Subject: [PATCH 42/47] inet: fix possible memory corruption with UDP_CORK and
 UFO

[ This is a simplified -stable version of a set of upstream commits. ]

This is a replacement patch for -stable only; it fixes the problems
handled by the following two commits in -net:

"ip_output: do skb ufo init for peeked non ufo skb as well" (e93b7d748be887cd7639b113ba7d7ef792a7efb9)
"ip6_output: do skb ufo init for peeked non ufo skb as well" (c547dbf55d5f8cf615ccc0e7265e98db27d3fb8b)

Three frames are written on a corked udp socket for which the output
netdevice has UFO enabled. If the first and third frames are smaller
than the mtu and the second one is bigger, we enqueue the second frame
with skb_append_datato_frags without initializing the gso fields. This
leads to the third frame being appended regularly and thus constructing
an invalid skb.

This fixes the problem by always using skb_append_datato_frags as soon
as the first frag has been enqueued to the skb, without marking the
packet as SKB_GSO_UDP.

The problem with only two frames for ipv6 was fixed by "ipv6: udp
packets following an UFO enqueued packet need also be handled by UFO"
(2811ebac2521ceac84f2bdae402455baa6a7fb47).

Cc: Jiri Pirko <jiri@resnulli.us>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: David Miller <davem@davemloft.net>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
---
 include/linux/skbuff.h | 5 +++++
 net/ipv4/ip_output.c   | 2 +-
 net/ipv6/ip6_output.c  | 2 +-
 3 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 3b71a4e..6bd165b 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1316,6 +1316,11 @@ static inline int skb_pagelen(const struct sk_buff *skb)
 	return len + skb_headlen(skb);
 }
 
+static inline bool skb_has_frags(const struct sk_buff *skb)
+{
+	return skb_shinfo(skb)->nr_frags;
+}
+
 /**
  * __skb_fill_page_desc - initialise a paged fragment in an skb
  * @skb: buffer containing fragment to be initialised
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index a04d872..7f4ab5d 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -836,7 +836,7 @@ static int __ip_append_data(struct sock *sk,
 		csummode = CHECKSUM_PARTIAL;
 
 	cork->length += length;
-	if (((length > mtu) || (skb && skb_is_gso(skb))) &&
+	if (((length > mtu) || (skb && skb_has_frags(skb))) &&
 	    (sk->sk_protocol == IPPROTO_UDP) &&
 	    (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len) {
 		err = ip_ufo_append_data(sk, queue, getfrag, from, length,
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 44df1c9..2e542d0 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1252,7 +1252,7 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
 	skb = skb_peek_tail(&sk->sk_write_queue);
 	cork->length += length;
 	if (((length > mtu) ||
-	     (skb && skb_is_gso(skb))) &&
+	     (skb && skb_has_frags(skb))) &&
 	    (sk->sk_protocol == IPPROTO_UDP) &&
 	    (rt->dst.dev->features & NETIF_F_UFO)) {
 		err = ip6_ufo_append_data(sk, getfrag, from, length,
-- 
1.7.11.7
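
A hedged userspace sketch of the corked three-write sequence from the
commit message; sizes assume a 1500-byte MTU path with UFO enabled on
the output device, and error handling is omitted:

#include <netinet/in.h>
#include <netinet/udp.h>
#include <string.h>
#include <sys/socket.h>

static void corked_three_writes(int fd) /* fd: connected UDP socket */
{
	char small[256], big[4000];
	int on = 1, off = 0;

	memset(small, 'a', sizeof(small));
	memset(big, 'b', sizeof(big));

	setsockopt(fd, IPPROTO_UDP, UDP_CORK, &on, sizeof(on));
	send(fd, small, sizeof(small), 0); /* first write: < MTU */
	send(fd, big, sizeof(big), 0);     /* second write: > MTU, UFO path */
	send(fd, small, sizeof(small), 0); /* third write: < MTU again */
	setsockopt(fd, IPPROTO_UDP, UDP_CORK, &off, sizeof(off)); /* flush */
}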


From 689f77d13532698739438b2288ec8eac2f667584 Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Sun, 20 Oct 2013 15:43:03 +0300
Subject: [PATCH 43/47] ipv6: always prefer rt6i_gateway if present

[ Upstream commit 96dc809514fb2328605198a0602b67554d8cce7b ]

In v3.9, commit 6fd6ce2056de2709 ("ipv6: Do not depend on rt->n in
ip6_finish_output2().") changed the behaviour of ip6_finish_output2()
such that the recently introduced rt6_nexthop() is used
instead of an assigned neighbor.

As rt6_nexthop() prefers rt6i_gateway only for gatewayed
routes, this causes a problem for users like IPVS, xt_TEE and
RAW(hdrincl) if they want to use an address for routing that
differs from the destination address.

Another case is when a redirect creates an RTF_DYNAMIC route
without the RTF_GATEWAY flag; rt6_nexthop() then ignores the
rt6i_gateway.

Fix the above problems by considering rt6i_gateway if it is
present, so that traffic routed to an address on the local subnet
is not wrongly diverted to the destination address.

Thanks to Simon Horman and Phil Oester for spotting the
problematic commit.

Thanks to Hannes Frederic Sowa for his review and help in testing.

Reported-by: Phil Oester <kernel@linuxace.com>
Reported-by: Mark Brooks <mark@loadbalancer.org>
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_route.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index f667248..0aaf0ec 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -198,7 +198,7 @@ static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
 
 static inline struct in6_addr *rt6_nexthop(struct rt6_info *rt, struct in6_addr *dest)
 {
-	if (rt->rt6i_flags & RTF_GATEWAY)
+	if (rt->rt6i_flags & RTF_GATEWAY || !ipv6_addr_any(&rt->rt6i_gateway))
 		return &rt->rt6i_gateway;
 	return dest;
 }
-- 
1.7.11.7


From 471dd605429d6645f990becd29c877740d3b32e7 Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Sun, 20 Oct 2013 15:43:04 +0300
Subject: [PATCH 44/47] ipv6: fill rt6i_gateway with nexthop address

[ Upstream commit 550bab42f83308c9d6ab04a980cc4333cef1c8fa ]

Make sure rt6i_gateway contains nexthop information in
all routes returned from lookup, and in routes directly
attached to skbs for generated ICMP packets.

The effect of this patch should be a faster version of
rt6_nexthop() and the consideration of local addresses as
nexthop.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_route.h | 6 ++----
 net/ipv6/ip6_output.c   | 4 ++--
 net/ipv6/route.c        | 8 ++++++--
 3 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 0aaf0ec..c7b8860 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -196,11 +196,9 @@ static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
 	       skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
 }
 
-static inline struct in6_addr *rt6_nexthop(struct rt6_info *rt, struct in6_addr *dest)
+static inline struct in6_addr *rt6_nexthop(struct rt6_info *rt)
 {
-	if (rt->rt6i_flags & RTF_GATEWAY || !ipv6_addr_any(&rt->rt6i_gateway))
-		return &rt->rt6i_gateway;
-	return dest;
+	return &rt->rt6i_gateway;
 }
 
 #endif
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 2e542d0..5b25f85 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -130,7 +130,7 @@ static int ip6_finish_output2(struct sk_buff *skb)
 	}
 
 	rcu_read_lock_bh();
-	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
+	nexthop = rt6_nexthop((struct rt6_info *)dst);
 	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
 	if (unlikely(!neigh))
 		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
@@ -899,7 +899,7 @@ static int ip6_dst_lookup_tail(struct sock *sk,
 	 */
 	rt = (struct rt6_info *) *dst;
 	rcu_read_lock_bh();
-	n = __ipv6_neigh_lookup_noref(rt->dst.dev, rt6_nexthop(rt, &fl6->daddr));
+	n = __ipv6_neigh_lookup_noref(rt->dst.dev, rt6_nexthop(rt));
 	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
 	rcu_read_unlock_bh();
 
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 8d9a93ed..08e6c40 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -852,7 +852,6 @@ static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort,
 			if (ort->rt6i_dst.plen != 128 &&
 			    ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
 				rt->rt6i_flags |= RTF_ANYCAST;
-			rt->rt6i_gateway = *daddr;
 		}
 
 		rt->rt6i_flags |= RTF_CACHE;
@@ -1270,6 +1269,7 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
 	rt->dst.flags |= DST_HOST;
 	rt->dst.output  = ip6_output;
 	atomic_set(&rt->dst.__refcnt, 1);
+	rt->rt6i_gateway  = fl6->daddr;
 	rt->rt6i_dst.addr = fl6->daddr;
 	rt->rt6i_dst.plen = 128;
 	rt->rt6i_idev     = idev;
@@ -1824,7 +1824,10 @@ static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
 			in6_dev_hold(rt->rt6i_idev);
 		rt->dst.lastuse = jiffies;
 
-		rt->rt6i_gateway = ort->rt6i_gateway;
+		if (ort->rt6i_flags & RTF_GATEWAY)
+			rt->rt6i_gateway = ort->rt6i_gateway;
+		else
+			rt->rt6i_gateway = *dest;
 		rt->rt6i_flags = ort->rt6i_flags;
 		if ((ort->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) ==
 		    (RTF_DEFAULT | RTF_ADDRCONF))
@@ -2111,6 +2114,7 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
 	else
 		rt->rt6i_flags |= RTF_LOCAL;
 
+	rt->rt6i_gateway  = *addr;
 	rt->rt6i_dst.addr = *addr;
 	rt->rt6i_dst.plen = 128;
 	rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
-- 
1.7.11.7


From d01c3be45be54261f56ba63197d94e3d756befdf Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Sun, 20 Oct 2013 15:43:05 +0300
Subject: [PATCH 45/47] netfilter: nf_conntrack: fix rt6i_gateway checks for
 H.323 helper

[ Upstream commit 56e42441ed54b092d6c7411138ce60d049e7c731 ]

Now that rt6_nexthop() can return the nexthop address, we can use it
for a proper nexthop comparison of directly connected destinations.
For more information refer to commit bbb5823cf742a7
("netfilter: nf_conntrack: fix rt_gateway checks for H.323 helper").

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netfilter/nf_conntrack_h323_main.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c
index bdebd03..70866d1 100644
--- a/net/netfilter/nf_conntrack_h323_main.c
+++ b/net/netfilter/nf_conntrack_h323_main.c
@@ -778,8 +778,8 @@ static int callforward_do_filter(const union nf_inet_addr *src,
 				   flowi6_to_flowi(&fl1), false)) {
 			if (!afinfo->route(&init_net, (struct dst_entry **)&rt2,
 					   flowi6_to_flowi(&fl2), false)) {
-				if (!memcmp(&rt1->rt6i_gateway, &rt2->rt6i_gateway,
-					    sizeof(rt1->rt6i_gateway)) &&
+				if (ipv6_addr_equal(rt6_nexthop(rt1),
+						    rt6_nexthop(rt2)) &&
 				    rt1->dst.dev == rt2->dst.dev)
 					ret = 1;
 				dst_release(&rt2->dst);
-- 
1.7.11.7


From 1d98ddb501bedeee62c916d3d6999109f0a22198 Mon Sep 17 00:00:00 2001
From: Hannes Frederic Sowa <hannes@stressinduktion.org>
Date: Mon, 21 Oct 2013 06:17:15 +0200
Subject: [PATCH 46/47] ipv6: probe routes asynchronous in rt6_probe

[ Upstream commit c2f17e827b419918c856131f592df9521e1a38e3 ]

Routes need to be probed asynchronously, otherwise the call stack gets
exhausted when the kernel attempts to deliver another skb inline, as
e.g. xt_TEE does, while we probe at the same time.

We still update neigh->updated right away, otherwise we would send too
many probes.

Cc: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 38 +++++++++++++++++++++++++++++++-------
 1 file changed, 31 insertions(+), 7 deletions(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 08e6c40..1e32d5c 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -477,6 +477,24 @@ out:
 }
 
 #ifdef CONFIG_IPV6_ROUTER_PREF
+struct __rt6_probe_work {
+	struct work_struct work;
+	struct in6_addr target;
+	struct net_device *dev;
+};
+
+static void rt6_probe_deferred(struct work_struct *w)
+{
+	struct in6_addr mcaddr;
+	struct __rt6_probe_work *work =
+		container_of(w, struct __rt6_probe_work, work);
+
+	addrconf_addr_solict_mult(&work->target, &mcaddr);
+	ndisc_send_ns(work->dev, NULL, &work->target, &mcaddr, NULL);
+	dev_put(work->dev);
+	kfree(w);
+}
+
 static void rt6_probe(struct rt6_info *rt)
 {
 	struct neighbour *neigh;
@@ -500,17 +518,23 @@ static void rt6_probe(struct rt6_info *rt)
 
 	if (!neigh ||
 	    time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
-		struct in6_addr mcaddr;
-		struct in6_addr *target;
+		struct __rt6_probe_work *work;
+
+		work = kmalloc(sizeof(*work), GFP_ATOMIC);
 
-		if (neigh) {
+		if (neigh && work)
 			neigh->updated = jiffies;
+
+		if (neigh)
 			write_unlock(&neigh->lock);
-		}
 
-		target = (struct in6_addr *)&rt->rt6i_gateway;
-		addrconf_addr_solict_mult(target, &mcaddr);
-		ndisc_send_ns(rt->dst.dev, NULL, target, &mcaddr, NULL);
+		if (work) {
+			INIT_WORK(&work->work, rt6_probe_deferred);
+			work->target = rt->rt6i_gateway;
+			dev_hold(rt->dst.dev);
+			work->dev = rt->dst.dev;
+			schedule_work(&work->work);
+		}
 	} else {
 out:
 		write_unlock(&neigh->lock);
-- 
1.7.11.7
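
The deferral follows the common embed-a-work_struct pattern; a generic
hedged restatement of the shape used in the hunk above:

/* Queue-time: allocate with GFP_ATOMIC (a lock may be held), stash
 * the arguments, take a device reference, then schedule_work().
 * Handler-time: runs from a kworker with a fresh stack, so probing
 * cannot recurse into packet delivery; release the ref and free. */
static void probe_deferred(struct work_struct *w)
{
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	/* ... send the neighbour solicitation from process context ... */
	dev_put(work->dev);
	kfree(work);
}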


From d7710f5e65b37ec3ac09dde758141e81fa47315d Mon Sep 17 00:00:00 2001
From: Mariusz Ceier <mceier+kernel@gmail.com>
Date: Mon, 21 Oct 2013 19:45:04 +0200
Subject: [PATCH 47/47] davinci_emac.c: Fix IFF_ALLMULTI setup

[ Upstream commit d69e0f7ea95fef8059251325a79c004bac01f018 ]

When IFF_ALLMULTI flag is set on interface and IFF_PROMISC isn't,
emac_dev_mcast_set should only enable RX of multicasts and reset
MACHASH registers.

It does this, but afterwards it either sets up multicast MAC
filtering or disables RX of multicasts and resets the MACHASH registers
again, rendering the IFF_ALLMULTI flag useless.

This patch fixes emac_dev_mcast_set so that multicast MAC filtering and
disabling of RX of multicasts are skipped when the IFF_ALLMULTI flag is set.

Tested with kernel 2.6.37.

Signed-off-by: Mariusz Ceier <mceier+kernel@gmail.com>
Acked-by: Mugunthan V N <mugunthanvnm@ti.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ti/davinci_emac.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/ti/davinci_emac.c b/drivers/net/ethernet/ti/davinci_emac.c
index 1a222bce..45c167f 100644
--- a/drivers/net/ethernet/ti/davinci_emac.c
+++ b/drivers/net/ethernet/ti/davinci_emac.c
@@ -876,8 +876,7 @@ static void emac_dev_mcast_set(struct net_device *ndev)
 		    netdev_mc_count(ndev) > EMAC_DEF_MAX_MULTICAST_ADDRESSES) {
 			mbp_enable = (mbp_enable | EMAC_MBP_RXMCAST);
 			emac_add_mcast(priv, EMAC_ALL_MULTI_SET, NULL);
-		}
-		if (!netdev_mc_empty(ndev)) {
+		} else if (!netdev_mc_empty(ndev)) {
 			struct netdev_hw_addr *ha;
 
 			mbp_enable = (mbp_enable | EMAC_MBP_RXMCAST);
-- 
1.7.11.7