@@ -1224,6 +1224,7 @@ static struct sk_buff *__mptcp_do_alloc_tx_skb(struct sock *sk, gfp_t gfp)
 		if (likely(__mptcp_add_ext(skb, gfp))) {
 			skb_reserve(skb, MAX_TCP_HEADER);
 			skb->reserved_tailroom = skb->end - skb->tail;
+			INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
 			return skb;
 		}
 		__kfree_skb(skb);
@@ -1233,31 +1234,23 @@ static struct sk_buff *__mptcp_do_alloc_tx_skb(struct sock *sk, gfp_t gfp)
 	return NULL;
 }
 
-static bool __mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, gfp_t gfp)
+static struct sk_buff *__mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, gfp_t gfp)
 {
 	struct sk_buff *skb;
 
-	if (ssk->sk_tx_skb_cache) {
-		skb = ssk->sk_tx_skb_cache;
-		if (unlikely(!skb_ext_find(skb, SKB_EXT_MPTCP) &&
-			     !__mptcp_add_ext(skb, gfp)))
-			return false;
-		return true;
-	}
-
 	skb = __mptcp_do_alloc_tx_skb(sk, gfp);
 	if (!skb)
-		return false;
+		return NULL;
 
 	if (likely(sk_wmem_schedule(ssk, skb->truesize))) {
-		ssk->sk_tx_skb_cache = skb;
-		return true;
+		tcp_skb_entail(ssk, skb);
+		return skb;
 	}
 	kfree_skb(skb);
-	return false;
+	return NULL;
 }
 
-static bool mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, bool data_lock_held)
+static struct sk_buff *mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, bool data_lock_held)
 {
 	gfp_t gfp = data_lock_held ? GFP_ATOMIC : sk->sk_allocation;
 
@@ -1287,23 +1280,29 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
 			      struct mptcp_sendmsg_info *info)
 {
 	u64 data_seq = dfrag->data_seq + info->sent;
+	int offset = dfrag->offset + info->sent;
 	struct mptcp_sock *msk = mptcp_sk(sk);
 	bool zero_window_probe = false;
 	struct mptcp_ext *mpext = NULL;
-	struct sk_buff *skb, *tail;
-	bool must_collapse = false;
-	int size_bias = 0;
-	int avail_size;
-	size_t ret = 0;
+	bool can_coalesce = false;
+	bool reuse_skb = true;
+	struct sk_buff *skb;
+	size_t copy;
+	int i;
 
 	pr_debug("msk=%p ssk=%p sending dfrag at seq=%llu len=%u already sent=%u",
 		 msk, ssk, dfrag->data_seq, dfrag->data_len, info->sent);
 
+	if (WARN_ON_ONCE(info->sent > info->limit ||
+			 info->limit > dfrag->data_len))
+		return 0;
+
 	/* compute send limit */
 	info->mss_now = tcp_send_mss(ssk, &info->size_goal, info->flags);
-	avail_size = info->size_goal;
+	copy = info->size_goal;
+
 	skb = tcp_write_queue_tail(ssk);
-	if (skb) {
+	if (skb && copy > skb->len) {
 		/* Limit the write to the size available in the
 		 * current skb, if any, so that we create at most a new skb.
 		 * Explicitly tells TCP internals to avoid collapsing on later
@@ -1316,62 +1315,80 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
 			goto alloc_skb;
 		}
 
-		must_collapse = (info->size_goal > skb->len) &&
-				(skb_shinfo(skb)->nr_frags < sysctl_max_skb_frags);
-		if (must_collapse) {
-			size_bias = skb->len;
-			avail_size = info->size_goal - skb->len;
+		i = skb_shinfo(skb)->nr_frags;
+		can_coalesce = skb_can_coalesce(skb, i, dfrag->page, offset);
+		if (!can_coalesce && i >= sysctl_max_skb_frags) {
+			tcp_mark_push(tcp_sk(ssk), skb);
+			goto alloc_skb;
 		}
-	}
 
+		copy -= skb->len;
+	} else {
 alloc_skb:
-	if (!must_collapse &&
-	    !mptcp_alloc_tx_skb(sk, ssk, info->data_lock_held))
-		return 0;
+		skb = mptcp_alloc_tx_skb(sk, ssk, info->data_lock_held);
+		if (!skb)
+			return -ENOMEM;
+
+		i = skb_shinfo(skb)->nr_frags;
+		reuse_skb = false;
+		mpext = skb_ext_find(skb, SKB_EXT_MPTCP);
+	}
 
 	/* Zero window and all data acked? Probe. */
-	avail_size = mptcp_check_allowed_size(msk, data_seq, avail_size);
-	if (avail_size == 0) {
+	copy = mptcp_check_allowed_size(msk, data_seq, copy);
+	if (copy == 0) {
 		u64 snd_una = READ_ONCE(msk->snd_una);
 
-		if (skb || snd_una != msk->snd_nxt)
+		if (snd_una != msk->snd_nxt) {
+			tcp_remove_empty_skb(ssk, tcp_write_queue_tail(ssk));
 			return 0;
+		}
+
 		zero_window_probe = true;
 		data_seq = snd_una - 1;
-		avail_size = 1;
-	}
+		copy = 1;
 
-	if (WARN_ON_ONCE(info->sent > info->limit ||
-			 info->limit > dfrag->data_len))
-		return 0;
+		/* all mptcp-level data is acked, no skbs should be present into the
+		 * ssk write queue
+		 */
+		WARN_ON_ONCE(reuse_skb);
+	}
 
-	ret = info->limit - info->sent;
-	tail = tcp_build_frag(ssk, avail_size + size_bias, info->flags,
-			      dfrag->page, dfrag->offset + info->sent, &ret);
-	if (!tail) {
-		tcp_remove_empty_skb(sk, tcp_write_queue_tail(ssk));
+	copy = min_t(size_t, copy, info->limit - info->sent);
+	if (!sk_wmem_schedule(ssk, copy)) {
+		tcp_remove_empty_skb(ssk, tcp_write_queue_tail(ssk));
 		return -ENOMEM;
 	}
 
-	/* if the tail skb is still the cached one, collapsing really happened.
-	 */
-	if (skb == tail) {
-		TCP_SKB_CB(tail)->tcp_flags &= ~TCPHDR_PSH;
-		mpext->data_len += ret;
+	if (can_coalesce) {
+		skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
+	} else {
+		get_page(dfrag->page);
+		skb_fill_page_desc(skb, i, dfrag->page, offset, copy);
+	}
+
+	skb->len += copy;
+	skb->data_len += copy;
+	skb->truesize += copy;
+	sk_wmem_queued_add(ssk, copy);
+	sk_mem_charge(ssk, copy);
+	skb->ip_summed = CHECKSUM_PARTIAL;
+	WRITE_ONCE(tcp_sk(ssk)->write_seq, tcp_sk(ssk)->write_seq + copy);
+	TCP_SKB_CB(skb)->end_seq += copy;
+	tcp_skb_pcount_set(skb, 0);
+
+	/* on skb reuse we just need to update the DSS len */
+	if (reuse_skb) {
+		TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
+		mpext->data_len += copy;
 		WARN_ON_ONCE(zero_window_probe);
 		goto out;
 	}
 
-	mpext = skb_ext_find(tail, SKB_EXT_MPTCP);
-	if (WARN_ON_ONCE(!mpext)) {
-		/* should never reach here, stream corrupted */
-		return -EINVAL;
-	}
-
 	memset(mpext, 0, sizeof(*mpext));
 	mpext->data_seq = data_seq;
 	mpext->subflow_seq = mptcp_subflow_ctx(ssk)->rel_write_seq;
-	mpext->data_len = ret;
+	mpext->data_len = copy;
 	mpext->use_map = 1;
 	mpext->dsn64 = 1;
 
@@ -1380,18 +1397,18 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
 		 mpext->dsn64);
 
 	if (zero_window_probe) {
-		mptcp_subflow_ctx(ssk)->rel_write_seq += ret;
+		mptcp_subflow_ctx(ssk)->rel_write_seq += copy;
 		mpext->frozen = 1;
 		if (READ_ONCE(msk->csum_enabled))
-			mptcp_update_data_checksum(tail, ret);
+			mptcp_update_data_checksum(skb, copy);
 		tcp_push_pending_frames(ssk);
 		return 0;
 	}
 out:
 	if (READ_ONCE(msk->csum_enabled))
-		mptcp_update_data_checksum(tail, ret);
-	mptcp_subflow_ctx(ssk)->rel_write_seq += ret;
-	return ret;
+		mptcp_update_data_checksum(skb, copy);
+	mptcp_subflow_ctx(ssk)->rel_write_seq += copy;
+	return copy;
 }
 
 #define MPTCP_SEND_BURST_SIZE	((1 << 16) - \