Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next

Daniel Borkmann says: ==================== bpf-next 2018-11-30 The following pull-request contains BPF updates for your *net-next* tree. (Getting out bit earlier this time to pull in a dependency from bpf.) The main changes are: 1) Add libbpf ABI versioning and document API naming conventions as well as ABI versioning process, from Andrey. 2) Add a new sk_msg_pop_data() helper for sk_msg based BPF programs that is used in conjunction with sk_msg_push_data() for adding / removing meta data to the msg data, from John. 3) Optimize convert_bpf_ld_abs() for 0 offset and fix various lib and testsuite build failures on 32 bit, from David. 4) Make BPF prog dump for !JIT identical to how we dump subprogs when JIT is in use, from Yonghong. 5) Rename btf_get_from_id() to make it more conform with libbpf API naming conventions, from Martin. 6) Add a missing BPF kselftest config item, from Naresh. ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
2024-12-28 13:51:44 +00:00 · 2018-11-29 18:15:07 -08:00 · 2018-11-29 18:15:07 -08:00 · 93029d7d40
commit 93029d7d40
parent 3d58c9c9f7 b42699547f
25 changed files with 760 additions and 115 deletions
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@ -299,7 +299,8 @@ struct bpf_prog_aux {
 	u32 max_pkt_offset;
 	u32 stack_depth;
 	u32 id;
-	u32 func_cnt;
+	u32 func_cnt; /* used by non-func prog as the number of func progs */
+	u32 func_idx; /* 0 for non-func prog, the index in func array for func prog */
 	bool offload_requested;
 	struct bpf_prog **func;
 	void *jit_data; /* JIT specific data. arch dependent */
@ -317,7 +318,8 @@ struct bpf_prog_aux {
 #endif
 	struct bpf_prog_offload *offload;
 	struct btf *btf;
-	u32 type_id; /* type id for this prog/func */
+	struct bpf_func_info *func_info;
+	u32 func_info_cnt;
 	union {
 		struct work_struct work;
 		struct rcu_head	rcu;
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@ -204,7 +204,6 @@ static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log)
 struct bpf_subprog_info {
 	u32 start; /* insn idx of function entry point */
 	u16 stack_depth; /* max. stack depth used by this function */
-	u32 type_id; /* btf type_id for this subprog */
 };

 /* single container for all structs
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@ -2268,6 +2268,19 @@ union bpf_attr {
 *
 *	Return
 *		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_msg_pop_data(struct sk_msg_buff *msg, u32 start, u32 pop, u64 flags)
+ *	 Description
+ *		Will remove *pop* bytes from a *msg* starting at byte *start*.
+ *		This may result in **ENOMEM** errors under certain situations if
+ *		an allocation and copy are required due to a full ring buffer.
+ *		However, the helper will try to avoid doing the allocation
+ *		if possible. Other errors can occur if input parameters are
+ *		invalid either due to *start* byte not being valid part of msg
+ *		payload and/or *pop* value being too large.
+ *
+ *	Return
+ *		0 on success, or a negative error in case of failure.
 */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@ -2360,7 +2373,8 @@ union bpf_attr {
 	FN(map_push_elem),		\
 	FN(map_pop_elem),		\
 	FN(map_peek_elem),		\
-	FN(msg_push_data),
+	FN(msg_push_data),		\
+	FN(msg_pop_data),

 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
 * function eBPF program intends to call
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@ -411,7 +411,8 @@ static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym)

 	/* prog->aux->name will be ignored if full btf name is available */
 	if (prog->aux->btf) {
-		type = btf_type_by_id(prog->aux->btf, prog->aux->type_id);
+		type = btf_type_by_id(prog->aux->btf,
+				      prog->aux->func_info[prog->aux->func_idx].type_id);
 		func_name = btf_name_by_offset(prog->aux->btf, type->name_off);
 		snprintf(sym, (size_t)(end - sym), "_%s", func_name);
 		return;
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@ -1214,6 +1214,7 @@ static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
 		bpf_prog_free_id(prog, do_idr_lock);
 		bpf_prog_kallsyms_del_all(prog);
 		btf_put(prog->aux->btf);
+		kvfree(prog->aux->func_info);

 		call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
 	}
@ -2219,46 +2220,28 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 	}

 	if (prog->aux->btf) {
+		u32 krec_size = sizeof(struct bpf_func_info);
 		u32 ucnt, urec_size;

 		info.btf_id = btf_id(prog->aux->btf);

 		ucnt = info.func_info_cnt;
-		info.func_info_cnt = prog->aux->func_cnt ? : 1;
+		info.func_info_cnt = prog->aux->func_info_cnt;
 		urec_size = info.func_info_rec_size;
-		info.func_info_rec_size = sizeof(struct bpf_func_info);
+		info.func_info_rec_size = krec_size;
 		if (ucnt) {
 			/* expect passed-in urec_size is what the kernel expects */
 			if (urec_size != info.func_info_rec_size)
 				return -EINVAL;

 			if (bpf_dump_raw_ok()) {
-				struct bpf_func_info kern_finfo;
 				char __user *user_finfo;
-				u32 i, insn_offset;

 				user_finfo = u64_to_user_ptr(info.func_info);
-				if (prog->aux->func_cnt) {
-					ucnt = min_t(u32, info.func_info_cnt, ucnt);
-					insn_offset = 0;
-					for (i = 0; i < ucnt; i++) {
-						kern_finfo.insn_offset = insn_offset;
-						kern_finfo.type_id = prog->aux->func[i]->aux->type_id;
-						if (copy_to_user(user_finfo, &kern_finfo,
-								 sizeof(kern_finfo)))
-							return -EFAULT;
-
-						/* func[i]->len holds the prog len */
-						insn_offset += prog->aux->func[i]->len;
-						user_finfo += urec_size;
-					}
-				} else {
-					kern_finfo.insn_offset = 0;
-					kern_finfo.type_id = prog->aux->type_id;
-					if (copy_to_user(user_finfo, &kern_finfo,
-							 sizeof(kern_finfo)))
-						return -EFAULT;
-				}
+				ucnt = min_t(u32, info.func_info_cnt, ucnt);
+				if (copy_to_user(user_finfo, prog->aux->func_info,
+						 krec_size * ucnt))
+					return -EFAULT;
 			} else {
 				info.func_info_cnt = 0;
 			}
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@ -4650,7 +4650,7 @@ static int check_btf_func(struct bpf_prog *prog, struct bpf_verifier_env *env,
 {
 	u32 i, nfuncs, urec_size, min_size, prev_offset;
 	u32 krec_size = sizeof(struct bpf_func_info);
-	struct bpf_func_info krecord = {};
+	struct bpf_func_info *krecord = NULL;
 	const struct btf_type *type;
 	void __user *urecord;
 	struct btf *btf;
@ -4682,6 +4682,12 @@ static int check_btf_func(struct bpf_prog *prog, struct bpf_verifier_env *env,
 	urecord = u64_to_user_ptr(attr->func_info);
 	min_size = min_t(u32, krec_size, urec_size);

+	krecord = kvcalloc(nfuncs, krec_size, GFP_KERNEL | __GFP_NOWARN);
+	if (!krecord) {
+		ret = -ENOMEM;
+		goto free_btf;
+	}
+
 	for (i = 0; i < nfuncs; i++) {
 		ret = bpf_check_uarg_tail_zero(urecord, krec_size, urec_size);
 		if (ret) {
@ -4696,59 +4702,69 @@ static int check_btf_func(struct bpf_prog *prog, struct bpf_verifier_env *env,
 			goto free_btf;
 		}

-		if (copy_from_user(&krecord, urecord, min_size)) {
+		if (copy_from_user(&krecord[i], urecord, min_size)) {
 			ret = -EFAULT;
 			goto free_btf;
 		}

 		/* check insn_offset */
 		if (i == 0) {
-			if (krecord.insn_offset) {
+			if (krecord[i].insn_offset) {
 				verbose(env,
 					"nonzero insn_offset %u for the first func info record",
-					krecord.insn_offset);
+					krecord[i].insn_offset);
 				ret = -EINVAL;
 				goto free_btf;
 			}
-		} else if (krecord.insn_offset <= prev_offset) {
+		} else if (krecord[i].insn_offset <= prev_offset) {
 			verbose(env,
 				"same or smaller insn offset (%u) than previous func info record (%u)",
-				krecord.insn_offset, prev_offset);
+				krecord[i].insn_offset, prev_offset);
 			ret = -EINVAL;
 			goto free_btf;
 		}

-		if (env->subprog_info[i].start != krecord.insn_offset) {
+		if (env->subprog_info[i].start != krecord[i].insn_offset) {
 			verbose(env, "func_info BTF section doesn't match subprog layout in BPF program\n");
 			ret = -EINVAL;
 			goto free_btf;
 		}

 		/* check type_id */
-		type = btf_type_by_id(btf, krecord.type_id);
+		type = btf_type_by_id(btf, krecord[i].type_id);
 		if (!type || BTF_INFO_KIND(type->info) != BTF_KIND_FUNC) {
 			verbose(env, "invalid type id %d in func info",
-				krecord.type_id);
+				krecord[i].type_id);
 			ret = -EINVAL;
 			goto free_btf;
 		}

-		if (i == 0)
-			prog->aux->type_id = krecord.type_id;
-		env->subprog_info[i].type_id = krecord.type_id;
-
-		prev_offset = krecord.insn_offset;
+		prev_offset = krecord[i].insn_offset;
 		urecord += urec_size;
 	}

 	prog->aux->btf = btf;
+	prog->aux->func_info = krecord;
+	prog->aux->func_info_cnt = nfuncs;
 	return 0;

 free_btf:
 	btf_put(btf);
+	kvfree(krecord);
 	return ret;
 }

+/* Rewrite the insn_offset of each func_info record with the start index
+ * currently stored for the matching subprog in env->subprog_info.
+ * Does nothing when the program carries no func_info array.
+ */
+static void adjust_btf_func(struct bpf_verifier_env *env)
+{
+	struct bpf_func_info *finfo = env->prog->aux->func_info;
+	int idx = 0;
+
+	if (!finfo)
+		return;
+
+	while (idx < env->subprog_cnt) {
+		finfo[idx].insn_offset = env->subprog_info[idx].start;
+		idx++;
+	}
+}
+
 /* check %cur's range satisfies %old's */
 static bool range_within(struct bpf_reg_state *old,
 			 struct bpf_reg_state *cur)
@ -6043,15 +6059,17 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 		if (bpf_prog_calc_tag(func[i]))
 			goto out_free;
 		func[i]->is_func = 1;
+		func[i]->aux->func_idx = i;
+		/* the btf and func_info will be freed only at prog->aux */
+		func[i]->aux->btf = prog->aux->btf;
+		func[i]->aux->func_info = prog->aux->func_info;
+
 		/* Use bpf_prog_F_tag to indicate functions in stack traces.
 		 * Long term would need debug info to populate names
 		 */
 		func[i]->aux->name[0] = 'F';
 		func[i]->aux->stack_depth = env->subprog_info[i].stack_depth;
 		func[i]->jit_requested = 1;
-		/* the btf will be freed only at prog->aux */
-		func[i]->aux->btf = prog->aux->btf;
-		func[i]->aux->type_id = env->subprog_info[i].type_id;
 		func[i] = bpf_int_jit_compile(func[i]);
 		if (!func[i]->jited) {
 			err = -ENOTSUPP;
@ -6572,6 +6590,9 @@ skip_full_check:
 		convert_pseudo_ld_imm64(env);
 	}

+	if (ret == 0)
+		adjust_btf_func(env);
+
 err_release_maps:
 	if (!env->prog->aux->used_maps)
 		/* if we didn't copy map pointers into bpf_prog_info, release
--- a/net/core/filter.c
+++ b/net/core/filter.c
@ -463,7 +463,8 @@ static bool convert_bpf_ld_abs(struct sock_filter *fp, struct bpf_insn **insnp)
 		bool ldx_off_ok = offset <= S16_MAX;

 		*insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_H);
-		*insn++ = BPF_ALU64_IMM(BPF_SUB, BPF_REG_TMP, offset);
+		if (offset)
+			*insn++ = BPF_ALU64_IMM(BPF_SUB, BPF_REG_TMP, offset);
 		*insn++ = BPF_JMP_IMM(BPF_JSLT, BPF_REG_TMP,
 				      size, 2 + endian + (!ldx_off_ok * 2));
 		if (ldx_off_ok) {
@ -2424,6 +2425,174 @@ static const struct bpf_func_proto bpf_msg_push_data_proto = {
 	.arg4_type	= ARG_ANYTHING,
 };

+/* Drop the scatterlist element at ring index @i and close the gap by
+ * copying every following element one slot to the left, up to and
+ * including the slot at msg->sg.end, then step the end index back by one.
+ *
+ * The element at @i is overwritten, so the page reference it holds is
+ * released first; otherwise that reference would be leaked.
+ */
+static void sk_msg_shift_left(struct sk_msg *msg, int i)
+{
+	struct scatterlist *sge = sk_msg_elem(msg, i);
+	int prev;
+
+	/* Must happen before the loop: the first copy clobbers this slot. */
+	put_page(sg_page(sge));
+
+	do {
+		prev = i;
+		sk_msg_iter_var_next(i);
+		msg->sg.data[prev] = msg->sg.data[i];
+	} while (i != msg->sg.end);
+
+	sk_msg_iter_prev(msg, end);
+}
+
+/* Move the scatterlist elements starting at ring index @i one slot to the
+ * right. Slot @i itself is not rewritten here; the first store goes to the
+ * slot after it, so the caller is expected to fill slot @i afterwards
+ * (bpf_msg_pop_data() does so with sg_set_page()).
+ */
+static void sk_msg_shift_right(struct sk_msg *msg, int i)
+{
+	struct scatterlist tmp, sge;
+
+	/* NOTE(review): sk_msg_iter_next() is defined elsewhere; presumably it
+	 * advances msg->sg.end by one slot to make room -- confirm.
+	 */
+	sk_msg_iter_next(msg, end);
+	/* sge holds the element to store next, tmp the one following it, so
+	 * that a slot's old contents are saved before it is overwritten.
+	 */
+	sge = sk_msg_elem_cpy(msg, i);
+	sk_msg_iter_var_next(i);
+	tmp = sk_msg_elem_cpy(msg, i);
+
+	while (i != msg->sg.end) {
+		msg->sg.data[i] = sge;
+		sk_msg_iter_var_next(i);
+		sge = tmp;
+		tmp = sk_msg_elem_cpy(msg, i);
+	}
+}
+
+/* bpf_msg_pop_data() - remove @len bytes from @msg starting at byte @start.
+ *
+ * @msg:   message whose scatterlist ring is edited in place
+ * @start: offset of the first byte to remove, relative to the message start
+ * @len:   number of bytes to remove
+ * @flags: must be zero
+ *
+ * Return: 0 on success, -EINVAL when @flags is non-zero or the requested
+ * range is not inside the message, -ENOMEM when the ring is full and the
+ * page needed to compact a split element cannot be allocated.
+ */
+BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start,
+	   u32, len, u64, flags)
+{
+	u32 i = 0, l, space, offset = 0;
+	/* Widen before adding: a 32-bit start + len could wrap around and
+	 * slip past the bounds check below.
+	 */
+	u64 last = (u64)start + len;
+	int pop;
+
+	if (unlikely(flags))
+		return -EINVAL;
+
+	/* First find the starting scatterlist element */
+	i = msg->sg.start;
+	do {
+		l = sk_msg_elem(msg, i)->length;
+
+		if (start < offset + l)
+			break;
+		offset += l;
+		sk_msg_iter_var_next(i);
+	} while (i != msg->sg.end);
+
+	/* Bounds checks: start and pop must be inside message */
+	if (start >= offset + l || last >= msg->sg.size)
+		return -EINVAL;
+
+	space = MAX_MSG_FRAGS - sk_msg_elem_used(msg);
+
+	pop = len;
+	/* --------------| offset
+	 * -| start      |-------- len -------|
+	 *
+	 *  |----- a ----|-------- pop -------|----- b ----|
+	 *  |______________________________________________| length
+	 *
+	 *
+	 * a:   region at front of scatter element to save
+	 * b:   region at back of scatter element to save when length > A + pop
+	 * pop: region to pop from element, same as input 'pop' here will be
+	 *      decremented below per iteration.
+	 *
+	 * Two top-level cases to handle when start != offset, first B is non
+	 * zero and second B is zero corresponding to when a pop includes more
+	 * than one element.
+	 *
+	 * Then if B is non-zero AND there is no space allocate space and
+	 * compact A, B regions into page. If there is space shift ring to
+	 * the right free'ing the next element in ring to place B, leaving
+	 * A untouched except to reduce length.
+	 */
+	if (start != offset) {
+		struct scatterlist *nsge, *sge = sk_msg_elem(msg, i);
+		/* start is relative to the whole message, a must be relative
+		 * to this element, so subtract the bytes of earlier elements.
+		 */
+		int a = start - offset;
+		int b = sge->length - pop - a;
+
+		sk_msg_iter_var_next(i);
+
+		if (pop < sge->length - a) {
+			if (space) {
+				sge->length = a;
+				sk_msg_shift_right(msg, i);
+				nsge = sk_msg_elem(msg, i);
+				/* A and B now share the page, take a second ref */
+				get_page(sg_page(sge));
+				sg_set_page(nsge,
+					    sg_page(sge),
+					    b, sge->offset + pop + a);
+			} else {
+				struct page *page, *orig;
+				u8 *to, *from;
+
+				page = alloc_pages(__GFP_NOWARN |
+						   __GFP_COMP   | GFP_ATOMIC,
+						   get_order(a + b));
+				if (unlikely(!page))
+					return -ENOMEM;
+
+				sge->length = a;
+				orig = sg_page(sge);
+				from = sg_virt(sge);
+				to = page_address(page);
+				memcpy(to, from, a);
+				memcpy(to + a, from + a + pop, b);
+				sg_set_page(sge, page, a + b, 0);
+				put_page(orig);
+			}
+			pop = 0;
+		} else if (pop >= sge->length - a) {
+			/* Account for the popped tail before truncating the
+			 * element; in the other order the difference is
+			 * always zero and pop is never reduced.
+			 */
+			pop -= (sge->length - a);
+			sge->length = a;
+		}
+	}
+
+	/* From above the current layout _must_ be as follows,
+	 *
+	 * -| offset
+	 * -| start
+	 *
+	 *  |---- pop ---|---------------- b ------------|
+	 *  |____________________________________________| length
+	 *
+	 * Offset and start of the current msg elem are equal because in the
+	 * previous case we handled offset != start and either consumed the
+	 * entire element and advanced to the next element OR pop == 0.
+	 *
+	 * Two cases to handle here are first pop is less than the length
+	 * leaving some remainder b above. Simply adjust the element's layout
+	 * in this case. Or pop >= length of the element so that b = 0. In this
+	 * case drop the element and continue with its successor.
+	 */
+	while (pop) {
+		struct scatterlist *sge = sk_msg_elem(msg, i);
+
+		if (pop < sge->length) {
+			sge->length -= pop;
+			sge->offset += pop;
+			pop = 0;
+		} else {
+			pop -= sge->length;
+			/* The shift moves the following element into slot i,
+			 * so i must not be advanced here or that element
+			 * would be skipped.
+			 */
+			sk_msg_shift_left(msg, i);
+		}
+	}
+
+	sk_mem_uncharge(msg->sk, len - pop);
+	msg->sg.size -= (len - pop);
+	sk_msg_compute_data_pointers(msg);
+	return 0;
+}
+
+/* Prototype describing bpf_msg_pop_data(): one context pointer argument
+ * followed by three ARG_ANYTHING arguments (start, len and flags in the
+ * BPF_CALL_4 definition), an integer return value, and gpl_only unset.
+ */
+static const struct bpf_func_proto bpf_msg_pop_data_proto = {
+	.func		= bpf_msg_pop_data,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_ANYTHING,
+	.arg4_type	= ARG_ANYTHING,
+};
+
 BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb)
 {
 	return task_get_classid(skb);
@ -5097,6 +5266,7 @@ bool bpf_helper_changes_pkt_data(void *func)
 	    func == bpf_xdp_adjust_meta ||
 	    func == bpf_msg_pull_data ||
 	    func == bpf_msg_push_data ||
+	    func == bpf_msg_pop_data ||
 	    func == bpf_xdp_adjust_tail ||
 #if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
 	    func == bpf_lwt_seg6_store_bytes ||
@ -5393,6 +5563,8 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_msg_pull_data_proto;
 	case BPF_FUNC_msg_push_data:
 		return &bpf_msg_push_data_proto;
+	case BPF_FUNC_msg_pop_data:
+		return &bpf_msg_pop_data_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@ -289,12 +289,23 @@ static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock,
 {
 	bool cork = false, enospc = msg->sg.start == msg->sg.end;
 	struct sock *sk_redir;
-	u32 tosend;
+	u32 tosend, delta = 0;
 	int ret;

 more_data:
-	if (psock->eval == __SK_NONE)
+	if (psock->eval == __SK_NONE) {
+		/* Track delta in msg size to add/subtract it on SK_DROP from
+		 * returned to user copied size. This ensures user doesn't
+		 * get a positive return code with msg_cut_data and SK_DROP
+		 * verdict.
+		 */
+		delta = msg->sg.size;
 		psock->eval = sk_psock_msg_verdict(sk, psock, msg);
+		if (msg->sg.size < delta)
+			delta -= msg->sg.size;
+		else
+			delta = 0;
+	}

 	if (msg->cork_bytes &&
 	    msg->cork_bytes > msg->sg.size && !enospc) {
@ -350,7 +361,7 @@ more_data:
 	default:
 		sk_msg_free_partial(sk, msg, tosend);
 		sk_msg_apply_bytes(psock, tosend);
-		*copied -= tosend;
+		*copied -= (tosend + delta);
 		return -EACCES;
 	}

--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@ -687,6 +687,7 @@ static int bpf_exec_tx_verdict(struct sk_msg *msg, struct sock *sk,
 	struct sock *sk_redir;
 	struct tls_rec *rec;
 	int err = 0, send;
+	u32 delta = 0;
 	bool enospc;

 	psock = sk_psock_get(sk);
@ -694,8 +695,14 @@ static int bpf_exec_tx_verdict(struct sk_msg *msg, struct sock *sk,
 		return tls_push_record(sk, flags, record_type);
 more_data:
 	enospc = sk_msg_full(msg);
-	if (psock->eval == __SK_NONE)
+	if (psock->eval == __SK_NONE) {
+		/* Remember the size before the verdict program runs; if the
+		 * program shrank the message, delta becomes the number of
+		 * bytes removed and is subtracted from *copied further down.
+		 * Growth (or no change) leaves delta at zero.
+		 */
+		delta = msg->sg.size;
+		psock->eval = sk_psock_msg_verdict(sk, psock, msg);
+		if (msg->sg.size < delta)
+			delta -= msg->sg.size;
+		else
+			delta = 0;
+	}
 	if (msg->cork_bytes && msg->cork_bytes > msg->sg.size &&
 	    !enospc && !full_record) {
 		err = -ENOSPC;
@ -743,7 +750,7 @@ more_data:
 			msg->apply_bytes -= send;
 		if (msg->sg.size == 0)
 			tls_free_open_rec(sk);
-		*copied -= send;
+		*copied -= (send + delta);
 		err = -EACCES;
 	}

--- a/tools/bpf/bpftool/map.c
+++ b/tools/bpf/bpftool/map.c
@ -713,7 +713,7 @@ static int do_dump(int argc, char **argv)

 	prev_key = NULL;

-	err = btf_get_from_id(info.btf_id, &btf);
+	err = btf__get_from_id(info.btf_id, &btf);
 	if (err) {
 		p_err("failed to get btf");
 		goto exit_free;
@ -857,7 +857,7 @@ static int do_lookup(int argc, char **argv)
 	}

 	/* here means bpf_map_lookup_elem() succeeded */
-	err = btf_get_from_id(info.btf_id, &btf);
+	err = btf__get_from_id(info.btf_id, &btf);
 	if (err) {
 		p_err("failed to get btf");
 		goto exit_free;
--- a/tools/bpf/bpftool/prog.c
+++ b/tools/bpf/bpftool/prog.c
@ -621,7 +621,7 @@ static int do_dump(int argc, char **argv)
 		goto err_free;
 	}

-	if (info.btf_id && btf_get_from_id(info.btf_id, &btf)) {
+	if (info.btf_id && btf__get_from_id(info.btf_id, &btf)) {
 		p_err("failed to get btf");
 		goto err_free;
 	}
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@ -2268,6 +2268,19 @@ union bpf_attr {
 *
 *	Return
 *		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_msg_pop_data(struct sk_msg_buff *msg, u32 start, u32 pop, u64 flags)
+ *	 Description
+ *		Will remove *pop* bytes from a *msg* starting at byte *start*.
+ *		This may result in **ENOMEM** errors under certain situations if
+ *		an allocation and copy are required due to a full ring buffer.
+ *		However, the helper will try to avoid doing the allocation
+ *		if possible. Other errors can occur if input parameters are
+ *		invalid either due to *start* byte not being valid part of msg
+ *		payload and/or *pop* value being too large.
+ *
+ *	Return
+ *		0 on success, or a negative error in case of failure.
 */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@ -2360,7 +2373,8 @@ union bpf_attr {
 	FN(map_push_elem),		\
 	FN(map_pop_elem),		\
 	FN(map_peek_elem),		\
-	FN(msg_push_data),
+	FN(msg_push_data),		\
+	FN(msg_pop_data),

 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
 * function eBPF program intends to call
--- a/tools/lib/bpf/Makefile
+++ b/tools/lib/bpf/Makefile
@ -145,6 +145,12 @@ include $(srctree)/tools/build/Makefile.include

 BPF_IN    := $(OUTPUT)libbpf-in.o
 LIB_FILE := $(addprefix $(OUTPUT),$(LIB_FILE))
+VERSION_SCRIPT := libbpf.map
+
+GLOBAL_SYM_COUNT = $(shell readelf -s $(BPF_IN) | \
+			   awk '/GLOBAL/ && /DEFAULT/ && !/UND/ {s++} END{print s}')
+VERSIONED_SYM_COUNT = $(shell readelf -s $(OUTPUT)libbpf.so | \
+			      grep -Eo '[^ ]+@LIBBPF_' | cut -d@ -f1 | sort -u | wc -l)

 CMD_TARGETS = $(LIB_FILE)

@ -158,7 +164,7 @@ TARGETS = $(CMD_TARGETS)

 all: fixdep all_cmd

-all_cmd: $(CMD_TARGETS)
+all_cmd: $(CMD_TARGETS) check

 $(BPF_IN): force elfdep bpfdep
 	@(test -f ../../include/uapi/linux/bpf.h -a -f ../../../include/uapi/linux/bpf.h && ( \
@ -176,7 +182,8 @@ $(BPF_IN): force elfdep bpfdep
 	$(Q)$(MAKE) $(build)=libbpf

 $(OUTPUT)libbpf.so: $(BPF_IN)
-	$(QUIET_LINK)$(CC) --shared $^ -o $@
+	$(QUIET_LINK)$(CC) --shared -Wl,--version-script=$(VERSION_SCRIPT) \
+		$^ -o $@

 $(OUTPUT)libbpf.a: $(BPF_IN)
 	$(QUIET_LINK)$(RM) $@; $(AR) rcs $@ $^
@ -184,6 +191,18 @@ $(OUTPUT)libbpf.a: $(BPF_IN)
 $(OUTPUT)test_libbpf: test_libbpf.cpp $(OUTPUT)libbpf.a
 	$(QUIET_LINK)$(CXX) $^ -lelf -o $@

+check: check_abi
+
+check_abi: $(OUTPUT)libbpf.so
+	@if [ "$(GLOBAL_SYM_COUNT)" != "$(VERSIONED_SYM_COUNT)" ]; then	 \
+		echo "Warning: Num of global symbols in $(BPF_IN)"	 \
+		     "($(GLOBAL_SYM_COUNT)) does NOT match with num of"	 \
+		     "versioned symbols in $^ ($(VERSIONED_SYM_COUNT))." \
+		     "Please make sure all LIBBPF_API symbols are"	 \
+		     "versioned in $(VERSION_SCRIPT)." >&2;		 \
+		exit 1;							 \
+	fi
+
 define do_install
 	if [ ! -d '$(DESTDIR_SQ)$2' ]; then		\
 		$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$2';	\
--- a/tools/lib/bpf/README.rst
+++ b/tools/lib/bpf/README.rst
@ -0,0 +1,139 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+libbpf API naming convention
+============================
+
+libbpf API provides access to a few logically separated groups of
+functions and types. Every group has its own naming convention
+described here. It's recommended to follow these conventions whenever a
+new function or type is added to keep libbpf API clean and consistent.
+
+All types and functions provided by libbpf API should have one of the
+following prefixes: ``bpf_``, ``btf_``, ``libbpf_``.
+
+System call wrappers
+--------------------
+
+System call wrappers are simple wrappers for commands supported by
+sys_bpf system call. These wrappers should go to ``bpf.h`` header file
+and map one-on-one to corresponding commands.
+
+For example ``bpf_map_lookup_elem`` wraps ``BPF_MAP_LOOKUP_ELEM``
+command of sys_bpf, ``bpf_prog_attach`` wraps ``BPF_PROG_ATTACH``, etc.
+
+Objects
+-------
+
+Another class of types and functions provided by libbpf API is "objects"
+and functions to work with them. Objects are high-level abstractions
+such as BPF program or BPF map. They're represented by corresponding
+structures such as ``struct bpf_object``, ``struct bpf_program``,
+``struct bpf_map``, etc.
+
+Structures are forward declared and access to their fields should be
+provided via corresponding getters and setters rather than directly.
+
+These objects are associated with corresponding parts of ELF object that
+contains compiled BPF programs.
+
+For example ``struct bpf_object`` represents ELF object itself created
+from an ELF file or from a buffer, ``struct bpf_program`` represents a
+program in ELF object and ``struct bpf_map`` is a map.
+
+Functions that work with an object have names built from object name,
+double underscore and part that describes function purpose.
+
+For example ``bpf_object__open`` consists of the name of corresponding
+object, ``bpf_object``, double underscore and ``open`` that defines the
+purpose of the function to open ELF file and create ``bpf_object`` from
+it.
+
+Another example: ``bpf_program__load`` is named for corresponding
+object, ``bpf_program``, that is separated from other part of the name
+by double underscore.
+
+All objects and corresponding functions other than BTF related should go
+to ``libbpf.h``. BTF types and functions should go to ``btf.h``.
+
+Auxiliary functions
+-------------------
+
+Auxiliary functions and types that don't fit well in any of categories
+described above should have ``libbpf_`` prefix, e.g.
+``libbpf_get_error`` or ``libbpf_prog_type_by_name``.
+
+libbpf ABI
+==========
+
+libbpf can be either linked statically or used as a DSO. To avoid possible
+conflicts with other libraries an application is linked with, all
+non-static libbpf symbols should have one of the prefixes mentioned in
+API documentation above. See API naming convention to choose the right
+name for a new symbol.
+
+Symbol visibility
+-----------------
+
+libbpf follows the model in which all global symbols have visibility "hidden"
+by default and to make a symbol visible it has to be explicitly
+attributed with ``LIBBPF_API`` macro. For example:
+
+.. code-block:: c
+
+        LIBBPF_API int bpf_prog_get_fd_by_id(__u32 id);
+
+This prevents accidentally exporting a symbol that is not supposed to be
+a part of the ABI, which, in turn, improves both the libbpf developer and
+user experience.
+
+ABI versioning
+---------------
+
+To make future ABI extensions possible libbpf ABI is versioned.
+Versioning is implemented by ``libbpf.map`` version script that is
+passed to linker.
+
+Version name is ``LIBBPF_`` prefix + three-component numeric version,
+starting from ``0.0.1``.
+
+Every time ABI is being changed, e.g. because a new symbol is added or
+semantic of existing symbol is changed, ABI version should be bumped.
+
+For example, if current state of ``libbpf.map`` is:
+
+.. code-block:: none
+
+        LIBBPF_0.0.1 {
+        	global:
+                        bpf_func_a;
+                        bpf_func_b;
+        	local:
+        		\*;
+        };
+
+, and a new symbol ``bpf_func_c`` is being introduced, then
+``libbpf.map`` should be changed like this:
+
+.. code-block:: none
+
+        LIBBPF_0.0.1 {
+        	global:
+                        bpf_func_a;
+                        bpf_func_b;
+        	local:
+        		\*;
+        };
+        LIBBPF_0.0.2 {
+                global:
+                        bpf_func_c;
+        } LIBBPF_0.0.1;
+
+, where new version ``LIBBPF_0.0.2`` depends on the previous
+``LIBBPF_0.0.1``.
+
+The format of the version script and ways to handle ABI changes, including
+incompatible ones, are described in detail in [1].
+
+Links
+=====
+
+[1] https://www.akkadia.org/drepper/dsohowto.pdf
+    (Chapter 3. Maintaining APIs and ABIs).
--- a/tools/lib/bpf/btf.c
+++ b/tools/lib/bpf/btf.c
@ -415,7 +415,7 @@ const char *btf__name_by_offset(const struct btf *btf, __u32 offset)
 		return NULL;
 }

-int btf_get_from_id(__u32 id, struct btf **btf)
+int btf__get_from_id(__u32 id, struct btf **btf)
 {
 	struct bpf_btf_info btf_info = { 0 };
 	__u32 len = sizeof(btf_info);
@ -466,7 +466,7 @@ int btf_get_from_id(__u32 id, struct btf **btf)
 		goto exit_free;
 	}

-	*btf = btf__new((__u8 *)btf_info.btf, btf_info.btf_size, NULL);
+	*btf = btf__new((__u8 *)(long)btf_info.btf, btf_info.btf_size, NULL);
 	if (IS_ERR(*btf)) {
 		err = PTR_ERR(*btf);
 		*btf = NULL;
--- a/tools/lib/bpf/btf.h
+++ b/tools/lib/bpf/btf.h
@ -73,7 +73,7 @@ LIBBPF_API __s64 btf__resolve_size(const struct btf *btf, __u32 type_id);
 LIBBPF_API int btf__resolve_type(const struct btf *btf, __u32 type_id);
 LIBBPF_API int btf__fd(const struct btf *btf);
 LIBBPF_API const char *btf__name_by_offset(const struct btf *btf, __u32 offset);
-LIBBPF_API int btf_get_from_id(__u32 id, struct btf **btf);
+LIBBPF_API int btf__get_from_id(__u32 id, struct btf **btf);

 struct btf_ext *btf_ext__new(__u8 *data, __u32 size, btf_print_fn_t err_log);
 void btf_ext__free(struct btf_ext *btf_ext);
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@ -9,7 +9,9 @@
 * Copyright (C) 2017 Nicira, Inc.
 */

+#ifndef _GNU_SOURCE
 #define _GNU_SOURCE
+#endif
 #include <stdlib.h>
 #include <stdio.h>
 #include <stdarg.h>
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@ -0,0 +1,121 @@
+LIBBPF_0.0.1 {
+	global:
+		bpf_btf_get_fd_by_id;
+		bpf_create_map;
+		bpf_create_map_in_map;
+		bpf_create_map_in_map_node;
+		bpf_create_map_name;
+		bpf_create_map_node;
+		bpf_create_map_xattr;
+		bpf_load_btf;
+		bpf_load_program;
+		bpf_load_program_xattr;
+		bpf_map__btf_key_type_id;
+		bpf_map__btf_value_type_id;
+		bpf_map__def;
+		bpf_map__fd;
+		bpf_map__is_offload_neutral;
+		bpf_map__name;
+		bpf_map__next;
+		bpf_map__pin;
+		bpf_map__prev;
+		bpf_map__priv;
+		bpf_map__reuse_fd;
+		bpf_map__set_ifindex;
+		bpf_map__set_inner_map_fd;
+		bpf_map__set_priv;
+		bpf_map__unpin;
+		bpf_map_delete_elem;
+		bpf_map_get_fd_by_id;
+		bpf_map_get_next_id;
+		bpf_map_get_next_key;
+		bpf_map_lookup_and_delete_elem;
+		bpf_map_lookup_elem;
+		bpf_map_update_elem;
+		bpf_obj_get;
+		bpf_obj_get_info_by_fd;
+		bpf_obj_pin;
+		bpf_object__btf_fd;
+		bpf_object__close;
+		bpf_object__find_map_by_name;
+		bpf_object__find_map_by_offset;
+		bpf_object__find_program_by_title;
+		bpf_object__kversion;
+		bpf_object__load;
+		bpf_object__name;
+		bpf_object__next;
+		bpf_object__open;
+		bpf_object__open_buffer;
+		bpf_object__open_xattr;
+		bpf_object__pin;
+		bpf_object__pin_maps;
+		bpf_object__pin_programs;
+		bpf_object__priv;
+		bpf_object__set_priv;
+		bpf_object__unload;
+		bpf_object__unpin_maps;
+		bpf_object__unpin_programs;
+		bpf_perf_event_read_simple;
+		bpf_prog_attach;
+		bpf_prog_detach;
+		bpf_prog_detach2;
+		bpf_prog_get_fd_by_id;
+		bpf_prog_get_next_id;
+		bpf_prog_load;
+		bpf_prog_load_xattr;
+		bpf_prog_query;
+		bpf_prog_test_run;
+		bpf_program__fd;
+		bpf_program__is_kprobe;
+		bpf_program__is_perf_event;
+		bpf_program__is_raw_tracepoint;
+		bpf_program__is_sched_act;
+		bpf_program__is_sched_cls;
+		bpf_program__is_socket_filter;
+		bpf_program__is_tracepoint;
+		bpf_program__is_xdp;
+		bpf_program__load;
+		bpf_program__next;
+		bpf_program__nth_fd;
+		bpf_program__pin;
+		bpf_program__pin_instance;
+		bpf_program__prev;
+		bpf_program__priv;
+		bpf_program__set_expected_attach_type;
+		bpf_program__set_ifindex;
+		bpf_program__set_kprobe;
+		bpf_program__set_perf_event;
+		bpf_program__set_prep;
+		bpf_program__set_priv;
+		bpf_program__set_raw_tracepoint;
+		bpf_program__set_sched_act;
+		bpf_program__set_sched_cls;
+		bpf_program__set_socket_filter;
+		bpf_program__set_tracepoint;
+		bpf_program__set_type;
+		bpf_program__set_xdp;
+		bpf_program__title;
+		bpf_program__unload;
+		bpf_program__unpin;
+		bpf_program__unpin_instance;
+		bpf_raw_tracepoint_open;
+		bpf_set_link_xdp_fd;
+		bpf_task_fd_query;
+		bpf_verify_program;
+		btf__fd;
+		btf__find_by_name;
+		btf__free;
+		btf__get_from_id;
+		btf__name_by_offset;
+		btf__new;
+		btf__resolve_size;
+		btf__resolve_type;
+		btf__type_by_id;
+		libbpf_attach_type_by_name;
+		libbpf_get_error;
+		libbpf_prog_type_by_name;
+		libbpf_set_print;
+		libbpf_strerror;
+	local:
+		*;
+};
--- a/tools/lib/bpf/libbpf_errno.c
+++ b/tools/lib/bpf/libbpf_errno.c
@ -7,6 +7,7 @@
 * Copyright (C) 2017 Nicira, Inc.
 */

+#undef _GNU_SOURCE
 #include <stdio.h>
 #include <string.h>

--- a/tools/testing/selftests/bpf/bpf_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_helpers.h
@ -113,6 +113,8 @@ static int (*bpf_msg_pull_data)(void *ctx, int start, int end, int flags) =
 	(void *) BPF_FUNC_msg_pull_data;
 static int (*bpf_msg_push_data)(void *ctx, int start, int end, int flags) =
 	(void *) BPF_FUNC_msg_push_data;
+static int (*bpf_msg_pop_data)(void *ctx, int start, int cut, int flags) =
+	(void *) BPF_FUNC_msg_pop_data;
 static int (*bpf_bind)(void *ctx, void *addr, int addr_len) =
 	(void *) BPF_FUNC_bind;
 static int (*bpf_xdp_adjust_tail)(void *ctx, int offset) =
--- a/tools/testing/selftests/bpf/config
+++ b/tools/testing/selftests/bpf/config
@ -22,3 +22,4 @@ CONFIG_NET_CLS_FLOWER=m
 CONFIG_LWTUNNEL=y
 CONFIG_BPF_STREAM_PARSER=y
 CONFIG_XDP_SOCKETS=y
+CONFIG_FTRACE_SYSCALLS=y
--- a/tools/testing/selftests/bpf/test_btf.c
+++ b/tools/testing/selftests/bpf/test_btf.c
@ -29,7 +29,6 @@
 static uint32_t pass_cnt;
 static uint32_t error_cnt;
 static uint32_t skip_cnt;
-static bool jit_enabled;

 #define CHECK(condition, format...) ({					\
 	int __ret = !!(condition);					\
@ -65,24 +64,6 @@ static int __base_pr(const char *format, ...)
 	return err;
 }

-static bool is_jit_enabled(void)
-{
-	const char *jit_sysctl = "/proc/sys/net/core/bpf_jit_enable";
-	bool enabled = false;
-	int sysctl_fd;
-
-	sysctl_fd = open(jit_sysctl, 0, O_RDONLY);
-	if (sysctl_fd != -1) {
-		char tmpc;
-
-		if (read(sysctl_fd, &tmpc, sizeof(tmpc)) == 1)
-			enabled = (tmpc != '0');
-		close(sysctl_fd);
-	}
-
-	return enabled;
-}
-
 #define BTF_INFO_ENC(kind, root, vlen)			\
 	((!!(root) << 31) | ((kind) << 24) | ((vlen) & BTF_MAX_VLEN))

@ -2547,8 +2528,8 @@ static int do_test_file(unsigned int test_num)
 		  test->btf_kv_notfound))
 		goto done;

-	if (!jit_enabled || !has_btf_ext)
-		goto skip_jit;
+	if (!has_btf_ext)
+		goto skip;

 	/* get necessary program info */
 	info_len = sizeof(struct bpf_prog_info);
@ -2604,7 +2585,7 @@ static int do_test_file(unsigned int test_num)
 		goto done;
 	}

-	err = btf_get_from_id(info.btf_id, &btf);
+	err = btf__get_from_id(info.btf_id, &btf);
 	if (CHECK(err, "cannot get btf from kernel, err: %d", err))
 		goto done;

@ -2636,7 +2617,7 @@ static int do_test_file(unsigned int test_num)
 		finfo = (void *)finfo + rec_size;
 	}

-skip_jit:
+skip:
 	fprintf(stderr, "OK");

 done:
@ -3270,12 +3251,6 @@ static int do_test_func_type(int test_num)
 		err = -1;
 		goto done;
 	}
-	if (!jit_enabled) {
-		skip_cnt++;
-		fprintf(stderr, "SKIPPED, please enable sysctl bpf_jit_enable\n");
-		err = 0;
-		goto done;
-	}

 	/* get necessary lens */
 	info_len = sizeof(struct bpf_prog_info);
@ -3452,8 +3427,6 @@ int main(int argc, char **argv)
 	if (args.always_log)
 		libbpf_set_print(__base_pr, __base_pr, __base_pr);

-	jit_enabled = is_jit_enabled();
-
 	if (args.raw_test)
 		err |= test_raw();

--- a/tools/testing/selftests/bpf/test_progs.c
+++ b/tools/testing/selftests/bpf/test_progs.c
@ -524,7 +524,7 @@ static void test_bpf_obj_id(void)
 			  load_time < now - 60 || load_time > now + 60 ||
 			  prog_infos[i].created_by_uid != my_uid ||
 			  prog_infos[i].nr_map_ids != 1 ||
-			  *(int *)prog_infos[i].map_ids != map_infos[i].id ||
+			  *(int *)(long)prog_infos[i].map_ids != map_infos[i].id ||
 			  strcmp((char *)prog_infos[i].name, expected_prog_name),
 			  "get-prog-info(fd)",
 			  "err %d errno %d i %d type %d(%d) info_len %u(%Zu) jit_enabled %d jited_prog_len %u xlated_prog_len %u jited_prog %d xlated_prog %d load_time %lu(%lu) uid %u(%u) nr_map_ids %u(%u) map_id %u(%u) name %s(%s)\n",
@ -539,7 +539,7 @@ static void test_bpf_obj_id(void)
 			  load_time, now,
 			  prog_infos[i].created_by_uid, my_uid,
 			  prog_infos[i].nr_map_ids, 1,
-			  *(int *)prog_infos[i].map_ids, map_infos[i].id,
+			  *(int *)(long)prog_infos[i].map_ids, map_infos[i].id,
 			  prog_infos[i].name, expected_prog_name))
 			goto done;
 	}
@ -585,7 +585,7 @@ static void test_bpf_obj_id(void)
 		bzero(&prog_info, sizeof(prog_info));
 		info_len = sizeof(prog_info);

-		saved_map_id = *(int *)(prog_infos[i].map_ids);
+		saved_map_id = *(int *)((long)prog_infos[i].map_ids);
 		prog_info.map_ids = prog_infos[i].map_ids;
 		prog_info.nr_map_ids = 2;
 		err = bpf_obj_get_info_by_fd(prog_fd, &prog_info, &info_len);
@ -593,12 +593,12 @@ static void test_bpf_obj_id(void)
 		prog_infos[i].xlated_prog_insns = 0;
 		CHECK(err || info_len != sizeof(struct bpf_prog_info) ||
 		      memcmp(&prog_info, &prog_infos[i], info_len) ||
-		      *(int *)prog_info.map_ids != saved_map_id,
+		      *(int *)(long)prog_info.map_ids != saved_map_id,
 		      "get-prog-info(next_id->fd)",
 		      "err %d errno %d info_len %u(%Zu) memcmp %d map_id %u(%u)\n",
 		      err, errno, info_len, sizeof(struct bpf_prog_info),
 		      memcmp(&prog_info, &prog_infos[i], info_len),
-		      *(int *)prog_info.map_ids, saved_map_id);
+		      *(int *)(long)prog_info.map_ids, saved_map_id);
 		close(prog_fd);
 	}
 	CHECK(nr_id_found != nr_iters,
--- a/tools/testing/selftests/bpf/test_sockmap.c
+++ b/tools/testing/selftests/bpf/test_sockmap.c
@ -79,6 +79,8 @@ int txmsg_start;
 int txmsg_end;
 int txmsg_start_push;
 int txmsg_end_push;
+int txmsg_start_pop;
+int txmsg_pop;
 int txmsg_ingress;
 int txmsg_skb;
 int ktls;
@ -104,6 +106,8 @@ static const struct option long_options[] = {
 	{"txmsg_end",	required_argument,	NULL, 'e'},
 	{"txmsg_start_push", required_argument,	NULL, 'p'},
 	{"txmsg_end_push",   required_argument,	NULL, 'q'},
+	{"txmsg_start_pop",  required_argument,	NULL, 'w'},
+	{"txmsg_pop",	     required_argument,	NULL, 'x'},
 	{"txmsg_ingress", no_argument,		&txmsg_ingress, 1 },
 	{"txmsg_skb", no_argument,		&txmsg_skb, 1 },
 	{"ktls", no_argument,			&ktls, 1 },
@ -473,13 +477,27 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt,
 		clock_gettime(CLOCK_MONOTONIC, &s->end);
 	} else {
 		int slct, recvp = 0, recv, max_fd = fd;
+		float total_bytes, txmsg_pop_total;
 		int fd_flags = O_NONBLOCK;
 		struct timeval timeout;
-		float total_bytes;
 		fd_set w;

 		fcntl(fd, fd_flags);
+		/* Account for popped bytes. The msg_pop_data helper runs on
+		 * every apply iteration, so the pop size must be multiplied
+		 * by the number of apply iterations. Note that a user of
+		 * this tool can create cases where no data is sent by
+		 * manipulating pop/push/pull/etc. For example, txmsg_apply 1
+		 * with txmsg_pop 1 will try to apply 1B at a time, but each
+		 * iteration will then pop 1B, so no data will ever be sent.
+		 * This is really only useful for testing edge cases in code
+		 * paths.
+		 */
 		total_bytes = (float)iov_count * (float)iov_length * (float)cnt;
+		txmsg_pop_total = txmsg_pop;
+		if (txmsg_apply)
+			txmsg_pop_total *= (total_bytes / txmsg_apply);
+		total_bytes -= txmsg_pop_total;
 		err = clock_gettime(CLOCK_MONOTONIC, &s->start);
 		if (err < 0)
 			perror("recv start time: ");
@ -488,7 +506,7 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt,
 				timeout.tv_sec = 0;
 				timeout.tv_usec = 300000;
 			} else {
-				timeout.tv_sec = 1;
+				timeout.tv_sec = 3;
 				timeout.tv_usec = 0;
 			}

@ -503,7 +521,7 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt,
 				goto out_errno;
 			} else if (!slct) {
 				if (opt->verbose)
-					fprintf(stderr, "unexpected timeout\n");
+					fprintf(stderr, "unexpected timeout: recved %zu/%f pop_total %f\n", s->bytes_recvd, total_bytes, txmsg_pop_total);
 				errno = -EIO;
 				clock_gettime(CLOCK_MONOTONIC, &s->end);
 				goto out_errno;
@ -619,7 +637,7 @@ static int sendmsg_test(struct sockmap_options *opt)
 			iov_count = 1;
 		err = msg_loop(rx_fd, iov_count, iov_buf,
 			       cnt, &s, false, opt);
-		if (err && opt->verbose)
+		if (opt->verbose)
 			fprintf(stderr,
 				"msg_loop_rx: iov_count %i iov_buf %i cnt %i err %i\n",
 				iov_count, iov_buf, cnt, err);
@ -931,6 +949,39 @@ run:
 			}
 		}

+		if (txmsg_start_pop) {
+			i = 4;
+			err = bpf_map_update_elem(map_fd[5],
+						  &i, &txmsg_start_pop, BPF_ANY);
+			if (err) {
+				fprintf(stderr,
+					"ERROR: bpf_map_update_elem %i@%i (txmsg_start_pop):  %d (%s)\n",
+					txmsg_start_pop, i, err, strerror(errno));
+				goto out;
+			}
+		} else {
+			i = 4;
+			bpf_map_update_elem(map_fd[5],
+						  &i, &txmsg_start_pop, BPF_ANY);
+		}
+
+		if (txmsg_pop) {
+			i = 5;
+			err = bpf_map_update_elem(map_fd[5],
+						  &i, &txmsg_pop, BPF_ANY);
+			if (err) {
+				fprintf(stderr,
+					"ERROR: bpf_map_update_elem %i@%i (txmsg_pop):  %d (%s)\n",
+					txmsg_pop, i, err, strerror(errno));
+				goto out;
+			}
+		} else {
+			i = 5;
+			bpf_map_update_elem(map_fd[5],
+					    &i, &txmsg_pop, BPF_ANY);
+
+		}
+
 		if (txmsg_ingress) {
 			int in = BPF_F_INGRESS;

@ -1082,6 +1133,11 @@ static void test_options(char *options)
 		snprintf(tstr, OPTSTRING, "end %d,", txmsg_end);
 		strncat(options, tstr, OPTSTRING);
 	}
+	if (txmsg_start_pop) {
+		snprintf(tstr, OPTSTRING, "pop (%d,%d),",
+			 txmsg_start_pop, txmsg_start_pop + txmsg_pop);
+		strncat(options, tstr, OPTSTRING);
+	}
 	if (txmsg_ingress)
 		strncat(options, "ingress,", OPTSTRING);
 	if (txmsg_skb)
@ -1264,6 +1320,7 @@ static int test_mixed(int cgrp)
 	txmsg_apply = txmsg_cork = 0;
 	txmsg_start = txmsg_end = 0;
 	txmsg_start_push = txmsg_end_push = 0;
+	txmsg_start_pop = txmsg_pop = 0;

 	/* Test small and large iov_count values with pass/redir/apply/cork */
 	txmsg_pass = 1;
@ -1383,6 +1440,19 @@ static int test_start_end(int cgrp)
 	txmsg_end = 2;
 	txmsg_start_push = 1;
 	txmsg_end_push = 2;
+	txmsg_start_pop = 1;
+	txmsg_pop = 1;
+	err = test_txmsg(cgrp);
+	if (err)
+		goto out;
+
+	/* Cut a byte of pushed data but leave remaining in place */
+	txmsg_start = 1;
+	txmsg_end = 2;
+	txmsg_start_push = 1;
+	txmsg_end_push = 3;
+	txmsg_start_pop = 1;
+	txmsg_pop = 1;
 	err = test_txmsg(cgrp);
 	if (err)
 		goto out;
@ -1393,6 +1463,9 @@ static int test_start_end(int cgrp)
 	opt.iov_length = 100;
 	txmsg_cork = 1600;

+	txmsg_start_pop = 0;
+	txmsg_pop = 0;
+
 	for (i = 99; i <= 1600; i += 500) {
 		txmsg_start = 0;
 		txmsg_end = i;
@ -1403,6 +1476,17 @@ static int test_start_end(int cgrp)
 			goto out;
 	}

+	/* Test pop data in middle of cork */
+	for (i = 99; i <= 1600; i += 500) {
+		txmsg_start_pop = 10;
+		txmsg_pop = i;
+		err = test_exec(cgrp, &opt);
+		if (err)
+			goto out;
+	}
+	txmsg_start_pop = 0;
+	txmsg_pop = 0;
+
 	/* Test start/end with cork but pull data in middle */
 	for (i = 199; i <= 1600; i += 500) {
 		txmsg_start = 100;
@ -1423,6 +1507,15 @@ static int test_start_end(int cgrp)
 	if (err)
 		goto out;

+	/* Test pop with cork pulling last sg entry */
+	txmsg_start_pop = 1500;
+	txmsg_pop = 1600;
+	err = test_exec(cgrp, &opt);
+	if (err)
+		goto out;
+	txmsg_start_pop = 0;
+	txmsg_pop = 0;
+
 	/* Test start/end pull of single byte in last page */
 	txmsg_start = 1111;
 	txmsg_end = 1112;
@ -1432,6 +1525,13 @@ static int test_start_end(int cgrp)
 	if (err)
 		goto out;

+	/* Test pop of single byte in last page */
+	txmsg_start_pop = 1111;
+	txmsg_pop = 1112;
+	err = test_exec(cgrp, &opt);
+	if (err)
+		goto out;
+
 	/* Test start/end with end < start */
 	txmsg_start = 1111;
 	txmsg_end = 0;
@ -1456,7 +1556,20 @@ static int test_start_end(int cgrp)
 	txmsg_start_push = 1601;
 	txmsg_end_push = 1600;
 	err = test_exec(cgrp, &opt);
+	if (err)
+		goto out;

+	/* Test pop with start > data */
+	txmsg_start_pop = 1601;
+	txmsg_pop = 1;
+	err = test_exec(cgrp, &opt);
+	if (err)
+		goto out;
+
+	/* Test pop with pop range > data */
+	txmsg_start_pop = 1599;
+	txmsg_pop = 10;
+	err = test_exec(cgrp, &opt);
 out:
 	txmsg_start = 0;
 	txmsg_end = 0;
@ -1641,6 +1754,12 @@ int main(int argc, char **argv)
 		case 'q':
 			txmsg_end_push = atoi(optarg);
 			break;
+		case 'w':
+			txmsg_start_pop = atoi(optarg);
+			break;
+		case 'x':
+			txmsg_pop = atoi(optarg);
+			break;
 		case 'a':
 			txmsg_apply = atoi(optarg);
 			break;
--- a/tools/testing/selftests/bpf/test_sockmap_kern.h
+++ b/tools/testing/selftests/bpf/test_sockmap_kern.h
@ -74,7 +74,7 @@ struct bpf_map_def SEC("maps") sock_bytes = {
 	.type = BPF_MAP_TYPE_ARRAY,
 	.key_size = sizeof(int),
 	.value_size = sizeof(int),
-	.max_entries = 4
+	.max_entries = 6
 };

 struct bpf_map_def SEC("maps") sock_redir_flags = {
@ -181,8 +181,8 @@ int bpf_sockmap(struct bpf_sock_ops *skops)
 SEC("sk_msg1")
 int bpf_prog4(struct sk_msg_md *msg)
 {
-	int *bytes, zero = 0, one = 1, two = 2, three = 3;
-	int *start, *end, *start_push, *end_push;
+	int *bytes, zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5;
+	int *start, *end, *start_push, *end_push, *start_pop, *pop;

 	bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
 	if (bytes)
@ -198,15 +198,19 @@ int bpf_prog4(struct sk_msg_md *msg)
 	end_push = bpf_map_lookup_elem(&sock_bytes, &three);
 	if (start_push && end_push)
 		bpf_msg_push_data(msg, *start_push, *end_push, 0);
+	start_pop = bpf_map_lookup_elem(&sock_bytes, &four);
+	pop = bpf_map_lookup_elem(&sock_bytes, &five);
+	if (start_pop && pop)
+		bpf_msg_pop_data(msg, *start_pop, *pop, 0);
 	return SK_PASS;
 }

 SEC("sk_msg2")
 int bpf_prog5(struct sk_msg_md *msg)
 {
-	int zero = 0, one = 1, two = 2, three = 3;
-	int *start, *end, *start_push, *end_push;
-	int *bytes, len1, len2 = 0, len3;
+	int zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5;
+	int *start, *end, *start_push, *end_push, *start_pop, *pop;
+	int *bytes, len1, len2 = 0, len3, len4;
 	int err1 = -1, err2 = -1;

 	bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
@ -247,6 +251,20 @@ int bpf_prog5(struct sk_msg_md *msg)
 		bpf_printk("sk_msg2: length push_update %i->%i\n",
 			   len2 ? len2 : len1, len3);
 	}
+	start_pop = bpf_map_lookup_elem(&sock_bytes, &four);
+	pop = bpf_map_lookup_elem(&sock_bytes, &five);
+	if (start_pop && pop) {
+		int err;
+
+		bpf_printk("sk_msg2: pop(%i@%i)\n",
+			   *start_pop, *pop); /* values, not pointers */
+		err = bpf_msg_pop_data(msg, *start_pop, *pop, 0);
+		if (err)
+			bpf_printk("sk_msg2: pop_data err %i\n", err);
+		len4 = (__u64)msg->data_end - (__u64)msg->data;
+		bpf_printk("sk_msg2: length pop_data %i->%i\n",
+			   len1 ? len1 : 0,  len4);
+	}

 	bpf_printk("sk_msg2: data length %i err1 %i err2 %i\n",
 		   len1, err1, err2);
@ -256,8 +274,8 @@ int bpf_prog5(struct sk_msg_md *msg)
 SEC("sk_msg3")
 int bpf_prog6(struct sk_msg_md *msg)
 {
-	int *bytes, *start, *end, *start_push, *end_push, *f;
-	int zero = 0, one = 1, two = 2, three = 3, key = 0;
+	int zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5, key = 0;
+	int *bytes, *start, *end, *start_push, *end_push, *start_pop, *pop, *f;
 	__u64 flags = 0;

 	bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
@ -277,6 +295,11 @@ int bpf_prog6(struct sk_msg_md *msg)
 	if (start_push && end_push)
 		bpf_msg_push_data(msg, *start_push, *end_push, 0);

+	start_pop = bpf_map_lookup_elem(&sock_bytes, &four);
+	pop = bpf_map_lookup_elem(&sock_bytes, &five);
+	if (start_pop && pop)
+		bpf_msg_pop_data(msg, *start_pop, *pop, 0);
+
 	f = bpf_map_lookup_elem(&sock_redir_flags, &zero);
 	if (f && *f) {
 		key = 2;
@ -292,8 +315,9 @@ int bpf_prog6(struct sk_msg_md *msg)
 SEC("sk_msg4")
 int bpf_prog7(struct sk_msg_md *msg)
 {
-	int zero = 0, one = 1, two = 2, three = 3, len1, len2 = 0, len3;
-	int *bytes, *start, *end, *start_push, *end_push, *f;
+	int *bytes, *start, *end, *start_push, *end_push, *start_pop, *pop, *f;
+	int zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5;
+	int len1, len2 = 0, len3, len4;
 	int err1 = 0, err2 = 0, key = 0;
 	__u64 flags = 0;

@ -335,6 +359,22 @@ int bpf_prog7(struct sk_msg_md *msg)
 			   len2 ? len2 : len1, len3);
 	}

+	start_pop = bpf_map_lookup_elem(&sock_bytes, &four);
+	pop = bpf_map_lookup_elem(&sock_bytes, &five);
+	if (start_pop && pop) {
+		int err;
+
+		bpf_printk("sk_msg4: pop(%i@%i)\n",
+			   *start_pop, *pop); /* values, not pointers */
+		err = bpf_msg_pop_data(msg, *start_pop, *pop, 0);
+		if (err)
+			bpf_printk("sk_msg4: pop_data err %i\n", err);
+		len4 = (__u64)msg->data_end - (__u64)msg->data;
+		bpf_printk("sk_msg4: length pop_data %i->%i\n",
+			   len1 ? len1 : 0,  len4);
+	}
+
+
 	f = bpf_map_lookup_elem(&sock_redir_flags, &zero);
 	if (f && *f) {
 		key = 2;
@ -389,8 +429,8 @@ int bpf_prog9(struct sk_msg_md *msg)
 SEC("sk_msg7")
 int bpf_prog10(struct sk_msg_md *msg)
 {
-	int *bytes, *start, *end, *start_push, *end_push;
-	int zero = 0, one = 1, two = 2, three = 3;
+	int *bytes, *start, *end, *start_push, *end_push, *start_pop, *pop;
+	int zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5;

 	bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
 	if (bytes)
@ -406,7 +446,11 @@ int bpf_prog10(struct sk_msg_md *msg)
 	end_push = bpf_map_lookup_elem(&sock_bytes, &three);
 	if (start_push && end_push)
 		bpf_msg_push_data(msg, *start_push, *end_push, 0);
-
+	start_pop = bpf_map_lookup_elem(&sock_bytes, &four);
+	pop = bpf_map_lookup_elem(&sock_bytes, &five);
+	if (start_pop && pop)
+		bpf_msg_pop_data(msg, *start_pop, *pop, 0);
+	bpf_printk("return sk drop\n");
 	return SK_DROP;
 }