diff --git a/include/string.h b/include/string.h
index f73bbf9..fc3172d 100644
--- a/include/string.h
+++ b/include/string.h
@@ -143,4 +143,13 @@ extern char *__strncat_chk (char *__restrict __dest,
 			    const char *__restrict __src,
 			    size_t __len, size_t __destlen) __THROW;
 
+extern void *__memcpy_overlap_x86_64(void *dst, void const *src, size_t len,
+	void *rv, int *p_lim);
+libc_hidden_proto(__memcpy_overlap_x86_64)
+
+extern void * __attribute__((regparm(2))) __memcpy_overlap_i386(
+	void *rv, int *p_lim,
+	void *dst, void const *src, size_t len);
+libc_hidden_proto(__memcpy_overlap_i386)
+
 #endif
diff --git a/stdlib/getenv.c b/stdlib/getenv.c
index 6cdfe2b..af8a0ac 100644
--- a/stdlib/getenv.c
+++ b/stdlib/getenv.c
@@ -24,6 +24,10 @@
 #include <unistd.h>
 
 
+#if defined NOT_IN_libc && defined IS_IN_rtld  /*{*/
+rtld_hidden_proto(strncmp)
+#endif  /*}*/
+
 /* Return the value of the environment variable NAME.  This implementation
    is tuned a bit in that it assumes no environment variable has an empty
    name which of course should always be true.  We have a special case for
diff --git a/string/Makefile b/string/Makefile
index f836f59..172d9ef 100644
--- a/string/Makefile
+++ b/string/Makefile
@@ -32,6 +32,7 @@ routines	:= strcat strchr strcmp strcoll strcpy strcspn		\
 		   strrchr strpbrk strsignal strspn strstr strtok	\
 		   strtok_r strxfrm memchr memcmp memmove memset	\
 		   mempcpy bcopy bzero ffs ffsll stpcpy stpncpy		\
+		   memcpy-overlap					\
 		   strcasecmp strncase strcasecmp_l strncase_l		\
 		   memccpy memcpy wordcopy strsep strcasestr		\
 		   swab strfry memfrob memmem rawmemchr strchrnul	\
diff --git a/string/memcpy-overlap.c b/string/memcpy-overlap.c
index e69de29..4e1549c 100644
--- a/string/memcpy-overlap.c
+++ b/string/memcpy-overlap.c
@@ -0,0 +1,106 @@
+/* Handle overlap in memcpy/mempcpy according to MEMCPY_CHECK_ environment
+   variable.  Then use memmove instead.
+
+   Copyright 2010 John Reiser <jreiser@BitWagon.com>
+
+   This file is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   This file is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <string.h>
+#include <stdint.h>
+#include <stdio.h>
+#if 0  /*{*/
+#  include <stdlib.h>
+#endif  /*}*/
+#include <unistd.h>
+
+static char *
+pre_str(char const *const msg, char *cp)
+{
+	char const *p = strlen(msg) + msg;
+	while (msg < p) *--cp = *--p;
+	return cp;
+}
+
+static char *
+pre_hex(uintptr_t x, char *cp)
+{
+	do {
+		*--cp = "0123456789abcdef"[0xf & x];
+	} while (x>>=4);
+	cp-=2; cp[0]='0'; cp[1]='x';
+	return cp;
+}
+
+#if 0  /*{*/
+extern char **__libc_argv attribute_hidden;
+#endif  /*}*/
+
+/* Explicit result 'rv' enables use for both memcpy and mempcpy. */
+static void * __attribute__((regparm(3)))
+memlap_diagnose(void *const rv, int *const p_lim, void const *const pc,
+	void *const dst, void const *const src, size_t const len)
+{
+	int action = (*p_lim -= (1<<1));  /* FIXME: multiprocessor race */
+	if ((1<<0) & action) {  /* Print diagnostic on stderr (fd2). */
+		/* 2*: hex digits per byte
+		 * 2+: "0x"
+		 * 2+: punctuation
+		 * 4*: once per value converted
+		 * 500: leading text plus strlen(argv[0])
+		 */
+		char buf[500 + 4*(2+ 2+ 2*sizeof(uintptr_t))];
+		char *cp = &buf[sizeof(buf)]; *--cp = '\0';
+		cp = pre_str(") ***\n", cp);
+		                             cp = pre_hex((uintptr_t)len, cp);
+		cp-=2; cp[0]=','; cp[1]=' '; cp = pre_hex((uintptr_t)src, cp);
+		cp-=2; cp[0]=','; cp[1]=' '; cp = pre_hex((uintptr_t)dst, cp);
+		cp-=2; cp[0]=' '; cp[1]='('; cp = pre_hex((uintptr_t)pc , cp);
+#if 0  /*{*/
+		__libc_message (action & (1<<1),
+			"\n*** memcpy overlap %s @%s\n",
+			(__libc_argv[0] ?: "<unknown>"), cp);
+#else  /*}{*/
+		cp = pre_str("\n*** memcpy overlap @", cp);
+		write(2, cp, &buf[sizeof(buf)] - cp);
+#endif  /*}*/
+	}
+	if (action < (1<<1))  /* partially compensates for race above */
+#if 0  /*{*/
+		abort();
+#else  /*}{*/
+		__asm__("hlt");  /* FIXME: ix86 and x86_64 only */
+#endif  /*}*/
+	memmove(dst, src, len);
+	return rv;
+}
+
+void *__memcpy_overlap_x86_64(void *dst, void const *src, size_t len,
+	void *rv, int *p_lim)
+{
+	return memlap_diagnose(rv, p_lim,  __builtin_return_address(0),
+		dst, src, len);
+}
+libc_hidden_def(__memcpy_overlap_x86_64)
+
+void * __attribute__((regparm(2))) __memcpy_overlap_i386(
+	void *rv, int *p_lim,
+	void *dst, void const *src, size_t len)
+{
+	return memlap_diagnose(rv, p_lim, __builtin_return_address(0),
+		dst, src, len);
+}
+libc_hidden_def(__memcpy_overlap_i386)
+
diff --git a/sysdeps/i386/dl-irel.h b/sysdeps/i386/dl-irel.h
index 30385a1..d98acb0 100644
--- a/sysdeps/i386/dl-irel.h
+++ b/sysdeps/i386/dl-irel.h
@@ -22,10 +22,13 @@
 #define _DL_IREL_H
 
 #include <stdio.h>
+#include <stdlib.h>
 #include <unistd.h>
 
 #define ELF_MACHINE_IREL	1
 
+typedef char *(*getenv_t)(char const *);
+
 static inline void
 __attribute ((always_inline))
 elf_irel (const Elf32_Rel *reloc)
@@ -35,7 +38,7 @@ elf_irel (const Elf32_Rel *reloc)
 
   if (__builtin_expect (r_type == R_386_IRELATIVE, 1))
     {
-      Elf32_Addr value = ((Elf32_Addr (*) (void)) (*reloc_addr)) ();
+      Elf32_Addr value = ((Elf32_Addr (*) (getenv_t)) (*reloc_addr)) (getenv);
       *reloc_addr = value;
     }
   else
diff --git a/sysdeps/i386/dl-machine.h b/sysdeps/i386/dl-machine.h
index a093d2b..9314095 100644
--- a/sysdeps/i386/dl-machine.h
+++ b/sysdeps/i386/dl-machine.h
@@ -22,6 +22,7 @@
 
 #define ELF_MACHINE_NAME "i386"
 
+#include <stdlib.h>
 #include <sys/param.h>
 #include <sysdep.h>
 #include <tls.h>
@@ -304,6 +305,8 @@ elf_machine_plt_value (struct link_map *map, const Elf32_Rel *reloc,
 
 #ifdef RESOLVE_MAP
 
+typedef char *(*getenv_t)(char const *);
+
 /* Perform the relocation specified by RELOC and SYM (which is fully resolved).
    MAP is the object containing the reloc.  */
 
@@ -474,7 +477,7 @@ elf_machine_rel (struct link_map *map, const Elf32_Rel *reloc,
 	  break;
 	case R_386_IRELATIVE:
 	  value = map->l_addr + *reloc_addr;
-	  value = ((Elf32_Addr (*) (void)) value) ();
+	  value = ((Elf32_Addr (*) (getenv_t)) value) (getenv);
 	  *reloc_addr = value;
 	  break;
 	default:
@@ -618,7 +621,7 @@ elf_machine_rela (struct link_map *map, const Elf32_Rela *reloc,
 #  endif /* !RESOLVE_CONFLICT_FIND_MAP */
 	case R_386_IRELATIVE:
 	  value = map->l_addr + reloc->r_addend;
-	  value = ((Elf32_Addr (*) (void)) value) ();
+	  value = ((Elf32_Addr (*) (getenv_t)) value) (getenv);
 	  *reloc_addr = value;
 	  break;
 	default:
@@ -718,7 +721,7 @@ elf_machine_lazy_rel (struct link_map *map,
   else if (__builtin_expect (r_type == R_386_IRELATIVE, 0))
     {
       Elf32_Addr value = map->l_addr + *reloc_addr;
-      value = ((Elf32_Addr (*) (void)) value) ();
+      value = ((Elf32_Addr (*) (getenv_t)) value) (getenv);
       *reloc_addr = value;
     }
   else
@@ -747,7 +750,7 @@ elf_machine_lazy_rela (struct link_map *map,
   else if (__builtin_expect (r_type == R_386_IRELATIVE, 0))
     {
       Elf32_Addr value = map->l_addr + reloc->r_addend;
-      value = ((Elf32_Addr (*) (void)) value) ();
+      value = ((Elf32_Addr (*) (getenv_t)) value) (getenv);
       *reloc_addr = value;
     }
   else
diff --git a/sysdeps/i386/i586/Makefile b/sysdeps/i386/i586/Makefile
index e69de29..4e4d28d 100644
--- a/sysdeps/i386/i586/Makefile
+++ b/sysdeps/i386/i586/Makefile
@@ -0,0 +1,5 @@
+ifeq ($(subdir),csu)
+aux += init-arch
+gen-as-const-headers += ifunc-defines.sym
+endif
+
diff --git a/sysdeps/i386/i586/ifunc-defines.sym b/sysdeps/i386/i586/ifunc-defines.sym
index e69de29..d164208 100644
--- a/sysdeps/i386/i586/ifunc-defines.sym
+++ b/sysdeps/i386/i586/ifunc-defines.sym
@@ -0,0 +1,21 @@
+#include "init-arch.h"
+#include <stddef.h>
+
+--
+
+CPU_FEATURES_SIZE	sizeof (struct cpu_features)
+KIND_OFFSET		offsetof (struct cpu_features, kind)
+CPUID_OFFSET		offsetof (struct cpu_features, cpuid)
+CPUID_SIZE		sizeof (struct cpuid_registers)
+CPUID_EAX_OFFSET	offsetof (struct cpuid_registers, eax)
+CPUID_EBX_OFFSET	offsetof (struct cpuid_registers, ebx)
+CPUID_ECX_OFFSET	offsetof (struct cpuid_registers, ecx)
+CPUID_EDX_OFFSET	offsetof (struct cpuid_registers, edx)
+FAMILY_OFFSET		offsetof (struct cpu_features, family)
+MODEL_OFFSET		offsetof (struct cpu_features, model)
+FEATURE_OFFSET		offsetof (struct cpu_features, feature)
+FEATURE_SIZE		sizeof (unsigned int)
+
+COMMON_CPUID_INDEX_1
+FEATURE_INDEX_1
+FEATURE_MEMCPY_LIMIT_OVERLAP
diff --git a/sysdeps/i386/i586/init-arch.c b/sysdeps/i386/i586/init-arch.c
index e69de29..f0d7f19 100644
--- a/sysdeps/i386/i586/init-arch.c
+++ b/sysdeps/i386/i586/init-arch.c
@@ -0,0 +1 @@
+#include <sysdeps/i386/i686/multiarch/init-arch.c>
diff --git a/sysdeps/i386/i586/init-arch.h b/sysdeps/i386/i586/init-arch.h
index e69de29..c111b53 100644
--- a/sysdeps/i386/i586/init-arch.h
+++ b/sysdeps/i386/i586/init-arch.h
@@ -0,0 +1 @@
+#include <sysdeps/i386/i686/multiarch/init-arch.h>
diff --git a/sysdeps/i386/i586/memcpy.S b/sysdeps/i386/i586/memcpy.S
index 677a7e6..37a84c2 100644
--- a/sysdeps/i386/i586/memcpy.S
+++ b/sysdeps/i386/i586/memcpy.S
@@ -23,18 +23,54 @@
 #include "bp-sym.h"
 #include "bp-asm.h"
 
+#ifndef ENTRY_LAP
+# define ENTRY_LAP(sym) ENTRY(sym)
+# define END_LAP(sym)   END(sym)
+#endif
+
 /* BEWARE: `#ifdef memcpy' means that memcpy is redefined as `mempcpy',
    and the return value is the byte after the last one copied in
    the destination. */
 #define MEMPCPY_P (defined memcpy)
 
-#define PARMS	LINKAGE+8	/* space for 2 saved regs */
+#define PARMS	LINKAGE+0	/* no saved regs yet */
 #define RTN	PARMS
 #define DEST	RTN+RTN_SIZE
 #define SRC	DEST+PTR_SIZE
 #define LEN	SRC+PTR_SIZE
 
         .text
+#ifndef USE_AS_MEMMOVE  /*{*/
+#include <init-arch.h>
+ENTRY_LAP (memcpy_lap)
+	ENTER
+	movl DEST(%esp),%edx; movl %edx,%ecx
+	movl  SRC(%esp),%eax; subl %eax,%edx
+	subl %ecx,%eax; movl  LEN(%esp),%ecx
+	cmpl %ecx,%edx; movl DEST(%esp),%edx; jb L(badlap)  # dst in src[0 for len)
+	cmpl %ecx,%eax; movl  SRC(%esp),%eax; jb L(badlap)  # src in dst[0 for len)
+	jmp L(1)
+L(badlap):
+	movl DEST(%esp),%eax  # arg1 = result
+#if MEMPCPY_P  /*{*/
+	addl %ecx,%eax
+#endif  /*}*/
+#if defined(SHARED)  /*{*/
+	call 0f; 0: popl %edx; addl $_GLOBAL_OFFSET_TABLE_+[.-0b], %edx  # %edx = GOT base
+	lea  FEATURE_OFFSET+index_memcpy_limit_overlap+__cpu_features@GOTOFF(%edx),%edx  # arg2 = &limit
+#else  /*}{*/
+	lea  FEATURE_OFFSET+index_memcpy_limit_overlap+__cpu_features,             %edx  # arg2 = &limit
+#endif  /*}*/
+	jmp HIDDEN_JUMPTARGET(__memcpy_overlap_i386)
+END_LAP (memcpy_lap)
+#endif  /*}*/
+
+#define PARMS	LINKAGE+8	/* space for 2 saved regs */
+#define RTN	PARMS
+#define DEST	RTN+RTN_SIZE
+#define SRC	DEST+PTR_SIZE
+#define LEN	SRC+PTR_SIZE
+
 #if defined PIC && !defined NOT_IN_libc
 ENTRY (__memcpy_chk)
 	movl	12(%esp), %eax
@@ -44,7 +80,7 @@ END (__memcpy_chk)
 #endif
 ENTRY (BP_SYM (memcpy))
 	ENTER
-
+L(1):
 	pushl	%edi
 	cfi_adjust_cfa_offset (4)
 	pushl	%esi
diff --git a/sysdeps/i386/i586/mempcpy.S b/sysdeps/i386/i586/mempcpy.S
index f492be7..1fb8f9d 100644
--- a/sysdeps/i386/i586/mempcpy.S
+++ b/sysdeps/i386/i586/mempcpy.S
@@ -1,5 +1,6 @@
 #define memcpy __mempcpy
 #define __memcpy_chk __mempcpy_chk
+#define memcpy_lap __mempcpy_lap
 #include <sysdeps/i386/i586/memcpy.S>
 
 libc_hidden_def (BP_SYM (__mempcpy))
diff --git a/sysdeps/i386/i686/memcpy.S b/sysdeps/i386/i686/memcpy.S
index 86ee082..85625f6 100644
--- a/sysdeps/i386/i686/memcpy.S
+++ b/sysdeps/i386/i686/memcpy.S
@@ -24,28 +24,81 @@
 #include "bp-sym.h"
 #include "bp-asm.h"
 
+#ifndef MEMCPY  /*{*/
+#define   MEMCPY       memcpy
+#define __MEMCPY_CHK __memcpy_chk
+#define   MEMCPY_LAP   memcpy_lap
+#endif  /*}*/
+
+#ifndef NOT_IN_libc  /*{ IS_IN_libc */
+#  ifndef ENTRY_LAP
+#    define ENTRY_LAP(sym) ENTRY(sym)
+#    define END_LAP(sym)   END(sym)
+#  endif
+#endif  /*}*/
+
 #define PARMS	LINKAGE		/* no space for saved regs */
 #define RTN	PARMS
 #define DEST	RTN+RTN_SIZE
 #define SRC	DEST+PTR_SIZE
 #define LEN	SRC+PTR_SIZE
 
+#ifndef NOT_IN_libc  /*{ IS_IN_libc */
+	.section	.gnu.linkonce.t.__i686.get_pc_thunk.dx,"ax",@progbits
+	.globl	__i686.get_pc_thunk.dx
+	.hidden	__i686.get_pc_thunk.dx
+	ALIGN (4)
+	.type	__i686.get_pc_thunk.dx,@function
+__i686.get_pc_thunk.dx:
+	movl	(%esp), %edx
+	ret
+
+#include <init-arch.h>
+	.text
+ENTRY_LAP (MEMCPY_LAP)
+	ENTER
+	movl %edi,%eax; movl DEST(%esp),%edi; movl %edi,%ecx
+	movl %esi,%edx; movl  SRC(%esp),%esi; subl %esi,%edi
+	subl %ecx,%esi; movl  LEN(%esp),%ecx
+	cmpl %ecx,%edi; movl DEST(%esp),%edi; jb L(badlap)  # dst in src[0 for len)
+	cmpl %ecx,%esi; movl  SRC(%esp),%esi; jb L(badlap)  # src in dst[0 for len]
+	jmp L(1)
+
+L(badlap):
+	movl %eax,%edi  # restore registers
+	movl %edx,%esi
+
+	movl DEST(%esp),%eax  # arg1 = result
+#ifdef USE_AS_MEMPCPY  /*{*/
+	addl %ecx,%eax
+#endif  /*}*/
+#if defined(SHARED)  /*{*/
+	call	__i686.get_pc_thunk.dx
+	addl	$_GLOBAL_OFFSET_TABLE_, %edx
+	lea  FEATURE_OFFSET+index_memcpy_limit_overlap+__cpu_features@GOTOFF(%edx),%edx  # arg2 = &limit
+#else  /*}{*/
+	lea  FEATURE_OFFSET+index_memcpy_limit_overlap+__cpu_features,             %edx  # arg2 = &limit
+#endif  /*}*/
+	jmp HIDDEN_JUMPTARGET(__memcpy_overlap_i386)
+END_LAP (MEMCPY_LAP)
+#endif  /*}*/
+
 	.text
 #if defined PIC && !defined NOT_IN_libc
-ENTRY_CHK (__memcpy_chk)
+ENTRY_CHK (__MEMCPY_CHK)
 	movl	12(%esp), %eax
 	cmpl	%eax, 16(%esp)
 	jb	HIDDEN_JUMPTARGET (__chk_fail)
-END_CHK (__memcpy_chk)
+END_CHK (__MEMCPY_CHK)
 #endif
-ENTRY (BP_SYM (memcpy))
+ENTRY (BP_SYM (MEMCPY))
 	ENTER
 
 	movl	%edi, %eax
 	movl	DEST(%esp), %edi
 	movl	%esi, %edx
 	movl	SRC(%esp), %esi
-
+L(1):
 	movl	%edi, %ecx
 	xorl	%esi, %ecx
 	andl	$3, %ecx
@@ -100,5 +153,5 @@ ENTRY (BP_SYM (memcpy))
 2:	rep
 	movsl
 	jmp	.Lend
-END (BP_SYM (memcpy))
-libc_hidden_builtin_def (memcpy)
+END (BP_SYM (MEMCPY))
+libc_hidden_builtin_def (MEMCPY)
diff --git a/sysdeps/i386/i686/mempcpy.S b/sysdeps/i386/i686/mempcpy.S
index c10686f..10bbc10 100644
--- a/sysdeps/i386/i686/mempcpy.S
+++ b/sysdeps/i386/i686/mempcpy.S
@@ -30,15 +30,67 @@
 #define SRC	DEST+PTR_SIZE
 #define LEN	SRC+PTR_SIZE
 
+#if defined(SHARED)  /*{*/
+	.section	.gnu.linkonce.t.__i686.get_pc_thunk.dx,"ax",@progbits
+	.globl	__i686.get_pc_thunk.dx
+	.hidden	__i686.get_pc_thunk.dx
+	ALIGN (4)
+	.type	__i686.get_pc_thunk.dx,@function
+__i686.get_pc_thunk.dx:
+	movl	(%esp), %edx
+	ret
+#endif  /*}*/
+
+#include <init-arch.h>
+
+#ifndef   __MEMPCPY  /*{*/
+#  define   MEMPCPY       mempcpy
+#  define __MEMPCPY     __mempcpy
+#  define __MEMPCPY_CHK __mempcpy_chk
+#  define __MEMPCPY_LAP __mempcpy_lap
+#endif  /*}*/
+
+#ifndef ENTRY_LAP  /*{*/
+#  define ENTRY_LAP(sym) ENTRY(sym)
+#  define   END_LAP(sym)   END(sym)
+#endif  /*}*/
+
 	.text
+#ifndef NOT_IN_libc  /*{*/
+ENTRY_LAP (__MEMPCPY_LAP)
+	ENTER
+	movl %edi,%eax; movl DEST(%esp),%edi; movl %edi,%ecx
+	movl %esi,%edx; movl  SRC(%esp),%esi; subl %esi,%edi
+	subl %ecx,%esi; movl  LEN(%esp),%ecx
+	cmpl %ecx,%edi; movl DEST(%esp),%edi; jb L(badlap)  # dst in src[0 for len)
+	cmpl %ecx,%esi; movl  SRC(%esp),%esi; jb L(badlap)  # src in dst[0 for len]
+	jmp L(1)
+
+L(badlap):
+	movl %eax,%edi  # restore registers
+	movl %edx,%esi
+
+	movl DEST(%esp),%eax
+	addl %ecx,%eax  # arg1 = result
+#if defined(SHARED)  /*{*/
+	call	__i686.get_pc_thunk.dx
+	addl	$_GLOBAL_OFFSET_TABLE_, %edx
+	lea  FEATURE_OFFSET+index_memcpy_limit_overlap+__cpu_features@GOTOFF(%edx),%edx  # arg2 = &limit
+#else  /*}{*/
+	lea  FEATURE_OFFSET+index_memcpy_limit_overlap+__cpu_features,             %edx  # arg2 = &limit
+#endif  /*}*/
+	jmp HIDDEN_JUMPTARGET(__memcpy_overlap_i386)
+END_LAP (__MEMPCPY_LAP)
+#endif  /*}*/
+
 #if defined PIC && !defined NOT_IN_libc
-ENTRY_CHK (__mempcpy_chk)
+ENTRY_CHK (__MEMPCPY_CHK)
 	movl	12(%esp), %eax
 	cmpl	%eax, 16(%esp)
 	jb	HIDDEN_JUMPTARGET (__chk_fail)
-END_CHK (__mempcpy_chk)
+END_CHK (__MEMPCPY_CHK)
 #endif
-ENTRY (BP_SYM (__mempcpy))
+ENTRY (BP_SYM (__MEMPCPY))
 	ENTER
 
 	movl	LEN(%esp), %ecx
@@ -50,6 +102,7 @@ ENTRY (BP_SYM (__mempcpy))
 	cfi_register (esi, edx)
 	movl	SRC(%esp), %esi
 	CHECK_BOUNDS_BOTH_WIDE (%esi, SRC(%esp), %ecx)
+L(1):
 	cld
 	shrl	$1, %ecx
 	jnc	1f
@@ -67,7 +120,7 @@ ENTRY (BP_SYM (__mempcpy))
 
 	LEAVE
 	RET_PTR
-END (BP_SYM (__mempcpy))
-libc_hidden_def (BP_SYM (__mempcpy))
-weak_alias (BP_SYM (__mempcpy), BP_SYM (mempcpy))
+END (BP_SYM (__MEMPCPY))
+libc_hidden_def (BP_SYM (__MEMPCPY))
+weak_alias (BP_SYM (__MEMPCPY), BP_SYM (mempcpy))
 libc_hidden_builtin_def (mempcpy)
diff --git a/sysdeps/i386/i686/multiarch/bcopy.S b/sysdeps/i386/i686/multiarch/bcopy.S
index 8671bf6..e41843d 100644
--- a/sysdeps/i386/i686/multiarch/bcopy.S
+++ b/sysdeps/i386/i686/multiarch/bcopy.S
@@ -43,6 +43,7 @@ ENTRY(bcopy)
 	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
 	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
 	jne	1f
+	movl 4*2(%esp),%eax  # getenv_rtld; skip saved %ebx and retaddr
 	call	__init_cpu_features
 1:	leal	__bcopy_ia32@GOTOFF(%ebx), %eax
 	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
@@ -62,6 +63,7 @@ ENTRY(bcopy)
 	.type	bcopy, @gnu_indirect_function
 	cmpl	$0, KIND_OFFSET+__cpu_features
 	jne	1f
+	movl 4*1(%esp),%eax  # getenv_rtld; skip retaddr
 	call	__init_cpu_features
 1:	leal	__bcopy_ia32, %eax
 	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features
diff --git a/sysdeps/i386/i686/multiarch/bzero.S b/sysdeps/i386/i686/multiarch/bzero.S
index 8c740a4..c1944e4 100644
--- a/sysdeps/i386/i686/multiarch/bzero.S
+++ b/sysdeps/i386/i686/multiarch/bzero.S
@@ -43,6 +43,7 @@ ENTRY(__bzero)
 	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
 	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
 	jne	1f
+	movl 4*2(%esp),%eax  # getenv_rtld; skip saved %ebx and retaddr
 	call	__init_cpu_features
 1:	leal	__bzero_ia32@GOTOFF(%ebx), %eax
 	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
@@ -62,6 +63,7 @@ ENTRY(__bzero)
 	.type	__bzero, @gnu_indirect_function
 	cmpl	$0, KIND_OFFSET+__cpu_features
 	jne	1f
+	movl 4*1(%esp),%eax  # getenv_rtld; skip retaddr
 	call	__init_cpu_features
 1:	leal	__bzero_ia32, %eax
 	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features
diff --git a/sysdeps/i386/i686/multiarch/ifunc-defines.sym b/sysdeps/i386/i686/multiarch/ifunc-defines.sym
index eb1538a..d164208 100644
--- a/sysdeps/i386/i686/multiarch/ifunc-defines.sym
+++ b/sysdeps/i386/i686/multiarch/ifunc-defines.sym
@@ -18,3 +18,4 @@ FEATURE_SIZE		sizeof (unsigned int)
 
 COMMON_CPUID_INDEX_1
 FEATURE_INDEX_1
+FEATURE_MEMCPY_LIMIT_OVERLAP
diff --git a/sysdeps/i386/i686/multiarch/memcmp.S b/sysdeps/i386/i686/multiarch/memcmp.S
index cf606a5..119f894 100644
--- a/sysdeps/i386/i686/multiarch/memcmp.S
+++ b/sysdeps/i386/i686/multiarch/memcmp.S
@@ -34,6 +34,7 @@ ENTRY(memcmp)
 	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
 	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
 	jne	1f
+	movl 4*2(%esp),%eax  # getenv_rtld; skip saved %ebx and retaddr
 	call	__init_cpu_features
 1:	leal	__memcmp_ia32@GOTOFF(%ebx), %eax
 	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
@@ -53,6 +54,7 @@ ENTRY(memcmp)
 	.type	memcmp, @gnu_indirect_function
 	cmpl	$0, KIND_OFFSET+__cpu_features
 	jne	1f
+	movl 4*1(%esp),%eax  # getenv_rtld; skip retaddr
 	call	__init_cpu_features
 1:	leal	__memcmp_ia32, %eax
 	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features
diff --git a/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S b/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S
index 48a109c..6645540 100644
--- a/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S
+++ b/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S
@@ -19,6 +19,7 @@
    02111-1307 USA.  */
 
 #include <sysdep.h>
+#include <init-arch.h>
 
 #if !defined NOT_IN_libc \
     && (defined SHARED \
@@ -30,6 +31,7 @@
 #ifndef MEMCPY
 # define MEMCPY		__memcpy_ssse3_rep
 # define MEMCPY_CHK	__memcpy_chk_ssse3_rep
+# define MEMCPY_LAP	__memcpy_lap_ssse3_rep
 #endif
 
 #ifdef USE_AS_BCOPY
@@ -110,6 +112,37 @@ __i686.get_pc_thunk.bx:
 #endif
 
 	.section .text.ssse3,"ax",@progbits
+#ifndef USE_AS_MEMMOVE  /*{*/
+#ifndef ENTRY_LAP
+# define ENTRY_LAP(sym) ENTRY(sym)
+# define END_LAP(sym)   END(sym)
+#endif
+
+ENTRY_LAP (MEMCPY_LAP)
+	ENTRANCE
+	movl DEST(%esp),%edx; movl %edx,%ecx
+	movl  SRC(%esp),%eax; subl %eax,%edx
+	subl %ecx,%eax; movl  LEN(%esp),%ecx
+	cmpl %ecx,%edx; movl DEST(%esp),%edx; jb L(badlap)  # dst in src[0 for len)
+	cmpl %ecx,%eax; movl  SRC(%esp),%eax; jb L(badlap)  # src in dst[0 for len)
+	jmp L(1)
+L(badlap):
+	movl DEST(%esp),%eax  # arg1 = result
+#ifdef USE_AS_MEMPCPY  /*{*/
+	addl %ecx,%eax
+#endif  /*}*/
+#if defined(SHARED)  /*{*/
+	call	__i686.get_pc_thunk.bx
+	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
+	lea  __cpu_features+FEATURE_OFFSET+index_memcpy_limit_overlap@GOTOFF(%ebx),%edx  # arg2 = &limit
+	POP(%ebx)
+#else  /*}{*/
+	lea  __cpu_features+FEATURE_OFFSET+index_memcpy_limit_overlap,             %edx  # arg2 = &limit
+#endif  /*}*/
+	jmp HIDDEN_JUMPTARGET(__memcpy_overlap_i386)
+END_LAP (MEMCPY_LAP)
+#endif  /*}*/
+
 #if defined SHARED && !defined NOT_IN_libc && !defined USE_AS_BCOPY
 ENTRY (MEMCPY_CHK)
 	movl	12(%esp), %eax
@@ -122,7 +155,7 @@ ENTRY (MEMCPY)
 	movl	LEN(%esp), %ecx
 	movl	SRC(%esp), %eax
 	movl	DEST(%esp), %edx
-
+L(1):
 #ifdef USE_AS_MEMMOVE
 	cmp	%eax, %edx
 	jb	L(copy_forward)
diff --git a/sysdeps/i386/i686/multiarch/memcpy-ssse3.S b/sysdeps/i386/i686/multiarch/memcpy-ssse3.S
index ec9eeb9..3b178cd 100644
--- a/sysdeps/i386/i686/multiarch/memcpy-ssse3.S
+++ b/sysdeps/i386/i686/multiarch/memcpy-ssse3.S
@@ -19,6 +19,12 @@
    02111-1307 USA.  */
 
 #include <sysdep.h>
+#include <init-arch.h>
+
+#ifndef ENTRY_LAP
+# define ENTRY_LAP(sym) ENTRY(sym)
+# define END_LAP(sym)   END(sym)
+#endif
 
 #if !defined NOT_IN_libc \
     && (defined SHARED \
@@ -30,6 +36,7 @@
 #ifndef MEMCPY
 # define MEMCPY		__memcpy_ssse3
 # define MEMCPY_CHK	__memcpy_chk_ssse3
+# define MEMCPY_LAP	__memcpy_lap_ssse3
 #endif
 
 #ifdef USE_AS_BCOPY
@@ -110,6 +117,32 @@ __i686.get_pc_thunk.bx:
 #endif
 
 	.section .text.ssse3,"ax",@progbits
+#ifndef USE_AS_MEMMOVE  /*{*/
+ENTRY_LAP (MEMCPY_LAP)
+	ENTRANCE
+	movl DEST(%esp),%edx; movl %edx,%ecx
+	movl  SRC(%esp),%eax; subl %eax,%edx
+	subl %ecx,%eax; movl  LEN(%esp),%ecx
+	cmpl %ecx,%edx; movl DEST(%esp),%edx; jb L(badlap)  # dst in src[0 for len)
+	cmpl %ecx,%eax; movl  SRC(%esp),%eax; jb L(badlap)  # src in dst[0 for len)
+	jmp L(1)
+L(badlap):
+	movl DEST(%esp),%eax  # arg1 = result
+#ifdef USE_AS_MEMPCPY  /*{*/
+	addl %ecx,%eax
+#endif  /*}*/
+#if defined(SHARED)  /*{*/
+	call	__i686.get_pc_thunk.bx
+	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
+	lea  __cpu_features+FEATURE_OFFSET+index_memcpy_limit_overlap@GOTOFF(%ebx),%edx  # arg2 = &limit
+	POP(%ebx)
+#else  /*}{*/
+	lea  __cpu_features+FEATURE_OFFSET+index_memcpy_limit_overlap,             %edx  # arg2 = &limit
+#endif  /*}*/
+	jmp HIDDEN_JUMPTARGET(__memcpy_overlap_i386)
+END_LAP (MEMCPY_LAP)
+#endif  /*}*/
+
 #if defined SHARED && !defined NOT_IN_libc && !defined USE_AS_BCOPY
 ENTRY (MEMCPY_CHK)
 	movl	12(%esp), %eax
@@ -122,7 +155,7 @@ ENTRY (MEMCPY)
 	movl	LEN(%esp), %ecx
 	movl	SRC(%esp), %eax
 	movl	DEST(%esp), %edx
-
+L(1):
 #ifdef USE_AS_MEMMOVE
 	cmp	%eax, %edx
 	jb	L(copy_forward)
diff --git a/sysdeps/i386/i686/multiarch/memcpy.S b/sysdeps/i386/i686/multiarch/memcpy.S
index bf1c7cc..6a6c9e2 100644
--- a/sysdeps/i386/i686/multiarch/memcpy.S
+++ b/sysdeps/i386/i686/multiarch/memcpy.S
@@ -34,6 +34,12 @@ __i686.get_pc_thunk.bx:
 	movl	(%esp), %ebx
 	ret
 
+#define MEMCPY_LAP_SUBST(name1, name2) \
+	leal name1@GOTOFF(%ebx), %eax; \
+	testl $bit_memcpy_check_overlap, FEATURE_OFFSET+index_memcpy_check_overlap+__cpu_features@GOTOFF(%ebx); jz 0f; \
+	leal name2@GOTOFF(%ebx), %eax; \
+0:
+
 	.text
 ENTRY(memcpy)
 	.type	memcpy, @gnu_indirect_function
@@ -44,40 +50,24 @@ ENTRY(memcpy)
 	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
 	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
 	jne	1f
+	movl 4*2(%esp),%eax  # getenv_rtld; skip saved %ebx and retaddr
 	call	__init_cpu_features
-1:	leal	__memcpy_ia32@GOTOFF(%ebx), %eax
+1:	MEMCPY_LAP_SUBST(__memcpy_ia32, __memcpy_lap_ia32)
 	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
 	jz	2f
-	leal	__memcpy_ssse3@GOTOFF(%ebx), %eax
+	MEMCPY_LAP_SUBST(__memcpy_ssse3, __memcpy_lap_ssse3)
 	testl	$bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx)
 	jz	2f
-	leal	__memcpy_ssse3_rep@GOTOFF(%ebx), %eax
+	MEMCPY_LAP_SUBST(__memcpy_ssse3_rep, __memcpy_lap_ssse3_rep)
 2:	popl	%ebx
 	cfi_adjust_cfa_offset (-4)
 	cfi_restore (ebx)
 	ret
 END(memcpy)
 
-# undef ENTRY
-# define ENTRY(name) \
-	.type __memcpy_ia32, @function; \
-	.p2align 4; \
-	__memcpy_ia32: cfi_startproc; \
-	CALL_MCOUNT
-# undef END
-# define END(name) \
-	cfi_endproc; .size __memcpy_ia32, .-__memcpy_ia32
-
-# undef ENTRY_CHK
-# define ENTRY_CHK(name) \
-	.type __memcpy_chk_ia32, @function; \
-	.globl __memcpy_chk_ia32; \
-	.p2align 4; \
-	__memcpy_chk_ia32: cfi_startproc; \
-	CALL_MCOUNT
-# undef END_CHK
-# define END_CHK(name) \
-	cfi_endproc; .size __memcpy_chk_ia32, .-__memcpy_chk_ia32
+#define   MEMCPY     __memcpy_ia32
+#define __MEMCPY_CHK __memcpy_chk_ia32
+#define   MEMCPY_LAP __memcpy_lap_ia32
 
 # undef libc_hidden_builtin_def
 /* IFUNC doesn't work with the hidden functions in shared library since
diff --git a/sysdeps/i386/i686/multiarch/memcpy_chk.S b/sysdeps/i386/i686/multiarch/memcpy_chk.S
index 171ac8a..0a534df 100644
--- a/sysdeps/i386/i686/multiarch/memcpy_chk.S
+++ b/sysdeps/i386/i686/multiarch/memcpy_chk.S
@@ -45,6 +45,7 @@ ENTRY(__memcpy_chk)
 	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
 	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
 	jne	1f
+	movl 4*2(%esp),%eax  # getenv_rtld; skip saved %ebx and retaddr
 	call	__init_cpu_features
 1:	leal	__memcpy_chk_ia32@GOTOFF(%ebx), %eax
 	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
diff --git a/sysdeps/i386/i686/multiarch/memmove-ssse3-rep.S b/sysdeps/i386/i686/multiarch/memmove-ssse3-rep.S
index d202fc4..5fe8ac9 100644
--- a/sysdeps/i386/i686/multiarch/memmove-ssse3-rep.S
+++ b/sysdeps/i386/i686/multiarch/memmove-ssse3-rep.S
@@ -1,4 +1,5 @@
 #define USE_AS_MEMMOVE
 #define MEMCPY		__memmove_ssse3_rep
 #define MEMCPY_CHK	__memmove_chk_ssse3_rep
+#define MEMCPY_LAP	__memmove_lap_ssse3_rep
 #include "memcpy-ssse3-rep.S"
diff --git a/sysdeps/i386/i686/multiarch/memmove-ssse3.S b/sysdeps/i386/i686/multiarch/memmove-ssse3.S
index 295430b..c723c1a 100644
--- a/sysdeps/i386/i686/multiarch/memmove-ssse3.S
+++ b/sysdeps/i386/i686/multiarch/memmove-ssse3.S
@@ -1,4 +1,5 @@
 #define USE_AS_MEMMOVE
 #define MEMCPY		__memmove_ssse3
 #define MEMCPY_CHK	__memmove_chk_ssse3
+#define MEMCPY_LAP	__memmove_lap_ssse3
 #include "memcpy-ssse3.S"
diff --git a/sysdeps/i386/i686/multiarch/memmove.S b/sysdeps/i386/i686/multiarch/memmove.S
index e0529c0..6348531 100644
--- a/sysdeps/i386/i686/multiarch/memmove.S
+++ b/sysdeps/i386/i686/multiarch/memmove.S
@@ -43,6 +43,7 @@ ENTRY(memmove)
 	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
 	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
 	jne	1f
+	movl 4*2(%esp),%eax  # getenv_rtld; skip saved %ebx and retaddr
 	call	__init_cpu_features
 1:	leal	__memmove_ia32@GOTOFF(%ebx), %eax
 	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
@@ -69,6 +70,7 @@ ENTRY(memmove)
 	.type	memmove, @gnu_indirect_function
 	cmpl	$0, KIND_OFFSET+__cpu_features
 	jne	1f
+	movl 4*1(%esp),%eax  # getenv_rtld; skip retaddr
 	call	__init_cpu_features
 1:	leal	__memmove_ia32, %eax
 	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features
diff --git a/sysdeps/i386/i686/multiarch/memmove_chk.S b/sysdeps/i386/i686/multiarch/memmove_chk.S
index e33f2a3..1fa3958 100644
--- a/sysdeps/i386/i686/multiarch/memmove_chk.S
+++ b/sysdeps/i386/i686/multiarch/memmove_chk.S
@@ -43,6 +43,7 @@ ENTRY(__memmove_chk)
 	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
 	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
 	jne	1f
+	movl 4*2(%esp),%eax  # getenv_rtld; skip saved %ebx and retaddr
 	call	__init_cpu_features
 1:	leal	__memmove_chk_ia32@GOTOFF(%ebx), %eax
 	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
@@ -62,6 +63,7 @@ ENTRY(__memmove_chk)
 	.type	__memmove_chk, @gnu_indirect_function
 	cmpl	$0, KIND_OFFSET+__cpu_features
 	jne	1f
+	movl 4*1(%esp),%eax  # getenv_rtld; skip retaddr
 	call	__init_cpu_features
 1:	leal	__memmove_chk_ia32, %eax
 	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features
diff --git a/sysdeps/i386/i686/multiarch/mempcpy-ssse3-rep.S b/sysdeps/i386/i686/multiarch/mempcpy-ssse3-rep.S
index 5357b33..5d58483 100644
--- a/sysdeps/i386/i686/multiarch/mempcpy-ssse3-rep.S
+++ b/sysdeps/i386/i686/multiarch/mempcpy-ssse3-rep.S
@@ -1,4 +1,5 @@
 #define USE_AS_MEMPCPY
 #define MEMCPY		__mempcpy_ssse3_rep
 #define MEMCPY_CHK	__mempcpy_chk_ssse3_rep
+#define MEMCPY_LAP	__mempcpy_lap_ssse3_rep
 #include "memcpy-ssse3-rep.S"
diff --git a/sysdeps/i386/i686/multiarch/mempcpy-ssse3.S b/sysdeps/i386/i686/multiarch/mempcpy-ssse3.S
index 822d98e..de70740 100644
--- a/sysdeps/i386/i686/multiarch/mempcpy-ssse3.S
+++ b/sysdeps/i386/i686/multiarch/mempcpy-ssse3.S
@@ -1,4 +1,5 @@
 #define USE_AS_MEMPCPY
 #define MEMCPY		__mempcpy_ssse3
 #define MEMCPY_CHK	__mempcpy_chk_ssse3
+#define MEMCPY_LAP	__mempcpy_lap_ssse3
 #include "memcpy-ssse3.S"
diff --git a/sysdeps/i386/i686/multiarch/mempcpy.S b/sysdeps/i386/i686/multiarch/mempcpy.S
index df830d2..eaf170f 100644
--- a/sysdeps/i386/i686/multiarch/mempcpy.S
+++ b/sysdeps/i386/i686/multiarch/mempcpy.S
@@ -44,6 +44,7 @@ ENTRY(__mempcpy)
 	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
 	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
 	jne	1f
+	movl 4*2(%esp),%eax  # getenv_rtld; skip saved %ebx and retaddr
 	call	__init_cpu_features
 1:	leal	__mempcpy_ia32@GOTOFF(%ebx), %eax
 	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
@@ -60,24 +61,24 @@ END(__mempcpy)
 
 # undef ENTRY
 # define ENTRY(name) \
-	.type __mempcpy_ia32, @function; \
+	.type name, @function; \
 	.p2align 4; \
-	__mempcpy_ia32: cfi_startproc; \
+	name: cfi_startproc; \
 	CALL_MCOUNT
 # undef END
 # define END(name) \
-	cfi_endproc; .size __mempcpy_ia32, .-__mempcpy_ia32
+	cfi_endproc; .size name, .-name
 
 # undef ENTRY_CHK
 # define ENTRY_CHK(name) \
-	.type __mempcpy_chk_ia32, @function; \
-	.globl __mempcpy_chk_ia32; \
+	.type name, @function; \
+	.globl name; \
 	.p2align 4; \
-	__mempcpy_chk_ia32: cfi_startproc; \
+	name: cfi_startproc; \
 	CALL_MCOUNT
 # undef END_CHK
 # define END_CHK(name) \
-	cfi_endproc; .size __mempcpy_chk_ia32, .-__mempcpy_chk_ia32
+	cfi_endproc; .size name, .-name
 
 # undef libc_hidden_def
 # undef libc_hidden_builtin_def
@@ -85,9 +86,14 @@ END(__mempcpy)
    they will be called without setting up EBX needed for PLT which is
    used by IFUNC.  */
 # define libc_hidden_def(name) \
-	.globl __GI_mempcpy; __GI_mempcpy = __mempcpy_ia32
+	.globl __GI_mempcpy; __GI_mempcpy = name
 # define libc_hidden_builtin_def(name) \
-	.globl __GI___mempcpy; __GI___mempcpy = __mempcpy_ia32
+	.globl __GI___mempcpy; __GI___mempcpy = name
 #endif
 
+#define   MEMPCPY        mempcpy_ia32
+#define __MEMPCPY      __mempcpy_ia32
+#define __MEMPCPY_LAP  __mempcpy_lap_ia32
+#define __MEMPCPY_CHK  __mempcpy_chk_ia32
+
 #include "../mempcpy.S"
diff --git a/sysdeps/i386/i686/multiarch/mempcpy_chk.S b/sysdeps/i386/i686/multiarch/mempcpy_chk.S
index 828fb5e..8a5f7d4 100644
--- a/sysdeps/i386/i686/multiarch/mempcpy_chk.S
+++ b/sysdeps/i386/i686/multiarch/mempcpy_chk.S
@@ -45,6 +45,7 @@ ENTRY(__mempcpy_chk)
 	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
 	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
 	jne	1f
+	movl 4*2(%esp),%eax  # getenv_rtld; skip saved %ebx and retaddr
 	call	__init_cpu_features
 1:	leal	__mempcpy_chk_ia32@GOTOFF(%ebx), %eax
 	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
diff --git a/sysdeps/i386/i686/multiarch/memset.S b/sysdeps/i386/i686/multiarch/memset.S
index 34dddce..24ed49b 100644
--- a/sysdeps/i386/i686/multiarch/memset.S
+++ b/sysdeps/i386/i686/multiarch/memset.S
@@ -43,6 +43,7 @@ ENTRY(memset)
 	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
 	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
 	jne	1f
+	movl 4*2(%esp),%eax  # getenv_rtld; skip saved %ebx and retaddr
 	call	__init_cpu_features
 1:	leal	__memset_ia32@GOTOFF(%ebx), %eax
 	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
@@ -62,6 +63,7 @@ ENTRY(memset)
 	.type	memset, @gnu_indirect_function
 	cmpl	$0, KIND_OFFSET+__cpu_features
 	jne	1f
+	movl 4*1(%esp),%eax  # getenv_rtld; skip retaddr
 	call	__init_cpu_features
 1:	leal	__memset_ia32, %eax
 	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features
diff --git a/sysdeps/i386/i686/multiarch/memset_chk.S b/sysdeps/i386/i686/multiarch/memset_chk.S
index d659c7e..ebc6448 100644
--- a/sysdeps/i386/i686/multiarch/memset_chk.S
+++ b/sysdeps/i386/i686/multiarch/memset_chk.S
@@ -43,6 +43,7 @@ ENTRY(__memset_chk)
 	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
 	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
 	jne	1f
+	movl 4*2(%esp),%eax  # getenv_rtld; skip saved %ebx and retaddr
 	call	__init_cpu_features
 1:	leal	__memset_chk_ia32@GOTOFF(%ebx), %eax
 	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
@@ -66,6 +67,7 @@ ENTRY(__memset_chk)
 	.type	__memset_chk, @gnu_indirect_function
 	cmpl	$0, KIND_OFFSET+__cpu_features
 	jne	1f
+	movl 4*1(%esp),%eax  # getenv_rtld; skip retaddr
 	call	__init_cpu_features
 1:	leal	__memset_chk_ia32, %eax
 	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features
diff --git a/sysdeps/i386/i686/multiarch/strcmp.S b/sysdeps/i386/i686/multiarch/strcmp.S
index 7136d47..19971cf 100644
--- a/sysdeps/i386/i686/multiarch/strcmp.S
+++ b/sysdeps/i386/i686/multiarch/strcmp.S
@@ -59,6 +59,7 @@ ENTRY(STRCMP)
 	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
 	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
 	jne	1f
+	movl 4*2(%esp),%eax  # getenv_rtld; skip saved %ebx and retaddr
 	call	__init_cpu_features
 1:	leal	__STRCMP_IA32@GOTOFF(%ebx), %eax
 	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
@@ -78,6 +79,7 @@ ENTRY(STRCMP)
 	.type	STRCMP, @gnu_indirect_function
 	cmpl	$0, KIND_OFFSET+__cpu_features
 	jne	1f
+	movl 4*1(%esp),%eax  # getenv_rtld; skip retaddr
 	call	__init_cpu_features
 1:	leal	__STRCMP_IA32, %eax
 	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features
diff --git a/sysdeps/i386/i686/multiarch/strcspn.S b/sysdeps/i386/i686/multiarch/strcspn.S
index b2310e4..a56b98a 100644
--- a/sysdeps/i386/i686/multiarch/strcspn.S
+++ b/sysdeps/i386/i686/multiarch/strcspn.S
@@ -62,6 +62,7 @@ ENTRY(STRCSPN)
 	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
 	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
 	jne	1f
+	movl 4*2(%esp),%eax  # getenv_rtld; skip saved %ebx and retaddr
 	call	__init_cpu_features
 1:	leal	STRCSPN_IA32@GOTOFF(%ebx), %eax
 	testl	$bit_SSE4_2, CPUID_OFFSET+index_SSE4_2+__cpu_features@GOTOFF(%ebx)
@@ -78,6 +79,7 @@ ENTRY(STRCSPN)
 	.type	STRCSPN, @gnu_indirect_function
 	cmpl	$0, KIND_OFFSET+__cpu_features
 	jne	1f
+	movl 4*1(%esp),%eax  # getenv_rtld; skip retaddr
 	call	__init_cpu_features
 1:	leal	STRCSPN_IA32, %eax
 	testl	$bit_SSE4_2, CPUID_OFFSET+index_SSE4_2+__cpu_features
diff --git a/sysdeps/i386/i686/multiarch/strlen.S b/sysdeps/i386/i686/multiarch/strlen.S
index 9d465c8..8046c8e 100644
--- a/sysdeps/i386/i686/multiarch/strlen.S
+++ b/sysdeps/i386/i686/multiarch/strlen.S
@@ -44,6 +44,7 @@ ENTRY(strlen)
 	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
 	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
 	jne	1f
+	movl 4*2(%esp),%eax  # getenv_rtld; skip saved %ebx and retaddr
 	call	__init_cpu_features
 1:	leal	__strlen_ia32@GOTOFF(%ebx), %eax
 	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
diff --git a/sysdeps/i386/i686/multiarch/strspn.S b/sysdeps/i386/i686/multiarch/strspn.S
index cd26c80..d60aaef 100644
--- a/sysdeps/i386/i686/multiarch/strspn.S
+++ b/sysdeps/i386/i686/multiarch/strspn.S
@@ -47,6 +47,7 @@ ENTRY(strspn)
 	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
 	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
 	jne	1f
+	movl 4*2(%esp),%eax  # getenv_rtld; skip saved %ebx and retaddr
 	call	__init_cpu_features
 1:	leal	__strspn_ia32@GOTOFF(%ebx), %eax
 	testl	$bit_SSE4_2, CPUID_OFFSET+index_SSE4_2+__cpu_features@GOTOFF(%ebx)
@@ -63,6 +64,7 @@ ENTRY(strspn)
 	.type	strspn, @gnu_indirect_function
 	cmpl	$0, KIND_OFFSET+__cpu_features
 	jne	1f
+	movl 4*1(%esp),%eax  # getenv_rtld; skip retaddr
 	call	__init_cpu_features
 1:	leal	__strspn_ia32, %eax
 	testl	$bit_SSE4_2, CPUID_OFFSET+index_SSE4_2+__cpu_features
diff --git a/sysdeps/i386/strncmp.S b/sysdeps/i386/strncmp.S
index e69de29..e22e65d 100644
--- a/sysdeps/i386/strncmp.S
+++ b/sysdeps/i386/strncmp.S
@@ -0,0 +1,39 @@
+/* Simple version that does not use SSE registers; for use in ld.so.
+   Copyright (C) 2010
+   Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+ENTRY (strncmp)
+/* Simple byte loop (no SSE registers) so ld.so can use it safely.  */
+	xorl %eax,%eax;  cmpl $0,4*3(%esp);  je L(ret)  # n==0: equal per ISO C
+	movl 4*1(%esp),%edx  # 1st arg
+	movl 4*2(%esp),%ecx  # 2nd arg
+L(top):
+	movb (%edx),%al;       addl $1,%edx
+	cmpb (%ecx),%al;  leal 1(%ecx),%ecx; jne L(done)
+	subl $1,4*3(%esp); jz L(done)
+	testb %al,%al;    jne L(top)
+L(done):
+	movzbl -1(%ecx),%edx; movzbl %al,%eax  # ISO C: compare as unsigned char
+	subl %edx,%eax  # widened first, so no 8-bit wraparound
+L(ret):	ret
+END (strncmp)
+libc_hidden_builtin_def(strncmp)
diff --git a/sysdeps/x86_64/cacheinfo.c b/sysdeps/x86_64/cacheinfo.c
index eae54e7..4cf413e 100644
--- a/sysdeps/x86_64/cacheinfo.c
+++ b/sysdeps/x86_64/cacheinfo.c
@@ -431,7 +431,7 @@ __cache_sysconf (int name)
 {
 #ifdef USE_MULTIARCH
   if (__cpu_features.kind == arch_kind_unknown)
-    __init_cpu_features ();
+    __init_cpu_features (getenv);
 #else
   /* Find out what brand of processor.  */
   unsigned int max_cpuid;
@@ -506,7 +506,7 @@ init_cacheinfo (void)
 
 #ifdef USE_MULTIARCH
   if (__cpu_features.kind == arch_kind_unknown)
-    __init_cpu_features ();
+    __init_cpu_features (getenv);
 #else
   int max_cpuid;
   __cpuid (0, max_cpuid, ebx, ecx, edx);
diff --git a/sysdeps/x86_64/dl-irel.h b/sysdeps/x86_64/dl-irel.h
index d2d5c06..c70552d 100644
--- a/sysdeps/x86_64/dl-irel.h
+++ b/sysdeps/x86_64/dl-irel.h
@@ -22,10 +22,13 @@
 #define _DL_IREL_H
 
 #include <stdio.h>
+#include <stdlib.h>
 #include <unistd.h>
 
 #define ELF_MACHINE_IRELA	1
 
+typedef char *(*getenv_t)(char const *);
+
 static inline void
 __attribute ((always_inline))
 elf_irela (const Elf64_Rela *reloc)
@@ -35,7 +38,7 @@ elf_irela (const Elf64_Rela *reloc)
 
   if (__builtin_expect (r_type == R_X86_64_IRELATIVE, 1))
     {
-      Elf64_Addr value = ((Elf64_Addr (*) (void)) reloc->r_addend) ();
+      Elf64_Addr value = ((Elf64_Addr (*) (getenv_t)) reloc->r_addend) (getenv);
       *reloc_addr = value;
     }
   else
diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h
index f615e95..2c01e60 100644
--- a/sysdeps/x86_64/dl-machine.h
+++ b/sysdeps/x86_64/dl-machine.h
@@ -23,6 +23,7 @@
 
 #define ELF_MACHINE_NAME "x86_64"
 
+#include <stdlib.h>
 #include <sys/param.h>
 #include <sysdep.h>
 #include <tls.h>
@@ -254,6 +255,8 @@ elf_machine_plt_value (struct link_map *map, const Elf64_Rela *reloc,
 
 #ifdef RESOLVE_MAP
 
+typedef char *(*getenv_t)(char const *);
+
 /* Perform the relocation specified by RELOC and SYM (which is fully resolved).
    MAP is the object containing the reloc.  */
 
@@ -445,7 +448,7 @@ elf_machine_rela (struct link_map *map, const Elf64_Rela *reloc,
 #  endif
 	case R_X86_64_IRELATIVE:
 	  value = map->l_addr + reloc->r_addend;
-	  value = ((Elf64_Addr (*) (void)) value) ();
+	  value = ((Elf64_Addr (*) (getenv_t)) value) (getenv);
 	  *reloc_addr = value;
 	  break;
 	default:
@@ -497,7 +500,7 @@ elf_machine_lazy_rel (struct link_map *map,
   else if (__builtin_expect (r_type == R_X86_64_IRELATIVE, 0))
     {
       Elf64_Addr value = map->l_addr + reloc->r_addend;
-      value = ((Elf64_Addr (*) (void)) value) ();
+      value = ((Elf64_Addr (*) (getenv_t)) value) (getenv);
       *reloc_addr = value;
     }
   else
diff --git a/sysdeps/x86_64/memcpy.S b/sysdeps/x86_64/memcpy.S
index b4545ac..a5f1094 100644
--- a/sysdeps/x86_64/memcpy.S
+++ b/sysdeps/x86_64/memcpy.S
@@ -23,8 +23,16 @@
 */
 
 #include <sysdep.h>
+#include <init-arch.h>
 #include "asm-syntax.h"
 
+#ifndef NOT_IN_libc  /*{ IS_IN_libc */
+#  ifndef ENTRY_LAP
+#    define ENTRY_LAP(sym) ENTRY(sym)
+#    define END_LAP(sym)   END(sym)
+#  endif
+#endif  /*}*/
+
 /* Stack slots in the red-zone. */
 
 #ifdef USE_AS_MEMPCPY
@@ -39,6 +47,25 @@
 
         .text
 
+#ifndef NOT_IN_libc  /*{ IS_IN_libc */
+ENTRY_LAP(memcpy_lap)  # overlap-checking front end for the plain copy below
+	movq %rdi, %rax; movq %rsi, %rcx
+	subq %rsi, %rax; subq %rdi, %rcx  # rax = dst-src, rcx = src-dst (mod 2^64)
+	cmpq %rdx, %rax; jb L(badlap)  # dst in &src[0 for len)
+	                 cmpq %rdx, %rcx; jb L(badlap)  # src in &dst[0 for len)
+	jmp L(1try)  # regions disjoint: continue with the ordinary copy path
+L(badlap):
+#ifdef USE_AS_MEMPCPY  /*{*/
+	leaq (%rdi,%rdx),%rcx  # arg4 = result
+#else  /*}{*/
+	movq %rdi,%rcx  # arg4 = result
+#endif  /*}*/
+	leaq  __cpu_features+FEATURE_OFFSET+index_memcpy_limit_overlap(%rip),%r8  # arg5 = &limit
+	jmp HIDDEN_JUMPTARGET(__memcpy_overlap_x86_64)  # reports overlap, then copies via memmove
+
+END_LAP(memcpy_lap)
+#endif  /*}*/
+
 #if defined PIC && !defined NOT_IN_libc
 ENTRY_CHK (__memcpy_chk)
 
diff --git a/sysdeps/x86_64/mempcpy.S b/sysdeps/x86_64/mempcpy.S
index 5cb256e..65a691a 100644
--- a/sysdeps/x86_64/mempcpy.S
+++ b/sysdeps/x86_64/mempcpy.S
@@ -1,6 +1,7 @@
 #define USE_AS_MEMPCPY
 #define memcpy __mempcpy
 #define __memcpy_chk __mempcpy_chk
+#define memcpy_lap __mempcpy_lap
 #include <sysdeps/x86_64/memcpy.S>
 
 libc_hidden_def (BP_SYM (__mempcpy))
diff --git a/sysdeps/x86_64/multiarch/ifunc-defines.sym b/sysdeps/x86_64/multiarch/ifunc-defines.sym
index eb1538a..d164208 100644
--- a/sysdeps/x86_64/multiarch/ifunc-defines.sym
+++ b/sysdeps/x86_64/multiarch/ifunc-defines.sym
@@ -18,3 +18,4 @@ FEATURE_SIZE		sizeof (unsigned int)
 
 COMMON_CPUID_INDEX_1
 FEATURE_INDEX_1
+FEATURE_MEMCPY_LIMIT_OVERLAP
diff --git a/sysdeps/x86_64/multiarch/init-arch.c b/sysdeps/x86_64/multiarch/init-arch.c
index f0d2bb7..9b99ee3 100644
--- a/sysdeps/x86_64/multiarch/init-arch.c
+++ b/sysdeps/x86_64/multiarch/init-arch.c
@@ -39,9 +39,41 @@ get_common_indeces (unsigned int *family, unsigned int *model)
   *model = (eax >> 4) & 0x0f;
 }
 
+#ifndef NOT_IN_libc  /*{ IS_IN_libc */
+/* Minimal strtol substitute: parse optional '-', optional 0x/0X prefix,
+   then digits; stops at the first invalid character like atoi.  Needed
+   because ld.so cannot use __strtol_internal (missing locale TSD).  */
+static int local_atoi(char const *p)
+{
+	unsigned minus = 0;
+	if ('-'==p[0]) {
+		minus = 1;
+		++p;
+	}
+	unsigned base = 10;
+	if ('0'==p[0] && 'x'==(('x' ^ 'X') | p[1])) {  /* accepts 0x or 0X */
+		base = 0x10;
+		p += 2;
+	}
+	unsigned val = 0;
+	int c;
+	while ((c = *p++)) {
+		c |= (0x40 & c) >> 1;  /* cheap tolower for 'A'-'F' */
+		unsigned d;
+		if (c >= '0' && c <= '9') d = c - '0';
+		else if (c >= 'a' && c <= 'f') d = 10 + (c - 'a');
+		else break;  /* stop at first non-digit, like atoi */
+		if (d >= base) break;  /* 'a'-'f' valid only in hex */
+		val = val * base + d;
+	}
+	return (int)(minus ? -val : val);
+}
+#endif  /*}*/
 
 void
-__init_cpu_features (void)
+#if __i386__  /*{*/
+  __attribute__ ((regparm(1)))
+#endif  /*}*/
+__init_cpu_features (char *(*getenv_rtld)(char const *))
 {
   unsigned int ebx;
   unsigned int ecx;
@@ -115,6 +147,18 @@ __init_cpu_features (void)
   __cpu_features.model = model;
   atomic_write_barrier ();
   __cpu_features.kind = kind;
+
+#ifndef NOT_IN_libc  /*{ IS_IN_libc */
+  char const *const mco = getenv_rtld("MEMCPY_CHECK_");
+  if (mco && mco[0]) {
+    /* ld.so: in __strtol_internal: undefined reference to `__libc_tsd_LOCALE_data' */
+    int const limit = local_atoi(mco);
+    if (limit) {
+      __cpu_features.feature[FEATURE_MEMCPY_LIMIT_OVERLAP] = limit;
+      __cpu_features.feature[index_memcpy_check_overlap] |= bit_memcpy_check_overlap;
+    }
+  }
+#endif  /*}*/
 }
 
 #undef __get_cpu_features
@@ -123,7 +167,7 @@ const struct cpu_features *
 __get_cpu_features (void)
 {
   if (__cpu_features.kind == arch_kind_unknown)
-    __init_cpu_features ();
+    __init_cpu_features (getenv);
 
   return &__cpu_features;
 }
diff --git a/sysdeps/x86_64/multiarch/init-arch.h b/sysdeps/x86_64/multiarch/init-arch.h
index 6e409b8..77471eb 100644
--- a/sysdeps/x86_64/multiarch/init-arch.h
+++ b/sysdeps/x86_64/multiarch/init-arch.h
@@ -20,6 +20,7 @@
 #define bit_Fast_Copy_Backward		(1 << 1)
 #define bit_Slow_BSF			(1 << 2)
 #define bit_Prefer_SSE_for_memop	(1 << 3)
+#define bit_memcpy_check_overlap	(1 << 4)
 
 #ifdef	__ASSEMBLER__
 
@@ -39,6 +40,9 @@
 # define index_Fast_Copy_Backward	FEATURE_INDEX_1*FEATURE_SIZE
 # define index_Slow_BSF			FEATURE_INDEX_1*FEATURE_SIZE
 # define index_Prefer_SSE_for_memop	FEATURE_INDEX_1*FEATURE_SIZE
+# define index_memcpy_check_overlap	FEATURE_INDEX_1*FEATURE_SIZE
+
+# define index_memcpy_limit_overlap	FEATURE_MEMCPY_LIMIT_OVERLAP*FEATURE_SIZE
 
 #else	/* __ASSEMBLER__ */
 
@@ -54,6 +58,7 @@ enum
 enum
   {
     FEATURE_INDEX_1 = 0,
+    FEATURE_MEMCPY_LIMIT_OVERLAP = 1,
     /* Keep the following line at the end.  */
     FEATURE_INDEX_MAX
   };
@@ -81,11 +86,19 @@ extern struct cpu_features
 } __cpu_features attribute_hidden;
 
 
-extern void __init_cpu_features (void) attribute_hidden;
+# include <stdlib.h>
+typedef char *(*getenv_t)(char const *);
+
+#if __i386__  /*{*/
+extern void __attribute__ ((regparm(1))) __init_cpu_features (getenv_t) attribute_hidden;
+#else  /*}{*/
+extern void                              __init_cpu_features (getenv_t) attribute_hidden;
+#endif  /*}*/
+
 #define INIT_ARCH()\
   do							\
     if (__cpu_features.kind == arch_kind_unknown)	\
-      __init_cpu_features ();				\
+      __init_cpu_features (getenv);				\
   while (0)
 
 /* Used from outside libc.so to get access to the CPU features structure.  */
@@ -112,6 +125,7 @@ extern const struct cpu_features *__get_cpu_features (void)
 # define index_Fast_Copy_Backward	FEATURE_INDEX_1
 # define index_Slow_BSF			FEATURE_INDEX_1
 # define index_Prefer_SSE_for_memop	FEATURE_INDEX_1
+# define index_memcpy_check_overlap	FEATURE_INDEX_1
 
 #define HAS_ARCH_FEATURE(idx, bit) \
   ((__get_cpu_features ()->feature[idx] & (bit)) != 0)
@@ -128,4 +142,7 @@ extern const struct cpu_features *__get_cpu_features (void)
 #define HAS_PREFER_SSE_FOR_MEMOP \
   HAS_ARCH_FEATURE (index_Prefer_SSE_for_memop, bit_Prefer_SSE_for_memop)
 
+#define HAS_MEMCPY_CHECK_OVERLAP \
+  HAS_ARCH_FEATURE (index_memcpy_check_overlap, bit_memcpy_check_overlap)
+
 #endif	/* __ASSEMBLER__ */
diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
index 48c974e..2b29643 100644
--- a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
+++ b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
@@ -19,6 +19,12 @@
    02111-1307 USA.  */
 
 #include <sysdep.h>
+#include <init-arch.h>
+
+#ifndef ENTRY_LAP
+# define ENTRY_LAP(sym) ENTRY(sym)
+# define END_LAP(sym)   END(sym)
+#endif
 
 #if !defined NOT_IN_libc \
     && (defined SHARED \
@@ -30,6 +36,7 @@
 #ifndef MEMCPY
 # define MEMCPY		__memcpy_ssse3_back
 # define MEMCPY_CHK	__memcpy_chk_ssse3_back
+# define MEMCPY_LAP	__memcpy_lap_ssse3_back
 #endif
 
 #ifndef ALIGN
@@ -44,11 +51,27 @@
 #define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
   lea		TABLE(%rip), %r11;				\
   movslq	(%r11, INDEX, SCALE), INDEX;			\
-  lea		(%r11, INDEX), INDEX;				\
+  addq		%r11, INDEX;					\
   jmp		*INDEX;						\
   ud2
 
 	.section .text.ssse3,"ax",@progbits
+ENTRY_LAP (MEMCPY_LAP)  # overlap-checking front end for the MEMCPY body below
+	movq %rdi, %rax; movq %rsi, %rcx
+	subq %rsi, %rax; subq %rdi, %rcx  # rax = dst-src, rcx = src-dst (mod 2^64)
+	cmpq %rdx, %rax; jb L(badlap)  # dst in &src[0 for len)
+	                 cmpq %rdx, %rcx; jb L(badlap)  # src in &dst[0 for len)
+	jmp L(1)  # regions disjoint: fall into the ordinary copy at L(1)
+L(badlap):
+#ifdef USE_AS_MEMPCPY  /*{*/
+	leaq (%rdi,%rdx),%rcx  # arg4 = result
+#else  /*}{*/
+	movq %rdi,%rcx  # arg4 = result
+#endif  /*}*/
+	leaq  __cpu_features+FEATURE_OFFSET+index_memcpy_limit_overlap(%rip),%r8  # arg5 = &limit
+	jmp HIDDEN_JUMPTARGET(__memcpy_overlap_x86_64)  # reports overlap, then copies via memmove
+END_LAP (MEMCPY_LAP)
+
 #if defined SHARED && !defined NOT_IN_libc
 ENTRY (MEMCPY_CHK)
 	cmpq	%rdx, %rcx
@@ -57,6 +80,7 @@ END (MEMCPY_CHK)
 #endif
 
 ENTRY (MEMCPY)
+L(1):
 	mov	%rdi, %rax
 #ifdef USE_AS_MEMPCPY
 	add	%rdx, %rax
diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3.S b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
index 9a878d3..e7732e9 100644
--- a/sysdeps/x86_64/multiarch/memcpy-ssse3.S
+++ b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
@@ -19,6 +19,12 @@
    02111-1307 USA.  */
 
 #include <sysdep.h>
+#include <init-arch.h>
+
+#ifndef ENTRY_LAP
+# define ENTRY_LAP(sym) ENTRY(sym)
+# define END_LAP(sym)   END(sym)
+#endif
 
 #if !defined NOT_IN_libc \
     && (defined SHARED \
@@ -30,6 +36,7 @@
 #ifndef MEMCPY
 # define MEMCPY		__memcpy_ssse3
 # define MEMCPY_CHK	__memcpy_chk_ssse3
+# define MEMCPY_LAP	__memcpy_lap_ssse3
 #endif
 
 #ifndef ALIGN
@@ -40,15 +47,31 @@
 
 /* Branch to an entry in a jump table.  TABLE is a jump table with
    relative offsets.  INDEX is a register contains the index into the
-   jump table.  SCALE is the scale of INDEX.  */
+   jump table.  SCALE is the scale of INDEX.  Modifies condition code.  */
 #define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
   lea		TABLE(%rip), %r11;				\
   movslq	(%r11, INDEX, SCALE), INDEX;			\
-  lea		(%r11, INDEX), INDEX;				\
+  addq		%r11, INDEX;					\
   jmp		*INDEX;						\
   ud2
 
 	.section .text.ssse3,"ax",@progbits
+ENTRY_LAP (MEMCPY_LAP)  # overlap-checking front end for the MEMCPY body below
+	movq %rdi, %rax; movq %rsi, %rcx
+	subq %rsi, %rax; subq %rdi, %rcx  # rax = dst-src, rcx = src-dst (mod 2^64)
+	cmpq %rdx, %rax; jb L(badlap)  # dst in &src[0 for len)
+	                 cmpq %rdx, %rcx; jb L(badlap)  # src in &dst[0 for len)
+	jmp L(1)  # regions disjoint: fall into the ordinary copy at L(1)
+L(badlap):
+#ifdef USE_AS_MEMPCPY  /*{*/
+	leaq (%rdi,%rdx),%rcx  # arg4 = result
+#else  /*}{*/
+	movq %rdi,%rcx  # arg4 = result
+#endif  /*}*/
+	leaq  __cpu_features+FEATURE_OFFSET+index_memcpy_limit_overlap(%rip),%r8  # arg5 = &limit
+	jmp HIDDEN_JUMPTARGET(__memcpy_overlap_x86_64)  # reports overlap, then copies via memmove
+END_LAP (MEMCPY_LAP)
+
 #if defined SHARED && !defined NOT_IN_libc
 ENTRY (MEMCPY_CHK)
 	cmpq	%rdx, %rcx
@@ -57,6 +80,7 @@ END (MEMCPY_CHK)
 #endif
 
 ENTRY (MEMCPY)
+L(1):
 	mov	%rdi, %rax
 #ifdef USE_AS_MEMPCPY
 	add	%rdx, %rax
diff --git a/sysdeps/x86_64/multiarch/memcpy.S b/sysdeps/x86_64/multiarch/memcpy.S
index 8e9fb19..97d886a 100644
--- a/sysdeps/x86_64/multiarch/memcpy.S
+++ b/sysdeps/x86_64/multiarch/memcpy.S
@@ -21,6 +21,12 @@
 #include <sysdep.h>
 #include <init-arch.h>
 
+#define MEMCPY_LAP_SUBST(name1, name2) \
+	leaq name1(%rip), %rax; \
+	testl $bit_memcpy_check_overlap, __cpu_features+FEATURE_OFFSET+index_memcpy_check_overlap(%rip); \
+	jz 0f; leaq name2(%rip), %rax; \
+0:
+
 /* Define multiple versions only for the definition in lib and for
    DSO.  In static binaries we need memcpy before the initialization
    happened.  */
@@ -31,13 +37,13 @@ ENTRY(memcpy)
 	cmpl	$0, KIND_OFFSET+__cpu_features(%rip)
 	jne	1f
 	call	__init_cpu_features
-1:	leaq	__memcpy_sse2(%rip), %rax
+1:	MEMCPY_LAP_SUBST(__memcpy_sse2, __memcpy_lap_sse2)
 	testl	$bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
 	jz	2f
-	leaq	__memcpy_ssse3(%rip), %rax
+	MEMCPY_LAP_SUBST(__memcpy_ssse3, __memcpy_lap_ssse3)
 	testl	$bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
 	jz	2f
-	leaq	__memcpy_ssse3_back(%rip), %rax
+	MEMCPY_LAP_SUBST(__memcpy_ssse3_back, __memcpy_lap_ssse3_back)
 2:	ret
 END(memcpy)
 
@@ -62,6 +68,17 @@ END(memcpy)
 # define END_CHK(name) \
 	cfi_endproc; .size __memcpy_chk_sse2, .-__memcpy_chk_sse2
 
+# undef ENTRY_LAP
+# define ENTRY_LAP(name) \
+	.type __memcpy_lap_sse2, @function; \
+	.globl __memcpy_lap_sse2; \
+	.p2align 4; \
+	__memcpy_lap_sse2: cfi_startproc; \
+	CALL_MCOUNT
+# undef END_LAP
+# define END_LAP(name) \
+	cfi_endproc; .size __memcpy_lap_sse2, .-__memcpy_lap_sse2
+
 # undef libc_hidden_builtin_def
 /* It doesn't make sense to send libc-internal memcpy calls through a PLT.
    The speedup we get from using SSSE3 instruction is likely eaten away
diff --git a/sysdeps/x86_64/multiarch/memmove-ssse3-back.S b/sysdeps/x86_64/multiarch/memmove-ssse3-back.S
index f9a4e9a..91a89fa 100644
--- a/sysdeps/x86_64/multiarch/memmove-ssse3-back.S
+++ b/sysdeps/x86_64/multiarch/memmove-ssse3-back.S
@@ -1,4 +1,5 @@
 #define USE_AS_MEMMOVE
 #define MEMCPY		__memmove_ssse3_back
 #define MEMCPY_CHK	__memmove_chk_ssse3_back
+#define MEMCPY_LAP	__memmove_lap_ssse3_back
 #include "memcpy-ssse3-back.S"
diff --git a/sysdeps/x86_64/multiarch/memmove-ssse3.S b/sysdeps/x86_64/multiarch/memmove-ssse3.S
index 295430b..c723c1a 100644
--- a/sysdeps/x86_64/multiarch/memmove-ssse3.S
+++ b/sysdeps/x86_64/multiarch/memmove-ssse3.S
@@ -1,4 +1,5 @@
 #define USE_AS_MEMMOVE
 #define MEMCPY		__memmove_ssse3
 #define MEMCPY_CHK	__memmove_chk_ssse3
+#define MEMCPY_LAP	__memmove_lap_ssse3
 #include "memcpy-ssse3.S"
diff --git a/sysdeps/x86_64/multiarch/mempcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/mempcpy-ssse3-back.S
index 82ffacb..06b307d 100644
--- a/sysdeps/x86_64/multiarch/mempcpy-ssse3-back.S
+++ b/sysdeps/x86_64/multiarch/mempcpy-ssse3-back.S
@@ -1,4 +1,5 @@
 #define USE_AS_MEMPCPY
 #define MEMCPY		__mempcpy_ssse3_back
 #define MEMCPY_CHK	__mempcpy_chk_ssse3_back
+#define MEMCPY_LAP	__mempcpy_lap_ssse3_back
 #include "memcpy-ssse3-back.S"
diff --git a/sysdeps/x86_64/multiarch/mempcpy-ssse3.S b/sysdeps/x86_64/multiarch/mempcpy-ssse3.S
index 822d98e..de70740 100644
--- a/sysdeps/x86_64/multiarch/mempcpy-ssse3.S
+++ b/sysdeps/x86_64/multiarch/mempcpy-ssse3.S
@@ -1,4 +1,5 @@
 #define USE_AS_MEMPCPY
 #define MEMCPY		__mempcpy_ssse3
 #define MEMCPY_CHK	__mempcpy_chk_ssse3
+#define MEMCPY_LAP	__mempcpy_lap_ssse3
 #include "memcpy-ssse3.S"
diff --git a/sysdeps/x86_64/multiarch/mempcpy.S b/sysdeps/x86_64/multiarch/mempcpy.S
index e8152d6..02eaa96 100644
--- a/sysdeps/x86_64/multiarch/mempcpy.S
+++ b/sysdeps/x86_64/multiarch/mempcpy.S
@@ -61,6 +61,17 @@ END(__mempcpy)
 # define END_CHK(name) \
 	cfi_endproc; .size __mempcpy_chk_sse2, .-__mempcpy_chk_sse2
 
+# undef ENTRY_LAP
+# define ENTRY_LAP(name) \
+	.type __mempcpy_lap_sse2, @function; \
+	.globl __mempcpy_lap_sse2; \
+	.p2align 4; \
+	__mempcpy_lap_sse2: cfi_startproc; \
+	CALL_MCOUNT
+# undef END_LAP
+# define END_LAP(name) \
+	cfi_endproc; .size __mempcpy_lap_sse2, .-__mempcpy_lap_sse2
+
 # undef libc_hidden_def
 # undef libc_hidden_builtin_def
 /* It doesn't make sense to send libc-internal mempcpy calls through a PLT.
diff --git a/sysdeps/x86_64/multiarch/strcmp.S b/sysdeps/x86_64/multiarch/strcmp.S
index 1859289..e014283 100644
--- a/sysdeps/x86_64/multiarch/strcmp.S
+++ b/sysdeps/x86_64/multiarch/strcmp.S
@@ -21,7 +21,7 @@
 #include <sysdep.h>
 #include <init-arch.h>
 
-#ifdef USE_AS_STRNCMP
+#ifdef USE_AS_STRNCMP  /*{*/
 /* Since the counter, %r11, is unsigned, we branch to strcmp_exitz
    if the new counter > the old one or is 0.  */
 # define UPDATE_STRNCMP_COUNTER				\
@@ -37,7 +37,7 @@
 # define STRCMP_SSSE3	__strncmp_ssse3
 # define STRCMP_SSE2	__strncmp_sse2
 # define __GI_STRCMP	__GI_strncmp
-#elif defined USE_AS_STRCASECMP_L
+#elif defined USE_AS_STRCASECMP_L  /*}{*/
 # include "locale-defines.h"
 
 # define UPDATE_STRNCMP_COUNTER
@@ -46,7 +46,7 @@
 # define STRCMP_SSSE3	__strcasecmp_l_ssse3
 # define STRCMP_SSE2	__strcasecmp_l_sse2
 # define __GI_STRCMP	__GI___strcasecmp_l
-#elif defined USE_AS_STRNCASECMP_L
+#elif defined USE_AS_STRNCASECMP_L  /*}{*/
 # include "locale-defines.h"
 
 /* Since the counter, %r11, is unsigned, we branch to strcmp_exitz
@@ -64,25 +64,25 @@
 # define STRCMP_SSSE3	__strncasecmp_l_ssse3
 # define STRCMP_SSE2	__strncasecmp_l_sse2
 # define __GI_STRCMP	__GI___strncasecmp_l
-#else
+#else  /*}{*/
 # define UPDATE_STRNCMP_COUNTER
-# ifndef STRCMP
+# ifndef STRCMP  /*{*/
 #  define STRCMP	strcmp
 #  define STRCMP_SSE42	__strcmp_sse42
 #  define STRCMP_SSSE3	__strcmp_ssse3
 #  define STRCMP_SSE2	__strcmp_sse2
 #  define __GI_STRCMP	__GI_strcmp
-# endif
-#endif
+# endif  /*}*/
+#endif  /*}*/
 
-#ifndef LABEL
+#ifndef LABEL  /*{*/
 # define LABEL(l) L(l)
-#endif
+#endif  /*}*/
 
 /* Define multiple versions only for the definition in libc.  Don't
    define multiple versions for strncmp in static library since we
    need strncmp before the initialization happened.  */
-#if (defined SHARED || !defined USE_AS_STRNCMP) && !defined NOT_IN_libc
+#if (defined SHARED || !defined USE_AS_STRNCMP) && !defined NOT_IN_libc  /*{*/
 	.text
 ENTRY(STRCMP)
 	.type	STRCMP, @gnu_indirect_function
@@ -100,7 +100,7 @@ ENTRY(STRCMP)
 2:	ret
 END(STRCMP)
 
-# ifdef USE_AS_STRCASECMP_L
+# ifdef USE_AS_STRCASECMP_L  /*{*/
 ENTRY(__strcasecmp)
 	.type	__strcasecmp, @gnu_indirect_function
 	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
@@ -117,8 +117,8 @@ ENTRY(__strcasecmp)
 2:	ret
 END(__strcasecmp)
 weak_alias (__strcasecmp, strcasecmp)
-# endif
-# ifdef USE_AS_STRNCASECMP_L
+# endif  /*}*/
+# ifdef USE_AS_STRNCASECMP_L  /*{*/
 ENTRY(__strncasecmp)
 	.type	__strncasecmp, @gnu_indirect_function
 	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
@@ -135,7 +135,7 @@ ENTRY(__strncasecmp)
 2:	ret
 END(__strncasecmp)
 weak_alias (__strncasecmp, strncasecmp)
-# endif
+# endif  /*}*/
 
 /* We use 0x1a:
 	_SIDD_SBYTE_OPS
@@ -167,7 +167,7 @@ weak_alias (__strncasecmp, strncasecmp)
 	.section .text.sse4.2,"ax",@progbits
 	.align	16
 	.type	STRCMP_SSE42, @function
-# ifdef USE_AS_STRCASECMP_L
+# ifdef USE_AS_STRCASECMP_L  /*{*/
 ENTRY (__strcasecmp_sse42)
 	movq	__libc_tsd_LOCALE@gottpoff(%rip),%rax
 	movq	%fs:(%rax),%rdx
@@ -177,8 +177,8 @@ ENTRY (__strcasecmp_sse42)
 	.byte	0x0f,0x1f,0x44,0x00,0x00
 END (__strcasecmp_sse42)
 	/* FALLTHROUGH to strcasecmp_l.  */
-# endif
-# ifdef USE_AS_STRNCASECMP_L
+# endif  /*}*/
+# ifdef USE_AS_STRNCASECMP_L  /*{*/
 ENTRY (__strncasecmp_sse42)
 	movq	__libc_tsd_LOCALE@gottpoff(%rip),%rax
 	movq	%fs:(%rax),%rcx
@@ -188,7 +188,7 @@ ENTRY (__strncasecmp_sse42)
 	.byte	0x0f,0x1f,0x44,0x00,0x00
 END (__strncasecmp_sse42)
 	/* FALLTHROUGH to strncasecmp_l.  */
-# endif
+# endif  /*}*/
 
 STRCMP_SSE42:
 	cfi_startproc
@@ -197,42 +197,42 @@ STRCMP_SSE42:
 /*
  * This implementation uses SSE to compare up to 16 bytes at a time.
  */
-# ifdef USE_AS_STRCASECMP_L
+# ifdef USE_AS_STRCASECMP_L  /*{*/
 	/* We have to fall back on the C implementation for locales
 	   with encodings not matching ASCII for single bytes.  */
-#  if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+#  if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0  /*{*/
 	movq	LOCALE_T___LOCALES+LC_CTYPE*8(%rdx), %rax
-#  else
+#  else  /*}{*/
 	movq	(%rdx), %rax
-#  endif
+#  endif  /*}*/
 	testl	$0, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
 	jne	__strcasecmp_l_nonascii
-# endif
-# ifdef USE_AS_STRNCASECMP_L
+# endif  /*}*/
+# ifdef USE_AS_STRNCASECMP_L  /*{*/
 	/* We have to fall back on the C implementation for locales
 	   with encodings not matching ASCII for single bytes.  */
-#  if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+#  if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0  /*{*/
 	movq	LOCALE_T___LOCALES+LC_CTYPE*8(%rcx), %rax
-#  else
+#  else  /*}{*/
 	movq	(%rcx), %rax
-#  endif
+#  endif  /*}*/
 	testl	$0, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
 	jne	__strncasecmp_l_nonascii
-# endif
+# endif  /*}*/
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L  /*{*/
 	test	%rdx, %rdx
 	je	LABEL(strcmp_exitz_sse4_2)
 	cmp	$1, %rdx
 	je	LABEL(Byte0_sse4_2)
 	mov	%rdx, %r11
-# endif
+# endif  /*}*/
 	mov	%esi, %ecx
 	mov	%edi, %eax
 /* Use 64bit AND here to avoid long NOP padding.  */
 	and	$0x3f, %rcx		/* rsi alignment in cache line */
 	and	$0x3f, %rax		/* rdi alignment in cache line */
-# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L  /*{*/
 	.section .rodata.cst16,"aM",@progbits,16
 	.align 16
 .Lbelowupper_sse4:
@@ -251,14 +251,14 @@ STRCMP_SSE42:
 #  define UCHIGH_reg %xmm5
 	movdqa	.Ltouppermask_sse4(%rip), %xmm6
 #  define LCQWORD_reg %xmm6
-# endif
+# endif  /*}*/
 	cmp	$0x30, %ecx
 	ja	LABEL(crosscache_sse4_2)/* rsi: 16-byte load will cross cache line */
 	cmp	$0x30, %eax
 	ja	LABEL(crosscache_sse4_2)/* rdi: 16-byte load will cross cache line */
 	movdqu	(%rdi), %xmm1
 	movdqu	(%rsi), %xmm2
-# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L  /*{*/
 #  define TOLOWER(reg1, reg2) \
 	movdqa	reg1, %xmm7;					\
 	movdqa	UCHIGH_reg, %xmm8;				\
@@ -275,9 +275,9 @@ STRCMP_SSE42:
 	por	%xmm7, reg1;					\
 	por	%xmm9, reg2
 	TOLOWER (%xmm1, %xmm2)
-# else
+# else  /*}{*/
 #  define TOLOWER(reg1, reg2)
-# endif
+# endif  /*}*/
 	pxor	%xmm0, %xmm0		/* clear %xmm0 for null char checks */
 	pcmpeqb	%xmm1, %xmm0		/* Any null chars? */
 	pcmpeqb	%xmm2, %xmm1		/* compare first 16 bytes for equality */
@@ -285,10 +285,10 @@ STRCMP_SSE42:
 	pmovmskb %xmm1, %edx
 	sub	$0xffff, %edx		/* if first 16 bytes are same, edx == 0xffff */
 	jnz	LABEL(less16bytes_sse4_2)/* If not, find different value or null char */
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L  /*{*/
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz_sse4_2)/* finish comparision */
-# endif
+# endif  /*}*/
 	add	$16, %rsi		/* prepare to search next 16 bytes */
 	add	$16, %rdi		/* prepare to search next 16 bytes */
 
@@ -330,13 +330,13 @@ LABEL(ashr_0_sse4_2):
 	movdqa	(%rsi), %xmm1
 	pxor	%xmm0, %xmm0			/* clear %xmm0 for null char check */
 	pcmpeqb	%xmm1, %xmm0			/* Any null chars? */
-# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L  /*{*/
 	pcmpeqb	(%rdi), %xmm1			/* compare 16 bytes for equality */
-# else
+# else  /*}{*/
 	movdqa	(%rdi), %xmm2
 	TOLOWER (%xmm1, %xmm2)
 	pcmpeqb	%xmm2, %xmm1			/* compare 16 bytes for equality */
-# endif
+# endif  /*}*/
 	psubb	%xmm0, %xmm1			/* packed sub of comparison results*/
 	pmovmskb %xmm1, %r9d
 	shr	%cl, %edx			/* adjust 0xffff for offset */
@@ -360,52 +360,52 @@ LABEL(ashr_0_sse4_2):
 	.p2align 4
 LABEL(ashr_0_use_sse4_2):
 	movdqa	(%rdi,%rdx), %xmm0
-# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L  /*{*/
 	pcmpistri      $0x1a,(%rsi,%rdx), %xmm0
-# else
+# else  /*}{*/
 	movdqa	(%rsi,%rdx), %xmm1
 	TOLOWER (%xmm0, %xmm1)
 	pcmpistri $0x1a, %xmm1, %xmm0
-# endif
+# endif  /*}*/
 	lea	16(%rdx), %rdx
 	jbe	LABEL(ashr_0_use_sse4_2_exit)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L  /*{*/
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz_sse4_2)
-# endif
+# endif  /*}*/
 
 	movdqa	(%rdi,%rdx), %xmm0
-# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L  /*{*/
 	pcmpistri      $0x1a,(%rsi,%rdx), %xmm0
-# else
+# else  /*}{*/
 	movdqa	(%rsi,%rdx), %xmm1
 	TOLOWER (%xmm0, %xmm1)
 	pcmpistri $0x1a, %xmm1, %xmm0
-# endif
+# endif  /*}*/
 	lea	16(%rdx), %rdx
 	jbe	LABEL(ashr_0_use_sse4_2_exit)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L  /*{*/
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz_sse4_2)
-# endif
+# endif  /*}*/
 	jmp	LABEL(ashr_0_use_sse4_2)
 
 
 	.p2align 4
 LABEL(ashr_0_use_sse4_2_exit):
 	jnc	LABEL(strcmp_exitz_sse4_2)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L  /*{*/
 	sub	%rcx, %r11
 	jbe	LABEL(strcmp_exitz_sse4_2)
-# endif
+# endif  /*}*/
 	lea	-16(%rdx, %rcx), %rcx
 	movzbl	(%rdi, %rcx), %eax
 	movzbl	(%rsi, %rcx), %edx
-# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L  /*{*/
 	leaq	_nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx
 	movl	(%rcx,%rax,4), %eax
 	movl	(%rcx,%rdx,4), %edx
-# endif
+# endif  /*}*/
 	sub	%edx, %eax
 	ret
 
@@ -454,18 +454,18 @@ LABEL(loop_ashr_1_use_sse4_2):
 
 	movdqa	(%rdi, %rdx), %xmm0
 	palignr $1, -16(%rdi, %rdx), %xmm0
-# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L  /*{*/
 	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
-# else
+# else  /*}{*/
 	movdqa	(%rsi,%rdx), %xmm1
 	TOLOWER (%xmm0, %xmm1)
 	pcmpistri $0x1a, %xmm1, %xmm0
-# endif
+# endif  /*}*/
 	jbe	LABEL(use_sse4_2_exit)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L  /*{*/
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz_sse4_2)
-# endif
+# endif  /*}*/
 
 	add	$16, %rdx
 	add	$16, %r10
@@ -473,18 +473,18 @@ LABEL(loop_ashr_1_use_sse4_2):
 
 	movdqa	(%rdi, %rdx), %xmm0
 	palignr $1, -16(%rdi, %rdx), %xmm0
-# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L  /*{*/
 	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
-# else
+# else  /*}{*/
 	movdqa	(%rsi,%rdx), %xmm1
 	TOLOWER (%xmm0, %xmm1)
 	pcmpistri $0x1a, %xmm1, %xmm0
-# endif
+# endif  /*}*/
 	jbe	LABEL(use_sse4_2_exit)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L  /*{*/
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz_sse4_2)
-# endif
+# endif  /*}*/
 	add	$16, %rdx
 	jmp	LABEL(loop_ashr_1_use_sse4_2)
 
@@ -494,10 +494,10 @@ LABEL(nibble_ashr_1_use_sse4_2):
 	movdqa	-16(%rdi, %rdx), %xmm0
 	psrldq	$1, %xmm0
 	pcmpistri      $0x3a,%xmm0, %xmm0
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L  /*{*/
 	cmp	%r11, %rcx
 	jae	LABEL(nibble_ashr_use_sse4_2_exit)
-# endif
+# endif  /*}*/
 	cmp	$14, %ecx
 	ja	LABEL(loop_ashr_1_use_sse4_2)
 
@@ -546,18 +546,18 @@ LABEL(loop_ashr_2_use_sse4_2):
 
 	movdqa	(%rdi, %rdx), %xmm0
 	palignr $2, -16(%rdi, %rdx), %xmm0
-# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L  /*{*/
 	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
-# else
+# else  /*}{*/
 	movdqa	(%rsi,%rdx), %xmm1
 	TOLOWER (%xmm0, %xmm1)
 	pcmpistri $0x1a, %xmm1, %xmm0
-# endif
+# endif  /*}*/
 	jbe	LABEL(use_sse4_2_exit)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L  /*{*/
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz_sse4_2)
-# endif
+# endif  /*}*/
 
 	add	$16, %rdx
 	add	$16, %r10
@@ -565,18 +565,18 @@ LABEL(loop_ashr_2_use_sse4_2):
 
 	movdqa	(%rdi, %rdx), %xmm0
 	palignr $2, -16(%rdi, %rdx), %xmm0
-# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L  /*{*/
 	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
-# else
+# else  /*}{*/
 	movdqa	(%rsi,%rdx), %xmm1
 	TOLOWER (%xmm0, %xmm1)
 	pcmpistri $0x1a, %xmm1, %xmm0
-# endif
+# endif  /*}*/
 	jbe	LABEL(use_sse4_2_exit)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L  /*{*/
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz_sse4_2)
-# endif
+# endif  /*}*/
 	add	$16, %rdx
 	jmp	LABEL(loop_ashr_2_use_sse4_2)
 
@@ -586,10 +586,10 @@ LABEL(nibble_ashr_2_use_sse4_2):
 	movdqa	-16(%rdi, %rdx), %xmm0
 	psrldq	$2, %xmm0
 	pcmpistri      $0x3a,%xmm0, %xmm0
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L  /*{*/
 	cmp	%r11, %rcx
 	jae	LABEL(nibble_ashr_use_sse4_2_exit)
-# endif
+# endif  /*}*/
 	cmp	$13, %ecx
 	ja	LABEL(loop_ashr_2_use_sse4_2)
 
@@ -638,18 +638,18 @@ LABEL(loop_ashr_3_use_sse4_2):
 
 	movdqa	(%rdi, %rdx), %xmm0
 	palignr $3, -16(%rdi, %rdx), %xmm0
-# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L  /*{*/
 	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
-# else
+# else  /*}{*/
 	movdqa	(%rsi,%rdx), %xmm1
 	TOLOWER (%xmm0, %xmm1)
 	pcmpistri $0x1a, %xmm1, %xmm0
-# endif
+# endif  /*}*/
 	jbe	LABEL(use_sse4_2_exit)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L  /*{*/
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz_sse4_2)
-# endif
+# endif  /*}*/
 
 	add	$16, %rdx
 	add	$16, %r10
@@ -657,18 +657,18 @@ LABEL(loop_ashr_3_use_sse4_2):
 
 	movdqa	(%rdi, %rdx), %xmm0
 	palignr $3, -16(%rdi, %rdx), %xmm0
-# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L  /*{*/
 	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
-# else
+# else  /*}{*/
 	movdqa	(%rsi,%rdx), %xmm1
 	TOLOWER (%xmm0, %xmm1)
 	pcmpistri $0x1a, %xmm1, %xmm0
-# endif
+# endif  /*}*/
 	jbe	LABEL(use_sse4_2_exit)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L  /*{*/
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz_sse4_2)
-# endif
+# endif  /*}*/
 	add	$16, %rdx
 	jmp	LABEL(loop_ashr_3_use_sse4_2)
 
@@ -678,10 +678,10 @@ LABEL(nibble_ashr_3_use_sse4_2):
 	movdqa	-16(%rdi, %rdx), %xmm0
 	psrldq	$3, %xmm0
 	pcmpistri      $0x3a,%xmm0, %xmm0
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L  /*{*/
 	cmp	%r11, %rcx
 	jae	LABEL(nibble_ashr_use_sse4_2_exit)
-# endif
+# endif  /*}*/
 	cmp	$12, %ecx
 	ja	LABEL(loop_ashr_3_use_sse4_2)
 
@@ -731,18 +731,18 @@ LABEL(loop_ashr_4_use_sse4_2):
 
 	movdqa	(%rdi, %rdx), %xmm0
 	palignr $4, -16(%rdi, %rdx), %xmm0
-# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L  /*{*/
 	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
-# else
+# else  /*}{*/
 	movdqa	(%rsi,%rdx), %xmm1
 	TOLOWER (%xmm0, %xmm1)
 	pcmpistri $0x1a, %xmm1, %xmm0
-# endif
+# endif  /*}*/
 	jbe	LABEL(use_sse4_2_exit)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L  /*{*/
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz_sse4_2)
-# endif
+# endif  /*}*/
 
 	add	$16, %rdx
 	add	$16, %r10
@@ -750,18 +750,18 @@ LABEL(loop_ashr_4_use_sse4_2):
 
 	movdqa	(%rdi, %rdx), %xmm0
 	palignr $4, -16(%rdi, %rdx), %xmm0
-# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L  /*{*/
 	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
-# else
+# else  /*}{*/
 	movdqa	(%rsi,%rdx), %xmm1
 	TOLOWER (%xmm0, %xmm1)
 	pcmpistri $0x1a, %xmm1, %xmm0
-# endif
+# endif  /*}*/
 	jbe	LABEL(use_sse4_2_exit)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L  /*{*/
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz_sse4_2)
-# endif
+# endif  /*}*/
 	add	$16, %rdx
 	jmp	LABEL(loop_ashr_4_use_sse4_2)
 
@@ -771,10 +771,10 @@ LABEL(nibble_ashr_4_use_sse4_2):
 	movdqa	-16(%rdi, %rdx), %xmm0
 	psrldq	$4, %xmm0
 	pcmpistri      $0x3a,%xmm0, %xmm0
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L  /*{*/
 	cmp	%r11, %rcx
 	jae	LABEL(nibble_ashr_use_sse4_2_exit)
-# endif
+# endif  /*}*/
 	cmp	$11, %ecx
 	ja	LABEL(loop_ashr_4_use_sse4_2)
 
@@ -824,18 +824,18 @@ LABEL(loop_ashr_5_use_sse4_2):
 
 	movdqa	(%rdi, %rdx), %xmm0
 	palignr $5, -16(%rdi, %rdx), %xmm0
-# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L  /*{*/
 	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
-# else
+# else  /*}{*/
 	movdqa	(%rsi,%rdx), %xmm1
 	TOLOWER (%xmm0, %xmm1)
 	pcmpistri $0x1a, %xmm1, %xmm0
-# endif
+# endif  /*}*/
 	jbe	LABEL(use_sse4_2_exit)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L  /*{*/
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz_sse4_2)
-# endif
+# endif  /*}*/
 
 	add	$16, %rdx
 	add	$16, %r10
@@ -844,18 +844,18 @@ LABEL(loop_ashr_5_use_sse4_2):
 	movdqa	(%rdi, %rdx), %xmm0
 
 	palignr $5, -16(%rdi, %rdx), %xmm0
-# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L  /*{*/
 	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
-# else
+# else  /*}{*/
 	movdqa	(%rsi,%rdx), %xmm1
 	TOLOWER (%xmm0, %xmm1)
 	pcmpistri $0x1a, %xmm1, %xmm0
-# endif
+# endif  /*}*/
 	jbe	LABEL(use_sse4_2_exit)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L  /*{*/
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz_sse4_2)
-# endif
+# endif  /*}*/
 	add	$16, %rdx
 	jmp	LABEL(loop_ashr_5_use_sse4_2)
 
@@ -865,10 +865,10 @@ LABEL(nibble_ashr_5_use_sse4_2):
 	movdqa	-16(%rdi, %rdx), %xmm0
 	psrldq	$5, %xmm0
 	pcmpistri      $0x3a,%xmm0, %xmm0
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L  /*{*/
 	cmp	%r11, %rcx
 	jae	LABEL(nibble_ashr_use_sse4_2_exit)
-# endif
+# endif  /*}*/
 	cmp	$10, %ecx
 	ja	LABEL(loop_ashr_5_use_sse4_2)
 
@@ -918,18 +918,18 @@ LABEL(loop_ashr_6_use_sse4_2):
 
 	movdqa	(%rdi, %rdx), %xmm0
 	palignr $6, -16(%rdi, %rdx), %xmm0
-# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L  /*{*/
 	pcmpistri $0x1a,(%rsi,%rdx), %xmm0
-# else
+# else  /*}{*/
 	movdqa	(%rsi,%rdx), %xmm1
 	TOLOWER (%xmm0, %xmm1)
 	pcmpistri $0x1a, %xmm1, %xmm0
-# endif
+# endif  /*}*/
 	jbe	LABEL(use_sse4_2_exit)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L  /*{*/
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz_sse4_2)
-# endif
+# endif  /*}*/
 
 	add	$16, %rdx
 	add	$16, %r10
@@ -937,18 +937,18 @@ LABEL(loop_ashr_6_use_sse4_2):
 
 	movdqa	(%rdi, %rdx), %xmm0
 	palignr $6, -16(%rdi, %rdx), %xmm0
-# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L  /*{*/
 	pcmpistri $0x1a,(%rsi,%rdx), %xmm0
-# else
+# else  /*}{*/
 	movdqa	(%rsi,%rdx), %xmm1
 	TOLOWER (%xmm0, %xmm1)
 	pcmpistri $0x1a, %xmm1, %xmm0
-# endif
+# endif  /*}*/
 	jbe	LABEL(use_sse4_2_exit)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L  /*{*/
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz_sse4_2)
-# endif
+# endif  /*}*/
 	add	$16, %rdx
 	jmp	LABEL(loop_ashr_6_use_sse4_2)
 
@@ -958,10 +958,10 @@ LABEL(nibble_ashr_6_use_sse4_2):
 	movdqa	-16(%rdi, %rdx), %xmm0
 	psrldq	$6, %xmm0
 	pcmpistri      $0x3a,%xmm0, %xmm0
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L  /*{*/
 	cmp	%r11, %rcx
 	jae	LABEL(nibble_ashr_use_sse4_2_exit)
-# endif
+# endif  /*}*/
 	cmp	$9, %ecx
 	ja	LABEL(loop_ashr_6_use_sse4_2)
 
@@ -1011,18 +1011,18 @@ LABEL(loop_ashr_7_use_sse4_2):
 
 	movdqa	(%rdi, %rdx), %xmm0
 	palignr $7, -16(%rdi, %rdx), %xmm0
-# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L  /*{*/
 	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
-# else
+# else  /*}{*/
 	movdqa	(%rsi,%rdx), %xmm1
 	TOLOWER (%xmm0, %xmm1)
 	pcmpistri $0x1a, %xmm1, %xmm0
-# endif
+# endif  /*}*/
 	jbe	LABEL(use_sse4_2_exit)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L  /*{*/
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz_sse4_2)
-# endif
+# endif  /*}*/
 
 	add	$16, %rdx
 	add	$16, %r10
@@ -1030,18 +1030,18 @@ LABEL(loop_ashr_7_use_sse4_2):
 
 	movdqa	(%rdi, %rdx), %xmm0
 	palignr $7, -16(%rdi, %rdx), %xmm0
-# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L  /*{*/
 	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
-# else
+# else  /*}{*/
 	movdqa	(%rsi,%rdx), %xmm1
 	TOLOWER (%xmm0, %xmm1)
 	pcmpistri $0x1a, %xmm1, %xmm0
-# endif
+# endif  /*}*/
 	jbe	LABEL(use_sse4_2_exit)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L  /*{*/
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz_sse4_2)
-# endif
+# endif  /*}*/
 	add	$16, %rdx
 	jmp	LABEL(loop_ashr_7_use_sse4_2)
 
@@ -1051,10 +1051,10 @@ LABEL(nibble_ashr_7_use_sse4_2):
 	movdqa	-16(%rdi, %rdx), %xmm0
 	psrldq	$7, %xmm0
 	pcmpistri      $0x3a,%xmm0, %xmm0
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L  /*{*/
 	cmp	%r11, %rcx
 	jae	LABEL(nibble_ashr_use_sse4_2_exit)
-# endif
+# endif  /*}*/
 	cmp	$8, %ecx
 	ja	LABEL(loop_ashr_7_use_sse4_2)
 
@@ -1104,18 +1104,18 @@ LABEL(loop_ashr_8_use_sse4_2):
 
 	movdqa	(%rdi, %rdx), %xmm0
 	palignr $8, -16(%rdi, %rdx), %xmm0
-# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L  /*{*/
 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
-# else
+# else  /*}{*/
 	movdqa	(%rsi,%rdx), %xmm1
 	TOLOWER (%xmm0, %xmm1)
 	pcmpistri $0x1a, %xmm1, %xmm0
-# endif
+# endif  /*}*/
 	jbe	LABEL(use_sse4_2_exit)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L  /*{*/
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz_sse4_2)
-# endif
+# endif  /*}*/
 
 	add	$16, %rdx
 	add	$16, %r10
@@ -1123,18 +1123,18 @@ LABEL(loop_ashr_8_use_sse4_2):
 
 	movdqa	(%rdi, %rdx), %xmm0
 	palignr $8, -16(%rdi, %rdx), %xmm0
-# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L  /*{*/
 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
-# else
+# else  /*}{*/
 	movdqa	(%rsi,%rdx), %xmm1
 	TOLOWER (%xmm0, %xmm1)
 	pcmpistri $0x1a, %xmm1, %xmm0
-# endif
+# endif  /*}*/
 	jbe	LABEL(use_sse4_2_exit)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L  /*{*/
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz_sse4_2)
-# endif
+# endif  /*}*/
 	add	$16, %rdx
 	jmp	LABEL(loop_ashr_8_use_sse4_2)
 
@@ -1144,10 +1144,10 @@ LABEL(nibble_ashr_8_use_sse4_2):
 	movdqa	-16(%rdi, %rdx), %xmm0
 	psrldq	$8, %xmm0
 	pcmpistri      $0x3a,%xmm0, %xmm0
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L  /*{*/
 	cmp	%r11, %rcx
 	jae	LABEL(nibble_ashr_use_sse4_2_exit)
-# endif
+# endif  /*}*/
 	cmp	$7, %ecx
 	ja	LABEL(loop_ashr_8_use_sse4_2)
 
@@ -1198,18 +1198,18 @@ LABEL(loop_ashr_9_use_sse4_2):
 	movdqa	(%rdi, %rdx), %xmm0
 
 	palignr $9, -16(%rdi, %rdx), %xmm0
-# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L  /*{*/
 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
-# else
+# else  /*}{*/
 	movdqa	(%rsi,%rdx), %xmm1
 	TOLOWER (%xmm0, %xmm1)
 	pcmpistri $0x1a, %xmm1, %xmm0
-# endif
+# endif  /*}*/
 	jbe	LABEL(use_sse4_2_exit)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L  /*{*/
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz_sse4_2)
-# endif
+# endif  /*}*/
 
 	add	$16, %rdx
 	add	$16, %r10
@@ -1217,18 +1217,18 @@ LABEL(loop_ashr_9_use_sse4_2):
 
 	movdqa	(%rdi, %rdx), %xmm0
 	palignr $9, -16(%rdi, %rdx), %xmm0
-# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L  /*{*/
 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
-# else
+# else  /*}{*/
 	movdqa	(%rsi,%rdx), %xmm1
 	TOLOWER (%xmm0, %xmm1)
 	pcmpistri $0x1a, %xmm1, %xmm0
-# endif
+# endif  /*}*/
 	jbe	LABEL(use_sse4_2_exit)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L  /*{*/
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz_sse4_2)
-# endif
+# endif  /*}*/
 	add	$16, %rdx
 	jmp	LABEL(loop_ashr_9_use_sse4_2)
 
@@ -1238,10 +1238,10 @@ LABEL(nibble_ashr_9_use_sse4_2):
 	movdqa	-16(%rdi, %rdx), %xmm0
 	psrldq	$9, %xmm0
 	pcmpistri      $0x3a,%xmm0, %xmm0
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L  /*{*/
 	cmp	%r11, %rcx
 	jae	LABEL(nibble_ashr_use_sse4_2_exit)
-# endif
+# endif  /*}*/
 	cmp	$6, %ecx
 	ja	LABEL(loop_ashr_9_use_sse4_2)
 
@@ -1291,18 +1291,18 @@ LABEL(loop_ashr_10_use_sse4_2):
 
 	movdqa	(%rdi, %rdx), %xmm0
 	palignr $10, -16(%rdi, %rdx), %xmm0
-# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L  /*{*/
 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
-# else
+# else  /*}{*/
 	movdqa	(%rsi,%rdx), %xmm1
 	TOLOWER (%xmm0, %xmm1)
 	pcmpistri $0x1a, %xmm1, %xmm0
-# endif
+# endif  /*}*/
 	jbe	LABEL(use_sse4_2_exit)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L  /*{*/
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz_sse4_2)
-# endif
+# endif  /*}*/
 
 	add	$16, %rdx
 	add	$16, %r10
@@ -1310,18 +1310,18 @@ LABEL(loop_ashr_10_use_sse4_2):
 
 	movdqa	(%rdi, %rdx), %xmm0
 	palignr $10, -16(%rdi, %rdx), %xmm0
-# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L  /*{*/
 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
-# else
+# else  /*}{*/
 	movdqa	(%rsi,%rdx), %xmm1
 	TOLOWER (%xmm0, %xmm1)
 	pcmpistri $0x1a, %xmm1, %xmm0
-# endif
+# endif  /*}*/
 	jbe	LABEL(use_sse4_2_exit)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L  /*{*/
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz_sse4_2)
-# endif
+# endif  /*}*/
 	add	$16, %rdx
 	jmp	LABEL(loop_ashr_10_use_sse4_2)
 
@@ -1331,10 +1331,10 @@ LABEL(nibble_ashr_10_use_sse4_2):
 	movdqa	-16(%rdi, %rdx), %xmm0
 	psrldq	$10, %xmm0
 	pcmpistri      $0x3a,%xmm0, %xmm0
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L  /*{*/
 	cmp	%r11, %rcx
 	jae	LABEL(nibble_ashr_use_sse4_2_exit)
-# endif
+# endif  /*}*/
 	cmp	$5, %ecx
 	ja	LABEL(loop_ashr_10_use_sse4_2)
 
@@ -1384,18 +1384,18 @@ LABEL(loop_ashr_11_use_sse4_2):
 
 	movdqa	(%rdi, %rdx), %xmm0
 	palignr $11, -16(%rdi, %rdx), %xmm0
-# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L  /*{*/
 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
-# else
+# else  /*}{*/
 	movdqa	(%rsi,%rdx), %xmm1
 	TOLOWER (%xmm0, %xmm1)
 	pcmpistri $0x1a, %xmm1, %xmm0
-# endif
+# endif  /*}*/
 	jbe	LABEL(use_sse4_2_exit)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L  /*{*/
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz_sse4_2)
-# endif
+# endif  /*}*/
 
 	add	$16, %rdx
 	add	$16, %r10
@@ -1403,18 +1403,18 @@ LABEL(loop_ashr_11_use_sse4_2):
 
 	movdqa	(%rdi, %rdx), %xmm0
 	palignr $11, -16(%rdi, %rdx), %xmm0
-# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L  /*{*/
 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
-# else
+# else  /*}{*/
 	movdqa	(%rsi,%rdx), %xmm1
 	TOLOWER (%xmm0, %xmm1)
 	pcmpistri $0x1a, %xmm1, %xmm0
-# endif
+# endif  /*}*/
 	jbe	LABEL(use_sse4_2_exit)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L  /*{*/
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz_sse4_2)
-# endif
+# endif  /*}*/
 	add	$16, %rdx
 	jmp	LABEL(loop_ashr_11_use_sse4_2)
 
@@ -1424,10 +1424,10 @@ LABEL(nibble_ashr_11_use_sse4_2):
 	movdqa	-16(%rdi, %rdx), %xmm0
 	psrldq	$11, %xmm0
 	pcmpistri      $0x3a,%xmm0, %xmm0
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L  /*{*/
 	cmp	%r11, %rcx
 	jae	LABEL(nibble_ashr_use_sse4_2_exit)
-# endif
+# endif  /*}*/
 	cmp	$4, %ecx
 	ja	LABEL(loop_ashr_11_use_sse4_2)
 
@@ -1477,18 +1477,18 @@ LABEL(loop_ashr_12_use_sse4_2):
 
 	movdqa	(%rdi, %rdx), %xmm0
 	palignr $12, -16(%rdi, %rdx), %xmm0
-# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L  /*{*/
 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
-# else
+# else  /*}{*/
 	movdqa	(%rsi,%rdx), %xmm1
 	TOLOWER (%xmm0, %xmm1)
 	pcmpistri $0x1a, %xmm1, %xmm0
-# endif
+# endif  /*}*/
 	jbe	LABEL(use_sse4_2_exit)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L  /*{*/
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz_sse4_2)
-# endif
+# endif  /*}*/
 
 	add	$16, %rdx
 	add	$16, %r10
@@ -1496,18 +1496,18 @@ LABEL(loop_ashr_12_use_sse4_2):
 
 	movdqa	(%rdi, %rdx), %xmm0
 	palignr $12, -16(%rdi, %rdx), %xmm0
-# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L  /*{*/
 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
-# else
+# else  /*}{*/
 	movdqa	(%rsi,%rdx), %xmm1
 	TOLOWER (%xmm0, %xmm1)
 	pcmpistri $0x1a, %xmm1, %xmm0
-# endif
+# endif  /*}*/
 	jbe	LABEL(use_sse4_2_exit)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L  /*{*/
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz_sse4_2)
-# endif
+# endif  /*}*/
 	add	$16, %rdx
 	jmp	LABEL(loop_ashr_12_use_sse4_2)
 
@@ -1517,10 +1517,10 @@ LABEL(nibble_ashr_12_use_sse4_2):
 	movdqa	-16(%rdi, %rdx), %xmm0
 	psrldq	$12, %xmm0
 	pcmpistri      $0x3a,%xmm0, %xmm0
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L  /*{*/
 	cmp	%r11, %rcx
 	jae	LABEL(nibble_ashr_use_sse4_2_exit)
-# endif
+# endif  /*}*/
 	cmp	$3, %ecx
 	ja	LABEL(loop_ashr_12_use_sse4_2)
 
@@ -1571,18 +1571,18 @@ LABEL(loop_ashr_13_use_sse4_2):
 
 	movdqa	(%rdi, %rdx), %xmm0
 	palignr $13, -16(%rdi, %rdx), %xmm0
-# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L  /*{*/
 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
-# else
+# else  /*}{*/
 	movdqa	(%rsi,%rdx), %xmm1
 	TOLOWER (%xmm0, %xmm1)
 	pcmpistri $0x1a, %xmm1, %xmm0
-# endif
+# endif  /*}*/
 	jbe	LABEL(use_sse4_2_exit)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L  /*{*/
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz_sse4_2)
-# endif
+# endif  /*}*/
 
 	add	$16, %rdx
 	add	$16, %r10
@@ -1590,18 +1590,18 @@ LABEL(loop_ashr_13_use_sse4_2):
 
 	movdqa	(%rdi, %rdx), %xmm0
 	palignr $13, -16(%rdi, %rdx), %xmm0
-# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L  /*{*/
 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
-# else
+# else  /*}{*/
 	movdqa	(%rsi,%rdx), %xmm1
 	TOLOWER (%xmm0, %xmm1)
 	pcmpistri $0x1a, %xmm1, %xmm0
-# endif
+# endif  /*}*/
 	jbe	LABEL(use_sse4_2_exit)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L  /*{*/
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz_sse4_2)
-# endif
+# endif  /*}*/
 	add	$16, %rdx
 	jmp	LABEL(loop_ashr_13_use_sse4_2)
 
@@ -1611,10 +1611,10 @@ LABEL(nibble_ashr_13_use_sse4_2):
 	movdqa	-16(%rdi, %rdx), %xmm0
 	psrldq	$13, %xmm0
 	pcmpistri      $0x3a,%xmm0, %xmm0
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L  /*{*/
 	cmp	%r11, %rcx
 	jae	LABEL(nibble_ashr_use_sse4_2_exit)
-# endif
+# endif  /*}*/
 	cmp	$2, %ecx
 	ja	LABEL(loop_ashr_13_use_sse4_2)
 
@@ -1665,18 +1665,18 @@ LABEL(loop_ashr_14_use_sse4_2):
 
 	movdqa	(%rdi, %rdx), %xmm0
 	palignr $14, -16(%rdi, %rdx), %xmm0
-# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L  /*{*/
 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
-# else
+# else  /*}{*/
 	movdqa	(%rsi,%rdx), %xmm1
 	TOLOWER (%xmm0, %xmm1)
 	pcmpistri $0x1a, %xmm1, %xmm0
-# endif
+# endif  /*}*/
 	jbe	LABEL(use_sse4_2_exit)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L  /*{*/
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz_sse4_2)
-# endif
+# endif  /*}*/
 
 	add	$16, %rdx
 	add	$16, %r10
@@ -1684,18 +1684,18 @@ LABEL(loop_ashr_14_use_sse4_2):
 
 	movdqa	(%rdi, %rdx), %xmm0
 	palignr $14, -16(%rdi, %rdx), %xmm0
-# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L  /*{*/
 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
-# else
+# else  /*}{*/
 	movdqa	(%rsi,%rdx), %xmm1
 	TOLOWER (%xmm0, %xmm1)
 	pcmpistri $0x1a, %xmm1, %xmm0
-# endif
+# endif  /*}*/
 	jbe	LABEL(use_sse4_2_exit)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L  /*{*/
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz_sse4_2)
-# endif
+# endif  /*}*/
 	add	$16, %rdx
 	jmp	LABEL(loop_ashr_14_use_sse4_2)
 
@@ -1705,10 +1705,10 @@ LABEL(nibble_ashr_14_use_sse4_2):
 	movdqa	-16(%rdi, %rdx), %xmm0
 	psrldq	$14, %xmm0
 	pcmpistri      $0x3a,%xmm0, %xmm0
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L  /*{*/
 	cmp	%r11, %rcx
 	jae	LABEL(nibble_ashr_use_sse4_2_exit)
-# endif
+# endif  /*}*/
 	cmp	$1, %ecx
 	ja	LABEL(loop_ashr_14_use_sse4_2)
 
@@ -1761,18 +1761,18 @@ LABEL(loop_ashr_15_use_sse4_2):
 
 	movdqa	(%rdi, %rdx), %xmm0
 	palignr $15, -16(%rdi, %rdx), %xmm0
-# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L  /*{*/
 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
-# else
+# else  /*}{*/
 	movdqa	(%rsi,%rdx), %xmm1
 	TOLOWER (%xmm0, %xmm1)
 	pcmpistri $0x1a, %xmm1, %xmm0
-# endif
+# endif  /*}*/
 	jbe	LABEL(use_sse4_2_exit)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L  /*{*/
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz_sse4_2)
-# endif
+# endif  /*}*/
 
 	add	$16, %rdx
 	add	$16, %r10
@@ -1780,18 +1780,18 @@ LABEL(loop_ashr_15_use_sse4_2):
 
 	movdqa	(%rdi, %rdx), %xmm0
 	palignr $15, -16(%rdi, %rdx), %xmm0
-# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L  /*{*/
 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
-# else
+# else  /*}{*/
 	movdqa	(%rsi,%rdx), %xmm1
 	TOLOWER (%xmm0, %xmm1)
 	pcmpistri $0x1a, %xmm1, %xmm0
-# endif
+# endif  /*}*/
 	jbe	LABEL(use_sse4_2_exit)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L  /*{*/
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz_sse4_2)
-# endif
+# endif  /*}*/
 	add	$16, %rdx
 	jmp	LABEL(loop_ashr_15_use_sse4_2)
 
@@ -1801,28 +1801,28 @@ LABEL(nibble_ashr_15_use_sse4_2):
 	movdqa	-16(%rdi, %rdx), %xmm0
 	psrldq	$15, %xmm0
 	pcmpistri      $0x3a,%xmm0, %xmm0
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L  /*{*/
 	cmp	%r11, %rcx
 	jae	LABEL(nibble_ashr_use_sse4_2_exit)
-# endif
+# endif  /*}*/
 	cmp	$0, %ecx
 	ja	LABEL(loop_ashr_15_use_sse4_2)
 
 LABEL(nibble_ashr_use_sse4_2_exit):
-# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L  /*{*/
 	pcmpistri      $0x1a,(%rsi,%rdx), %xmm0
-# else
+# else  /*}{*/
 	movdqa	(%rsi,%rdx), %xmm1
 	TOLOWER (%xmm0, %xmm1)
 	pcmpistri $0x1a, %xmm1, %xmm0
-# endif
+# endif  /*}*/
 	.p2align 4
 LABEL(use_sse4_2_exit):
 	jnc	LABEL(strcmp_exitz_sse4_2)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L  /*{*/
 	sub	%rcx, %r11
 	jbe	LABEL(strcmp_exitz_sse4_2)
-# endif
+# endif  /*}*/
 	add	%rcx, %rdx
 	lea	-16(%rdi, %r9), %rdi
 	movzbl	(%rdi, %rdx), %eax
@@ -1831,11 +1831,11 @@ LABEL(use_sse4_2_exit):
 	jz	LABEL(use_sse4_2_ret_sse4_2)
 	xchg	%eax, %edx
 LABEL(use_sse4_2_ret_sse4_2):
-# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L  /*{*/
 	leaq	_nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx
 	movl	(%rcx,%rdx,4), %edx
 	movl	(%rcx,%rax,4), %eax
-# endif
+# endif  /*}*/
 
 	sub	%edx, %eax
 	ret
@@ -1852,18 +1852,18 @@ LABEL(ret_sse4_2):
 LABEL(less16bytes_sse4_2):
 	bsf	%rdx, %rdx		/* find and store bit index in %rdx */
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L  /*{*/
 	sub	%rdx, %r11
 	jbe	LABEL(strcmp_exitz_sse4_2)
-# endif
+# endif  /*}*/
 	movzbl	(%rsi, %rdx), %ecx
 	movzbl	(%rdi, %rdx), %eax
 
-# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L  /*{*/
 	leaq	_nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
 	movl	(%rdx,%rcx,4), %ecx
 	movl	(%rdx,%rax,4), %eax
-# endif
+# endif  /*}*/
 
 	sub	%ecx, %eax
 	ret
@@ -1878,11 +1878,11 @@ LABEL(Byte0_sse4_2):
 	movzx	(%rsi), %ecx
 	movzx	(%rdi), %eax
 
-# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L  /*{*/
 	leaq	_nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
 	movl	(%rdx,%rcx,4), %ecx
 	movl	(%rdx,%rax,4), %eax
-# endif
+# endif  /*}*/
 
 	sub	%ecx, %eax
 	ret
@@ -1926,7 +1926,7 @@ LABEL(unaligned_table_sse4_2):
 # define END(name) \
 	cfi_endproc; .size STRCMP_SSE2, .-STRCMP_SSE2
 
-# ifdef USE_AS_STRCASECMP_L
+# ifdef USE_AS_STRCASECMP_L  /*{*/
 #  define ENTRY2(name) \
 	.type __strcasecmp_sse2, @function; \
 	.align 16; \
@@ -1934,9 +1934,9 @@ LABEL(unaligned_table_sse4_2):
 	CALL_MCOUNT
 #  define END2(name) \
 	cfi_endproc; .size __strcasecmp_sse2, .-__strcasecmp_sse2
-# endif
+# endif  /*}*/
 
-# ifdef USE_AS_STRNCASECMP_L
+# ifdef USE_AS_STRNCASECMP_L  /*{*/
 #  define ENTRY2(name) \
 	.type __strncasecmp_sse2, @function; \
 	.align 16; \
@@ -1944,7 +1944,7 @@ LABEL(unaligned_table_sse4_2):
 	CALL_MCOUNT
 #  define END2(name) \
 	cfi_endproc; .size __strncasecmp_sse2, .-__strncasecmp_sse2
-# endif
+# endif  /*}*/
 
 # undef libc_hidden_builtin_def
 /* It doesn't make sense to send libc-internal strcmp calls through a PLT.
@@ -1952,6 +1952,6 @@ LABEL(unaligned_table_sse4_2):
    by the indirect call in the PLT.  */
 # define libc_hidden_builtin_def(name) \
 	.globl __GI_STRCMP; __GI_STRCMP = STRCMP_SSE2
-#endif
+#endif  /*}*/
 
 #include "../strcmp.S"
diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S
index f4fa16c..b228811 100644
--- a/sysdeps/x86_64/strcmp.S
+++ b/sysdeps/x86_64/strcmp.S
@@ -28,8 +28,6 @@
 
 #undef UPDATE_STRNCMP_COUNTER
 
-/* Only change: add commented braces for parenthesis matching in editors. */
-
 #ifndef LABEL  /*{*/
 #define LABEL(l) L(l)
 #endif  /*}*/
@@ -139,9 +137,13 @@ L(oop):	movb	(%rdi), %al
 	jne	L(neq)
 	incq	%rdi
 	incq	%rsi
+#ifdef USE_AS_STRNCMP  /*{*/
+	decq	%rdx
+	jz	L(eq)
+#endif  /*}*/
 	testb	%al, %al
 	jnz	L(oop)
-
+L(eq):
 	xorl	%eax, %eax
 	ret
 
@@ -150,7 +152,7 @@ L(neq):	movl	$1, %eax
 	cmovbl	%ecx, %eax
 	ret
 END (BP_SYM (STRCMP))
-#else	/* NOT_IN_libc */  /*}{*/
+#else  /*}{ ! NOT_IN_libc */
 # ifdef USE_AS_STRCASECMP_L  /*{*/
 	/* We have to fall back on the C implementation for locales
 	   with encodings not matching ASCII for single bytes.  */
