Merge pull request #32 from mstorsjo/cosmetics

Consistently use unix newlines, remove trailing whitespace
2013-12-13 08:54:14 -08:00 · 2013-12-13 08:54:14 -08:00 · a913cc853e
commit a913cc853e
parent 90e0057ba6 f9dea46712
52 changed files with 10693 additions and 10694 deletions
--- a/README.md
+++ b/README.md
@ -1,7 +1,7 @@
 OpenH264
 =======
 OpenH264 is a codec library which supports H.264 encoding and decoding. It is suitable for use in real time applications such as WebRTC. See http://www.openh264.org/ for more details.
- 
+
 Encoder Features
 ------------------------
 - Constrained Baseline Profile up to Level 5.2 (4096x2304)
@ -17,10 +17,10 @@ Encoder Features
 - Single reference frame for inter prediction
 - Multiple reference frames when using LTR and/or 3-4 temporal layers
 - Periodic and on-demand Instantaneous Decoder Refresh (IDR) frame insertion
- Dynamic changes to bit rate, frame rate, and resolution 
+- Dynamic changes to bit rate, frame rate, and resolution
 - Annex B byte stream output
 - YUV 4:2:0 planar input
- 
+
 Decoder Features
 ------------------------
 - Constrained Baseline Profile up to Level 5.2 (4096x2304)
@ -32,7 +32,7 @@ Decoder Features
 - Multiple reference frames when specified in Sequence Parameter Set (SPS)
 - Annex B byte stream input
 - YUV 4:2:0 planar output
- 
+
 OS Support
 ----------------
 - Windows 64-bit and 32-bit (initial release is only 32-bit, 64-bit will follow soon)
@ -40,7 +40,7 @@ OS Support
 - Linux 64-bit and 32-bit (initial release is only 32-bit, 64-bit will follow soon)
 - Android 32-bit (initial release does not include this target, will follow soon)
 - iOS 64-bit and 32-bit (not supported yet, may be added in the future)
- 
+
 Processor Support
 -------------------------
 - Intel x86 optionally with MMX/SSE (no AVX yet, help is welcome)
@ -53,37 +53,37 @@ Linux makefiles for 32 bit builds are available:
    : build the decoder library and executable via codec/build/linux/dec/makefile
    : build the encoder library and executable via codec/build/linux/enc/makefile
    : build the encoder shared library via processing/build/linux/makefile
- 
+
 Windows Visual Studio 2008/2010/2012 projects are available:
    : build the decoder via the Visual Studio projects in codec/build/win32/dec
    : build the encoder via the Visual Studio projects in codec/build/win32/enc
    : build the encoder shared library via the Visual Studio projects in processing/build/win32/
- 
+
 NASM needs to be installed to build the assembly code: version 2.07 or above is known to work, and nasm can be downloaded from http://www.nasm.us/
- 
+
 API details to be provided later.
- 
+
 Using the Test App
 -------------------------
 Linux shell scripts to build the test apps:
    : build via testbin/AutoBuild_Linux.sh
    : clean via testbin/AutoClean_Linux.sh
- 
+
 Windows batch files to build the test apps:
    : Visual Studio 2008 use testbin/AutoBuild_Windows_VS2008.bat
    : Visual Studio 2010 use testbin/AutoBuild_Windows_VS2010.bat
    : Visual Studio 2012 use testbin/AutoBuild_Windows_VS2012.bat
- 
+
 Usage information can be found in testbin/CmdLineReadMe
 Command line options and details to be provided later.
- 
+
 Using the Source
 -----------------------
 codec - encoder, decoder, console (test app), build (makefile, vcproj)
 processing - raw pixel processing (used by encoder)
 testbin - autobuild scripts, test app config files, yuv test files
 bin - binaries for library and test app
- 
+
 Known Issues
 -------------------
 See the issue tracker on https://github.com/cisco/openh264/issues
@ -91,7 +91,7 @@ See the issue tracker on https://github.com/cisco/openh264/issues
 - Encoder errors when compressed frame size exceeds half uncompressed size
 - Encoder console app only supports width/height that are multiples of 16 for now
 - Decoder errors when compressed frame size exceeds 1MB
- 
+
 License
 ----------
 BSD, see LICENSE file for details.
--- a/build/mktargets.py
+++ b/build/mktargets.py
@ -19,7 +19,7 @@ def make_o(x):
 def write_cpp_rule(f, x):
    src = "$(%s_SRCDIR)/%s"%(PREFIX, x)
    dst = "$(%s_SRCDIR)/%s"%(PREFIX, make_o(x))
-    
+
    f.write("%s: %s\n"%(dst, src))
    f.write('\t$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(' + PREFIX + '_CFLAGS) $(' + PREFIX + '_INCLUDES) -c -o ' + dst + ' ' + src + '\n');
    f.write("\n")
@ -27,7 +27,7 @@ def write_cpp_rule(f, x):
 def write_asm_rule(f, x):
    src = "$(%s_SRCDIR)/%s"%(PREFIX, x)
    dst = "$(%s_SRCDIR)/%s"%(PREFIX, make_o(x))
-    
+
    f.write("%s: %s\n"%(dst, src))
    f.write('\t$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(' + PREFIX + '_ASMFLAGS) $(' + PREFIX + '_ASM_INCLUDES) -o ' + dst + ' ' + src + '\n');
    f.write("\n")
@ -70,7 +70,7 @@ f.write("%s_SRCDIR=%s\n"%(PREFIX, args.directory))
 f.write("%s_CPP_SRCS=\\\n"%(PREFIX))
 for c in cpp:
    f.write("\t$(%s_SRCDIR)/%s\\\n"%(PREFIX, c))
-f.write("\n")    
+f.write("\n")
 f.write("%s_OBJS += $(%s_CPP_SRCS:.cpp=.o)\n"%(PREFIX, PREFIX))

 f.write("ifeq ($(USE_ASM), Yes)\n");
--- a/codec/build/linux/dec/makefile
+++ b/codec/build/linux/dec/makefile
@ -25,7 +25,7 @@ GCC = gcc -m32
 ASFLAGS= -f elf -DNOPREFIX -I ../../../decoder/core/asm/

 LIBS= -lstdc++ -ldl
-#-lm 
+#-lm
 CFLAGS=  $(INCLUDE) -fPIC -D__GCC__ -DLINUX -D__NO_CTYPE -DHAVE_CACHE_LINE_ALIGN

 ifeq ($(DBG),1)
@ -65,7 +65,7 @@ $(CORESRCDIR)/decoder_core.cpp \
 $(CORESRCDIR)/utils.cpp \
 $(PLUSSRCDIR)/welsDecoderExt.cpp \
 $(PLUSSRCDIR)/welsCodecTrace.cpp \
-$(COMMONSRCDIR)/logging.cpp 
+$(COMMONSRCDIR)/logging.cpp

 ASMSRC= $(ASMSRCDIR)/block_add.asm \
 $(ASMSRCDIR)/cpuid.asm \
@ -78,7 +78,7 @@ $(ASMSRCDIR)/mb_copy.asm \
 $(ASMSRCDIR)/mc_luma.asm \
 $(ASMSRCDIR)/memzero.asm \
 $(ASMSRCDIR)/asm_inc.asm \
- 
+
 MAINSRC= $(MAINSRCDIR)/d3d9_utils.cpp \
 $(MAINSRCDIR)/h264dec.cpp \
 $(MAINSRCDIR)/read_config.cpp
@ -119,7 +119,7 @@ $(OBJDIR)/mc_chroma.o \
 $(OBJDIR)/mb_copy.o \
 $(OBJDIR)/mc_luma.o \
 $(OBJDIR)/memzero.o \
-$(OBJDIR)/asm_inc.o 
+$(OBJDIR)/asm_inc.o
 endif

 OBJBIN=	$(OBJDIR)/d3d9_utils.o \
@ -134,7 +134,7 @@ default: depend checkdir lib dylib exe release

 dependencies:
 	@echo "" >dependencies
-	
+
 checkdir:
 	@echo 'checkdir..'
 	@if test ! -d $(BINDIR) ; \
@ -154,7 +154,7 @@ checkdir:
 		mkdir -p $(OBJDIR) ; \
 	fi
 	@echo
-	
+
 release:
 	@echo 'release..'
 	@echo 'cp -f $(SHAREDLIB) $(OUTDIR)'
@ -169,14 +169,14 @@ clean:
 	@rm -f $(OBJBIN)
 	@rm -f $(BINLIB)
 	@rm -f $(SHAREDLIB)
-	@rm -f $(BIN)    
+	@rm -f $(BIN)

 tags:
 	@echo update tag table
 	@etags $(CORESRCDIR)/*.c $(CORESRCDIR)/*.cpp $(PLUSSRCDIR)/*.cpp $(MAINSRCDIR)/*.cpp
-	
-	
-lib:   	$(OBJDEC) 
+
+
+lib:   	$(OBJDEC)
 	@echo '$(OBJDEC)'
 	@echo
 	@echo 'ar cr $(BINLIB) $(OBJDEC)'
@ -197,15 +197,15 @@ dylib:   $(OBJDEC)
 	@$(CXX)  -shared -Wl,-Bsymbolic -o $(SHAREDLIB) $(OBJDEC)  $(LIBS)
 	@echo '... done'
 	@echo
-	
+

 exe:	$(OBJBIN)
-	@echo	
+	@echo
 	@echo '$(OBJBIN)'
 	@echo
 	@echo '$(CXX) $(LIBS) $(OBJBIN) $(BINLIB) -o $(BIN)'
 	@echo 'creating binary "$(BIN)"'
-	@$(CXX) $(OBJBIN) $(BINLIB) -o $(BIN) $(LIBS) 
+	@$(CXX) $(OBJBIN) $(BINLIB) -o $(BIN) $(LIBS)
 	@echo '... done'
 	@echo

@ -223,31 +223,31 @@ depend:

 $(OBJDIR)/%.o$(SUFFIX): $(CORESRCDIR)/%.c
 	@echo 'compiling object file "$@" ...'
-	@$(CC) -m32 -c $(CFLAGS) -o $@ $<		
+	@$(CC) -m32 -c $(CFLAGS) -o $@ $<

 $(OBJDIR)/%.o$(SUFFIX): $(CORESRCDIR)/%.cpp
 	@echo 'compiling object file "$@" ...'
 	@$(CC) -m32 -c $(CFLAGS) -o $@ $<
-		
+
 $(OBJDIR)/%.o$(SUFFIX): $(PLUSSRCDIR)/%.cpp
 	@echo 'compiling object file "$@" ...'
-	@$(CC) -m32 -c $(CFLAGS) -o $@ $<		
-	
+	@$(CC) -m32 -c $(CFLAGS) -o $@ $<
+
 $(OBJDIR)/%.o$(SUFFIX): $(ASMSRCDIR)/%.asm
 	@echo 'compiling object file "$@" ...'
-	@$(AS) $(ASFLAGS) -o $@ $<	
+	@$(AS) $(ASFLAGS) -o $@ $<

 #$(OBJDIR)/%.o$(SUFFIX): $(ASMCOMDIR)/%.asm
 #	@echo 'compiling object file "$@" ...'
 #	@$(AS) $(ASFLAGS) -o $@ $<
-	
+
 $(OBJDIR)/%.o$(SUFFIX): $(MAINSRCDIR)/%.cpp
 	@echo 'compiling object file "$@" ...'
-	@$(CC) -m32 -c $(CFLAGS) -o $@ $<		
+	@$(CC) -m32 -c $(CFLAGS) -o $@ $<

 $(OBJDIR)/%.o$(SUFFIX): $(COMMONSRCDIR)/%.cpp
 	@echo 'compiling object file "$@" ...'
 	@$(CC) -m32 -c $(CFLAGS) -o $@ $<
-	
+
 include $(DEPEND)

--- a/codec/build/linux/enc/makefile
+++ b/codec/build/linux/enc/makefile
@ -26,8 +26,8 @@ GCC = gcc -m32
 ASFLAGS= -f elf -DNOPREFIX -I ../../../encoder/core/asm/

 LIBS= -lstdc++ -ldl -lpthread -lm
-#-lm 
-CFLAGS=  $(INCLUDE) -m32 -fPIC -D__GCC__ -DLINUX -D__NO_CTYPE -DWELS_SVC -DENCODER_CORE -DHAVE_CACHE_LINE_ALIGN -DWELS_TESTBED -DMT_ENABLED 
+#-lm
+CFLAGS=  $(INCLUDE) -m32 -fPIC -D__GCC__ -DLINUX -D__NO_CTYPE -DWELS_SVC -DENCODER_CORE -DHAVE_CACHE_LINE_ALIGN -DWELS_TESTBED -DMT_ENABLED

 ifeq ($(DBG),1)
 #SUFFIX= .dbg
@ -150,7 +150,7 @@ $(OBJDIR)/quant.o \
 $(OBJDIR)/satd_sad.o \
 $(OBJDIR)/score.o \
 $(OBJDIR)/asm_inc.o \
-$(OBJDIR)/vaa.o 
+$(OBJDIR)/vaa.o
 endif
 OBJBIN=	$(OBJDIR)/read_config.o \
 $(OBJDIR)/welsenc.o
@ -163,7 +163,7 @@ default: depend checkdir lib dylib exe release

 dependencies:
 	@echo "" >dependencies
-	
+
 checkdir:
 	@echo 'checkdir..'
 	@if test ! -d $(OUTDIR) ; \
@ -195,9 +195,9 @@ clean:
 tags:
 	@echo update tag table
 	@etags $(THREADLIBSRCDIR)/*.cpp $(COMMSRCDIR)/*.cpp $(CORESRCDIR)/*.cpp $(PLUSSRCDIR)/*.cpp $(MAINSRCDIR)/*.cpp
-	
-	
-lib:   	$(OBJENC) 
+
+
+lib:   	$(OBJENC)
 	@echo '$(OBJENC)'
 	@echo
 	@echo 'ar cr $(BINLIB) $(OBJENC)'
@ -218,7 +218,7 @@ dylib:   $(OBJDEC)
 	@$(GCC)  -shared -Wl,-Bsymbolic -m32 -o $(SHAREDLIB) $(OBJENC)  $(LIBS)
 	@echo '... done'
 	@echo
-	
+
 release:
 	@echo 'release..'
 	@echo 'cp -f $(SHAREDLIB) $(OUTDIR)'
@ -228,7 +228,7 @@ release:
 	@echo

 exe:	$(OBJBIN)
-	@echo	
+	@echo
 	@echo '$(OBJBIN)'
 	@echo
 	@echo '$(GCC) $(LIBS) $(OBJBIN) $(BINLIB) -m32 -o $(BIN)'
@ -251,24 +251,24 @@ $(OBJDIR)/%.o$(SUFFIX): $(THREADLIBSRCDIR)/%.cpp

 $(OBJDIR)/%.o$(SUFFIX): $(CORESRCDIR)/%.cpp
 	@echo 'compiling object file "$@" ...'
-	@$(CC) -m32 -c $(CFLAGS) -o $@ $<		
-	
+	@$(CC) -m32 -c $(CFLAGS) -o $@ $<
+
 $(OBJDIR)/%.o$(SUFFIX): $(PLUSSRCDIR)/%.cpp
 	@echo 'compiling object file "$@" ...'
-	@$(CC) -m32 -c $(CFLAGS) -o $@ $<		
+	@$(CC) -m32 -c $(CFLAGS) -o $@ $<

 $(OBJDIR)/%.o$(SUFFIX): $(ASMSRCDIR)/%.asm
 	@echo 'compiling object file "$@" ...'
-	@$(AS) $(ASFLAGS) -o $@ $<	
-	
+	@$(AS) $(ASFLAGS) -o $@ $<
+
 $(OBJDIR)/%.o$(SUFFIX): $(MAINSRCDIR)/%.cpp
 	@echo 'compiling object file "$@" ...'
-	@$(CC) -m32 -c $(CFLAGS) -o $@ $<	
-	
+	@$(CC) -m32 -c $(CFLAGS) -o $@ $<
+
 $(OBJDIR)/%.o$(SUFFIX): $(MAINSRCDIR)/%.cpp
 	@echo 'compiling object file "$@" ...'
-	@$(CC) -m32 -c $(CFLAGS) -o $@ $<	
-	
+	@$(CC) -m32 -c $(CFLAGS) -o $@ $<
+
 $(OBJDIR)/%.o$(SUFFIX): $(COMMONSRCDIR)/%.cpp
 	@echo 'compiling object file "$@" ...'
 	@$(CC) -m32 -c $(CFLAGS) -o $@ $<
--- a/codec/decoder/core/asm/asm_inc.asm
+++ b/codec/decoder/core/asm/asm_inc.asm
@ -43,7 +43,7 @@
 ; Options, for DEBUG
 ;***********************************************************************

-%if 1 
+%if 1
 	%define MOVDQ movdqa
 %else
 	%define MOVDQ movdqu
@ -58,7 +58,7 @@
 BITS 32

 ;***********************************************************************
-; Macros 
+; Macros
 ;***********************************************************************

 %macro WELS_EXTERN 1
@ -74,7 +74,7 @@ BITS 32
 	pxor        %2, %2
    psubw       %2, %1
    pmaxsw      %1, %2
-%endmacro 	
+%endmacro

 %macro MMX_XSwap  4
    movq		%4, %2
@ -105,7 +105,7 @@ BITS 32
    SSE2_XSawp qdq, %5, %2, %3
 %endmacro

-;in: xmm0, xmm1, xmm2, xmm3  pOut:  xmm0, xmm1, xmm3, xmm4 
+;in: xmm0, xmm1, xmm2, xmm3  pOut:  xmm0, xmm1, xmm3, xmm4
 %macro SSE2_TransTwo4x4W 5
    SSE2_XSawp wd,  %1, %2, %5
    SSE2_XSawp wd,  %3, %4, %2
@ -125,26 +125,26 @@ BITS 32
 	movdqa	%6, %9
 	movdqa	%9, %4
 	SSE2_XSawp bw,  %7, %6, %4
-	
-	SSE2_XSawp wd,  %1, %3, %6	
+
+	SSE2_XSawp wd,  %1, %3, %6
 	SSE2_XSawp wd,  %8, %2, %3
 	SSE2_XSawp wd,  %5, %7, %2
 	movdqa	%7, %9
-	movdqa	%9, %3	
+	movdqa	%9, %3
 	SSE2_XSawp wd,  %7, %4, %3
-	
-	SSE2_XSawp dq,  %1, %5, %4	
+
+	SSE2_XSawp dq,  %1, %5, %4
 	SSE2_XSawp dq,  %6, %2, %5
 	SSE2_XSawp dq,  %8, %7, %2
 	movdqa	%7, %9
-	movdqa	%9, %5		
+	movdqa	%9, %5
 	SSE2_XSawp dq,  %7, %3, %5
-	
+
 	SSE2_XSawp qdq,  %1, %8, %3
 	SSE2_XSawp qdq,  %4, %2, %8
 	SSE2_XSawp qdq,  %6, %7, %2
 	movdqa	%7, %9
-	movdqa	%9, %1		
+	movdqa	%9, %1
 	SSE2_XSawp qdq,  %7, %5, %1
 	movdqa	%5, %9
 %endmacro
@ -170,9 +170,9 @@ BITS 32
 %macro butterfly_1to16_sse	3	; xmm? for dst, xmm? for tmp, one byte for pSrc [generic register name: a/b/c/d]
 	mov %3h, %3l
 	movd %1, e%3x		; i.e., %1 = eax (=b0)
-	pshuflw %2, %1, 00h	; ..., b0 b0 b0 b0 b0 b0 b0 b0	
-	pshufd %1, %2, 00h	; b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0	
-%endmacro  
+	pshuflw %2, %1, 00h	; ..., b0 b0 b0 b0 b0 b0 b0 b0
+	pshufd %1, %2, 00h	; b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0
+%endmacro

 ;copy a dw into a xmm for 8 times
 %macro  SSE2_Copy8Times 2
--- a/codec/decoder/core/asm/block_add.asm
+++ b/codec/decoder/core/asm/block_add.asm
@ -48,7 +48,7 @@ BITS 32
 ; Macros and other preprocessor constants
 ;*******************************************************************************

-%macro   BLOCK_ADD_16_SSE2   4 
+%macro   BLOCK_ADD_16_SSE2   4
 	movdqa    xmm0,       [%2]
 	movdqa    xmm1,       [%3]
    movdqa    xmm2,       [%3+10h]
@ -65,7 +65,7 @@ BITS 32

 	lea          %2,      [%2+%4]
 	lea          %3,      [%3+%4*2]
-	lea          %1,      [%1+%4] 
+	lea          %1,      [%1+%4]
 %endmacro

 %macro    BLOCK_ADD_8_MMXEXT   4
@ -106,7 +106,7 @@ BITS 32

 	lea          %2,      [%2+%4]
 	lea          %3,      [%3+%5*2]
-	lea          %1,      [%1+%4] 
+	lea          %1,      [%1+%4]
 %endmacro


@ -130,24 +130,24 @@ BITS 32
 	lea          %1,      [%1+%4]
 %endmacro

-%macro    BLOCK_ADD_8_STRIDE_2_LINES_SSE2   5    
+%macro    BLOCK_ADD_8_STRIDE_2_LINES_SSE2   5
 	movdqa xmm1, [%3]
 	movq xmm0, [%2]
 	punpcklbw xmm0, xmm7
 	paddw xmm0, xmm1
 	packuswb xmm0, xmm7
-	movq [%1], xmm0	
-	
+	movq [%1], xmm0
+
 	movdqa xmm3, [%3+%5*2]
 	movq xmm2, [%2+%4]
 	punpcklbw xmm2, xmm7
 	paddw xmm2, xmm3
-	packuswb xmm2, xmm7	
-	movq [%1+%4], xmm2	
-	
+	packuswb xmm2, xmm7
+	movq [%1+%4], xmm2
+
 	lea %1, [%1+%4*2]
 	lea %2, [%2+%4*2]
-	lea %3, [%3+%5*4]	
+	lea %3, [%3+%5*4]
 %endmacro

 %macro   CHECK_DATA_16_ZERO_SSE4     3
@ -159,7 +159,7 @@ BITS 32
 	por		   xmm0,	 xmm1
 	ptest      xmm7,     xmm0
 	cmovae     eax,      %3
-	
+
 	add        %1,       20h
 	add        ecx,      04h
 	mov        byte [%2+ebx],  al
@ -170,12 +170,12 @@ BITS 32
    movdqa     xmm1,      [%1+%3]
    movdqa     xmm2,      [%1+%3*2]
    movdqa     xmm3,      [%1+%4]
-    
+
    mov        eax,       0h
    mov        ebx,       0h
    movdqa     xmm4,      xmm0
    movdqa     xmm5,      xmm2
-    
+
    punpcklqdq  xmm0,     xmm1
    punpckhqdq  xmm4,     xmm1
    punpcklqdq  xmm2,     xmm3
@ -183,12 +183,12 @@ BITS 32

 	por			xmm0,	  xmm2
 	por			xmm4,	  xmm5
-    
+
    ptest       xmm7,     xmm0
    cmovae      eax,      %5
    ptest       xmm7,     xmm4
-    cmovae      ebx,      %5    
-    
+    cmovae      ebx,      %5
+
    mov     byte [%2],    al
    mov     byte [%2+1],  bl
 %endmacro
@ -230,45 +230,45 @@ BITS 32
    movdqa     xmm0,      [%1]
    movdqa     xmm1,      [%1+10h]
    mov        ebx,       [ecx]
-    
+
    pcmpeqw    xmm0,      xmm7
    pcmpeqw    xmm1,      xmm7
    packsswb   xmm0,      xmm1
-    pmovmskb   edx,       xmm0    
+    pmovmskb   edx,       xmm0
    sub        edx,       0ffffh
-    
-    cmovb      eax,       ebp   
+
+    cmovb      eax,       ebp
    add        ecx,       4
    add        %1,        20h
    mov      byte [%2+ebx],    al
 %endmacro
-    
+


 %macro   CHECK_RS_4x4_BLOCK_2_ZERO_SSE2    5
    movdqa    xmm0,      [%1]
    movdqa    xmm1,      [%1 + %3]
    movdqa    xmm2,      [%1 + %3*2]
-    movdqa    xmm3,      [%1 + %4]    
-    
+    movdqa    xmm3,      [%1 + %4]
+
    movdqa    xmm4,       xmm0
    movdqa    xmm5,       xmm2
-    
+
    punpcklqdq   xmm0,    xmm1
    punpckhqdq   xmm4,    xmm1
    punpcklqdq   xmm2,    xmm3
    punpckhqdq   xmm5,    xmm3
-    
+
    pcmpeqw      xmm0,    xmm7
    pcmpeqw      xmm2,    xmm7
    pcmpeqw      xmm4,    xmm7
    pcmpeqw      xmm5,    xmm7
-    
+
    packsswb     xmm0,    xmm2
    packsswb     xmm4,    xmm5
    pmovmskb     eax,     xmm0
    pmovmskb     ebx,     xmm4
-    
+
    sub          eax,     0ffffh
    mov          eax,     0
    cmovb        eax,     %5
@ -276,7 +276,7 @@ BITS 32
    mov          ebx,     0
    cmovb        ebx,     %5
    mov       byte [%2],    al
-    mov       byte [%2+1],  bl        
+    mov       byte [%2+1],  bl
 %endmacro

 ;*******************************************************************************
@ -291,12 +291,12 @@ SECTION .rodata align=16

 ALIGN  16
 SubMbScanIdx:
-     dd    0x0,  0x1,  0x4,  0x5, 
+     dd    0x0,  0x1,  0x4,  0x5,
 	 dd    0x2,  0x3,  0x6,  0x7,
 	 dd    0x8,  0x9,  0xc,  0xd,
 	 dd    0xa,  0xb,  0xe,  0xf,
 	 dd    0x10, 0x11, 0x14, 0x15,
-	 dd    0x12, 0x13, 0x16, 0x17,     
+	 dd    0x12, 0x13, 0x16, 0x17,

 ;*******************************************************************************
 ; Code
@ -312,10 +312,10 @@ ALIGN    16
 ;  void_t WelsResBlockZero16x16_sse2(int16_t* pBlock,int32_t iStride)
 ;*******************************************************************************
 WelsResBlockZero16x16_sse2:
-    push     esi	
+    push     esi

 	mov      esi,        [esp+08h]
-	mov      ecx,        [esp+0ch]	
+	mov      ecx,        [esp+0ch]
 	lea      ecx,        [ecx*2]
 	lea      eax,        [ecx*3]

@ -375,7 +375,7 @@ WelsResBlockZero16x16_sse2:

 	movdqa   [esi+eax],     xmm7
 	movdqa   [esi+eax+10h],     xmm7
-    
+
    pop      esi
 	ret

@ -386,7 +386,7 @@ ALIGN    16
 ;*******************************************************************************
 ;  void_t WelsResBlockZero8x8_sse2(int16_t * pBlock, int32_t iStride)
 ;*******************************************************************************
-WelsResBlockZero8x8_sse2: 
+WelsResBlockZero8x8_sse2:
 	  push      esi

      mov       esi,     [esp+08h]
@ -407,7 +407,7 @@ WelsResBlockZero8x8_sse2:
 	  movdqa    [esi+ecx*2],   xmm7
 	  movdqa    [esi+eax],     xmm7

-	  
+
 	  pop       esi
 	  ret

--- a/codec/decoder/core/asm/cpuid.asm
+++ b/codec/decoder/core/asm/cpuid.asm
@ -84,12 +84,12 @@ ALIGN 16
 ;   void WelsCPUId( int32_t index, int32_t *uiFeatureA, int32_t *uiFeatureB, int32_t *uiFeatureC, int32_t *uiFeatureD )
 ;****************************************************************************************************
 WelsCPUId:
-	push	ebx	
+	push	ebx
 	push	edi
-	
+
 	mov     eax, [esp+12]	; operating index
    cpuid					; cpuid
-	
+
 	; processing various information return
 	mov     edi, [esp+16]
    mov     [edi], eax
@ -100,10 +100,10 @@ WelsCPUId:
    mov     edi, [esp+28]
    mov     [edi], edx

-	pop		edi	
+	pop		edi
    pop     ebx
 	ret
-	
+
 WELS_EXTERN WelsCPUSupportAVX
 ; need call after cpuid=1 and eax, ecx flag got then
 ALIGN 16
@ -139,7 +139,7 @@ ALIGN 16
 WelsCPUSupportFMA:
 	mov eax, [esp+4]
 	mov ecx, [esp+8]
-	
+
 	; refer to detection of FMA addressed in INTEL AVX manual document
 	and ecx, 018001000H
 	cmp ecx, 018001000H		; check OSXSAVE, AVX, FMA feature flags
@ -153,7 +153,7 @@ WelsCPUSupportFMA:
 	mov eax, 1
 	ret
 fma_not_supported:
-	mov eax, 0	
+	mov eax, 0
 	ret

 WELS_EXTERN WelsEmms
--- a/codec/decoder/core/asm/dct.asm
+++ b/codec/decoder/core/asm/dct.asm
@ -1,129 +1,129 @@
-;*!
-;* \copy
-;*     Copyright (c)  2009-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        ?Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        ?Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  dct.asm
-;*
-;*  Abstract
-;*      WelsDctFourT4_sse2
-;*
-;*  History
-;*      8/4/2009 Created
-;*
-;*
-;*************************************************************************/
-
-%include "asm_inc.asm"
-
-BITS 32
-
-;*******************************************************************************
-; Macros and other preprocessor constants
-;*******************************************************************************
-%macro MMX_SumSubDiv2 3
-    movq    %3, %2
-    psraw   %3, $1
-    paddw   %3, %1
-    psraw   %1, $1
-    psubw   %1, %2
-%endmacro
-
-%macro MMX_SumSub 3
-	movq    %3, %2
-    psubw   %2, %1
-    paddw   %1, %3
-%endmacro
-
-%macro MMX_IDCT 6
-    MMX_SumSub      %4, %5, %6
-    MMX_SumSubDiv2  %3, %2, %1
-    MMX_SumSub		%1, %4, %6
-	MMX_SumSub		%3, %5, %6
-%endmacro
-
-
-%macro MMX_StoreDiff4P 5
-    movd       %2, %5
-    punpcklbw  %2, %4
-    paddw      %1, %3
-    psraw      %1, $6
-    paddsw     %1, %2
-    packuswb   %1, %2
-    movd       %5, %1
-%endmacro
-
-;*******************************************************************************
-; Code
-;*******************************************************************************
-
-SECTION .text
-
-WELS_EXTERN IdctResAddPred_mmx
-
-ALIGN 16
-;*******************************************************************************
-;   void_t __cdecl IdctResAddPred_mmx( uint8_t *pPred, const int32_t kiStride, int16_t *pRs )
-;*******************************************************************************
-
-IdctResAddPred_mmx:
-
-%define	pushsize	0
-%define pPred       esp+pushsize+4
-%define kiStride     esp+pushsize+8
-%define pRs         esp+pushsize+12
-
-	mov     eax, [pRs   ] 
-    mov     edx, [pPred ]   
-    mov     ecx, [kiStride]   
-    movq    mm0, [eax+ 0]
-    movq    mm1, [eax+ 8]
-    movq    mm2, [eax+16]
-    movq    mm3, [eax+24]
-
-	MMX_Trans4x4W        mm0, mm1, mm2, mm3, mm4
-	MMX_IDCT			mm1, mm2, mm3, mm4, mm0, mm6
-    MMX_Trans4x4W        mm1, mm3, mm0, mm4, mm2
-	MMX_IDCT			mm3, mm0, mm4, mm2, mm1, mm6
-
-    WELS_Zero			mm7
-    WELS_DW32			mm6
-    
-    MMX_StoreDiff4P    mm3, mm0, mm6, mm7, [edx]
-    MMX_StoreDiff4P    mm4, mm0, mm6, mm7, [edx+ecx]
-    lea     edx, [edx+2*ecx]
-    MMX_StoreDiff4P    mm1, mm0, mm6, mm7, [edx]
-    MMX_StoreDiff4P    mm2, mm0, mm6, mm7, [edx+ecx]
-    
-%undef	pushsize
-%undef  pPred
-%undef  kiStride
-%undef  pRs
-	emms
-    ret
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        ?Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        ?Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  dct.asm
+;*
+;*  Abstract
+;*      WelsDctFourT4_sse2
+;*
+;*  History
+;*      8/4/2009 Created
+;*
+;*
+;*************************************************************************/
+
+%include "asm_inc.asm"
+
+BITS 32
+
+;*******************************************************************************
+; Macros and other preprocessor constants
+;*******************************************************************************
+%macro MMX_SumSubDiv2 3
+    movq    %3, %2
+    psraw   %3, $1
+    paddw   %3, %1
+    psraw   %1, $1
+    psubw   %1, %2
+%endmacro
+
+%macro MMX_SumSub 3
+	movq    %3, %2
+    psubw   %2, %1
+    paddw   %1, %3
+%endmacro
+
+%macro MMX_IDCT 6
+    MMX_SumSub      %4, %5, %6
+    MMX_SumSubDiv2  %3, %2, %1
+    MMX_SumSub		%1, %4, %6
+	MMX_SumSub		%3, %5, %6
+%endmacro
+
+
+%macro MMX_StoreDiff4P 5
+    movd       %2, %5
+    punpcklbw  %2, %4
+    paddw      %1, %3
+    psraw      %1, $6
+    paddsw     %1, %2
+    packuswb   %1, %2
+    movd       %5, %1
+%endmacro
+
+;*******************************************************************************
+; Code
+;*******************************************************************************
+
+SECTION .text
+
+WELS_EXTERN IdctResAddPred_mmx
+
+ALIGN 16
+;*******************************************************************************
+;   void_t __cdecl IdctResAddPred_mmx( uint8_t *pPred, const int32_t kiStride, int16_t *pRs )
+;*******************************************************************************
+
+IdctResAddPred_mmx:
+
+%define	pushsize	0
+%define pPred       esp+pushsize+4
+%define kiStride     esp+pushsize+8
+%define pRs         esp+pushsize+12
+
+	mov     eax, [pRs   ]
+    mov     edx, [pPred ]
+    mov     ecx, [kiStride]
+    movq    mm0, [eax+ 0]
+    movq    mm1, [eax+ 8]
+    movq    mm2, [eax+16]
+    movq    mm3, [eax+24]
+
+	MMX_Trans4x4W        mm0, mm1, mm2, mm3, mm4
+	MMX_IDCT			mm1, mm2, mm3, mm4, mm0, mm6
+    MMX_Trans4x4W        mm1, mm3, mm0, mm4, mm2
+	MMX_IDCT			mm3, mm0, mm4, mm2, mm1, mm6
+
+    WELS_Zero			mm7
+    WELS_DW32			mm6
+
+    MMX_StoreDiff4P    mm3, mm0, mm6, mm7, [edx]
+    MMX_StoreDiff4P    mm4, mm0, mm6, mm7, [edx+ecx]
+    lea     edx, [edx+2*ecx]
+    MMX_StoreDiff4P    mm1, mm0, mm6, mm7, [edx]
+    MMX_StoreDiff4P    mm2, mm0, mm6, mm7, [edx+ecx]
+
+%undef	pushsize
+%undef  pPred
+%undef  kiStride
+%undef  pRs
+	emms
+    ret
--- a/codec/decoder/core/asm/deblock.asm
+++ b/codec/decoder/core/asm/deblock.asm
--- a/codec/decoder/core/asm/expand_picture.asm
+++ b/codec/decoder/core/asm/expand_picture.asm
@ -155,11 +155,11 @@ WELS_EXTERN ExpandPictureChromaUnalign_sse2	; for chroma unalignment
 	lea %1, [%1+%2]
 %endmacro

-%macro exp_top_bottom_sse2	1	; iPaddingSize [luma(32)/chroma(16)]		
+%macro exp_top_bottom_sse2	1	; iPaddingSize [luma(32)/chroma(16)]
 	; ebx [width/16(8)]
 	; esi [pSrc+0], edi [pSrc-1], ecx [-stride], 32(16)		; top
 	; eax [pSrc+(h-1)*stride], ebp [pSrc+(h+31)*stride], 32(16)	; bottom
-		
+
 %if %1 == 32		; for luma
 	sar ebx, 04h 	; width / 16(8) pixels
 .top_bottom_loops:
@ -173,7 +173,7 @@ WELS_EXTERN ExpandPictureChromaUnalign_sse2	; for chroma unalignment
 	mov_line_16x4_sse2 edi, ecx, xmm0, a
 	mov_line_16x4_sse2 edi, ecx, xmm0, a
 	mov_line_end16x4_sse2 edi, ecx, xmm0, a
-	
+
 	; bottom
 	movdqa xmm1, [eax] 		; last line of picture pData
 	mov_line_16x4_sse2 ebp, ecx, xmm1, a	; dst, stride, xmm?
@ -184,15 +184,15 @@ WELS_EXTERN ExpandPictureChromaUnalign_sse2	; for chroma unalignment
 	mov_line_16x4_sse2 ebp, ecx, xmm1, a
 	mov_line_16x4_sse2 ebp, ecx, xmm1, a
 	mov_line_end16x4_sse2 ebp, ecx, xmm1, a
-		
+
 	lea esi, [esi+16]		; top pSrc
 	lea edi, [edi+16]		; top dst
 	lea eax, [eax+16]		; bottom pSrc
 	lea ebp, [ebp+16]		; bottom dst
-	neg ecx 			; positive/negative stride need for next loop?	
-	
+	neg ecx 			; positive/negative stride need for next loop?
+
 	dec ebx
-	jnz near .top_bottom_loops		
+	jnz near .top_bottom_loops
 %elif %1 == 16	; for chroma ??
 	mov edx, ebx
 	sar ebx, 04h 	; (width / 16) pixels
@ -202,21 +202,21 @@ WELS_EXTERN ExpandPictureChromaUnalign_sse2	; for chroma unalignment
 	mov_line_16x4_sse2 edi, ecx, xmm0, a	; dst, stride, xmm?
 	mov_line_16x4_sse2 edi, ecx, xmm0, a
 	mov_line_16x4_sse2 edi, ecx, xmm0, a
-	mov_line_end16x4_sse2 edi, ecx, xmm0, a	
-	
+	mov_line_end16x4_sse2 edi, ecx, xmm0, a
+
 	; bottom
 	movdqa xmm1, [eax] 		; last line of picture pData
 	mov_line_16x4_sse2 ebp, ecx, xmm1, a	; dst, stride, xmm?
 	mov_line_16x4_sse2 ebp, ecx, xmm1, a
 	mov_line_16x4_sse2 ebp, ecx, xmm1, a
-	mov_line_end16x4_sse2 ebp, ecx, xmm1, a	
-		
+	mov_line_end16x4_sse2 ebp, ecx, xmm1, a
+
 	lea esi, [esi+16]		; top pSrc
 	lea edi, [edi+16]		; top dst
 	lea eax, [eax+16]		; bottom pSrc
 	lea ebp, [ebp+16]		; bottom dst
-	neg ecx 			; positive/negative stride need for next loop?	
-	
+	neg ecx 			; positive/negative stride need for next loop?
+
 	dec ebx
 	jnz near .top_bottom_loops

@ -243,50 +243,50 @@ WELS_EXTERN ExpandPictureChromaUnalign_sse2	; for chroma unalignment
 %endif
 %endmacro

-%macro exp_left_right_sse2	2	; iPaddingSize [luma(32)/chroma(16)], u/a	
+%macro exp_left_right_sse2	2	; iPaddingSize [luma(32)/chroma(16)], u/a
 	; ecx [height]
 	; esi [pSrc+0], 	   edi [pSrc-32], edx [stride], 32(16)	; left
 	; ebx [pSrc+(w-1)], ebp [pSrc+w], 32(16)			; right
 ;	xor eax, eax 	; for pixel pData (uint8_t)		; make sure eax=0 at least high 24 bits of eax = 0
-	
-%if %1 == 32		; for luma	
+
+%if %1 == 32		; for luma
 .left_right_loops:
 	; left
 	mov al, byte [esi]		; pixel pData for left border
 	butterfly_1to16_sse	xmm0, xmm1, a				; dst, tmp, pSrc [generic register name: a/b/c/d]
 	movdqa [edi], xmm0
 	movdqa [edi+16], xmm0
-	
+
 	; right
 	mov al, byte [ebx]
 	butterfly_1to16_sse	xmm1, xmm2, a				; dst, tmp, pSrc [generic register name: a/b/c/d]
 	movdqa [ebp], xmm1
 	movdqa [ebp+16], xmm1
-	
+
 	lea esi, [esi+edx]		; left pSrc
 	lea edi, [edi+edx]		; left dst
 	lea ebx, [ebx+edx]		; right pSrc
-	lea ebp, [ebp+edx]		; right dst	
-	
+	lea ebp, [ebp+edx]		; right dst
+
 	dec ecx
-	jnz near .left_right_loops		
-%elif %1 == 16	; for chroma ??	
+	jnz near .left_right_loops
+%elif %1 == 16	; for chroma ??
 .left_right_loops:
 	; left
 	mov al, byte [esi]		; pixel pData for left border
 	butterfly_1to16_sse	xmm0, xmm1, a				; dst, tmp, pSrc [generic register name: a/b/c/d]
-	movdqa [edi], xmm0	
-	
+	movdqa [edi], xmm0
+
 	; right
 	mov al, byte [ebx]
 	butterfly_1to16_sse	xmm1, xmm2, a				; dst, tmp, pSrc [generic register name: a/b/c/d]
 	movdq%2 [ebp], xmm1								; might not be aligned 16 bytes in case chroma planes
-	
+
 	lea esi, [esi+edx]		; left pSrc
 	lea edi, [edi+edx]		; left dst
 	lea ebx, [ebx+edx]		; right pSrc
-	lea ebp, [ebp+edx]		; right dst	
-	
+	lea ebp, [ebp+edx]		; right dst
+
 	dec ecx
 	jnz near .left_right_loops
 %endif
@ -339,25 +339,25 @@ WELS_EXTERN ExpandPictureChromaUnalign_sse2	; for chroma unalignment
 	; TL
 	mov_line_16x4_sse2	edi, ecx, xmm3, a	; dst, stride, xmm?
 	mov_line_16x4_sse2	edi, ecx, xmm3, a	; dst, stride, xmm?
-	mov_line_16x4_sse2	edi, ecx, xmm3, a	; dst, stride, xmm?	
+	mov_line_16x4_sse2	edi, ecx, xmm3, a	; dst, stride, xmm?
 	mov_line_end16x4_sse2	edi, ecx, xmm3, a	; dst, stride, xmm?

 	; TR
 	mov_line_16x4_sse2	ebp, ecx, xmm4, %2	; dst, stride, xmm?
 	mov_line_16x4_sse2	ebp, ecx, xmm4, %2	; dst, stride, xmm?
-	mov_line_16x4_sse2	ebp, ecx, xmm4, %2	; dst, stride, xmm?	
+	mov_line_16x4_sse2	ebp, ecx, xmm4, %2	; dst, stride, xmm?
 	mov_line_end16x4_sse2 ebp, ecx, xmm4, %2	; dst, stride, xmm?

 	; BL
 	mov_line_16x4_sse2	eax, ecx, xmm5, a	; dst, stride, xmm?
 	mov_line_16x4_sse2	eax, ecx, xmm5, a	; dst, stride, xmm?
-	mov_line_16x4_sse2	eax, ecx, xmm5, a	; dst, stride, xmm?	
+	mov_line_16x4_sse2	eax, ecx, xmm5, a	; dst, stride, xmm?
 	mov_line_end16x4_sse2	eax, ecx, xmm5, a	; dst, stride, xmm?

 	; BR
 	mov_line_16x4_sse2	ebx, ecx, xmm6, %2	; dst, stride, xmm?
 	mov_line_16x4_sse2	ebx, ecx, xmm6, %2	; dst, stride, xmm?
-	mov_line_16x4_sse2	ebx, ecx, xmm6, %2	; dst, stride, xmm?	
+	mov_line_16x4_sse2	ebx, ecx, xmm6, %2	; dst, stride, xmm?
 	mov_line_end16x4_sse2	ebx, ecx, xmm6, %2	; dst, stride, xmm?
 %endif
 %endmacro
@ -375,7 +375,7 @@ ExpandPictureLuma_sse2:
 	push esi
 	push edi
 	push ebp
-	
+
 	; for both top and bottom border
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; pDst
@ -387,10 +387,10 @@ ExpandPictureLuma_sse2:
 	mov cl, byte [esi]
 	butterfly_1to16_sse xmm3, xmm4, c		; pDst, tmp, pSrc [generic register name: a/b/c/d]
 	; load top border
-	mov ecx, edx							; kiStride	
+	mov ecx, edx							; kiStride
 	neg ecx 								; -kiStride
 	lea edi, [esi+ecx]						; last line of top border
-	; load bottom border 
+	; load bottom border
 	dec eax									; h-1
 	imul eax, edx 							; (h-1)*kiStride
 	lea eax, [esi+eax]						; last line of picture pData
@ -398,16 +398,16 @@ ExpandPictureLuma_sse2:
 	lea ebp, [eax+edx]						; last line of bottom border, (h-1)*stride + 32 * stride
 	; also prepare for cross border pData: bottom-left with xmm5, bottom-right xmm6
 	dec ebx									; kiWidth-1
-	lea ebx, [eax+ebx]						; dst[w-1][h-1]	
+	lea ebx, [eax+ebx]						; dst[w-1][h-1]
 ;	xor edx, edx
 	mov dl, byte [eax]						; bottom-left
 	butterfly_1to16_sse xmm5, xmm6, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
 	mov dl, byte [ebx]						; bottom-right
 	butterfly_1to16_sse xmm6, xmm4, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
-	; for top & bottom expanding	
+	; for top & bottom expanding
 	mov ebx, [esp+32]						; kiWidth
-	exp_top_bottom_sse2	32	
-	
+	exp_top_bottom_sse2	32
+
 	; for both left and right border
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; p_dst: left border pSrc
@ -419,14 +419,14 @@ ExpandPictureLuma_sse2:
 	lea edi, [esi+eax]						; left border dst
 	dec ebx
 	lea ebx, [esi+ebx]						; right border pSrc, (p_dst + width - 1)
-	lea ebp, [ebx+1]						; right border dst	
+	lea ebp, [ebx+1]						; right border dst
 	; prepare for cross border pData: top-right with xmm4
 ;	xor eax, eax
 	mov al, byte [ebx]						; top-right
 	butterfly_1to16_sse xmm4, xmm0, a		; pDst, tmp, pSrc [generic register name: a/b/c/d]
 	; for left & right border expanding
 	exp_left_right_sse2	32, a
-	
+
 	; for cross border [top-left, top-right, bottom-left, bottom-right]
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; pDst
@ -436,7 +436,7 @@ ExpandPictureLuma_sse2:
 	; have done xmm3,..,xmm6 cross pData initialization above, perform pading as below, To be continued..
 	mov eax, -32							; luma=-32, chroma=-16
 	neg ecx										; -stride
-	lea edi, [esi+eax]						
+	lea edi, [esi+eax]
 	lea edi, [edi+ecx]				; last line of top-left border
 	lea ebp, [esi+ebx]
 	lea ebp, [ebp+ecx]				; last line of top-right border
@ -444,19 +444,19 @@ ExpandPictureLuma_sse2:
 	mov ecx, [esp+28]					; kiStride
 	imul edx, ecx							; (height+32(16)) * stride
 	lea eax, [edi+edx]						; last line of bottom-left border
-	lea ebx, [ebp+edx]						; last line of bottom-right border	
+	lea ebx, [ebp+edx]						; last line of bottom-right border
 	neg ecx										; -kiStride
 	; for left & right border expanding
-	exp_cross_sse2		32, a	
-	
+	exp_cross_sse2		32, a
+
 ;	sfence									; commit cache write back memory
-	
+
 	pop ebp
 	pop edi
 	pop esi
 	pop edx
 	pop ebx
-	
+
 	ret

 ALIGN 16
@ -472,7 +472,7 @@ ExpandPictureChromaAlign_sse2:
 	push esi
 	push edi
 	push ebp
-	
+
 	; for both top and bottom border
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; pDst
@ -484,10 +484,10 @@ ExpandPictureChromaAlign_sse2:
 	mov cl, byte [esi]
 	butterfly_1to16_sse xmm3, xmm4, c		; pDst, tmp, pSrc [generic register name: a/b/c/d]
 	; load top border
-	mov ecx, edx							; kiStride	
+	mov ecx, edx							; kiStride
 	neg ecx 								; -kiStride
 	lea edi, [esi+ecx]						; last line of top border
-	; load bottom border 
+	; load bottom border
 	dec eax									; h-1
 	imul eax, edx 							; (h-1)*kiStride
 	lea eax, [esi+eax]						; last line of picture pData
@ -495,16 +495,16 @@ ExpandPictureChromaAlign_sse2:
 	lea ebp, [eax+edx]						; last line of bottom border, (h-1)*kiStride + 16 * kiStride
 	; also prepare for cross border pData: bottom-left with xmm5, bottom-right xmm6
 	dec ebx									; kiWidth-1
-	lea ebx, [eax+ebx]						; pDst[w-1][h-1]	
+	lea ebx, [eax+ebx]						; pDst[w-1][h-1]
 ;	xor edx, edx
 	mov dl, byte [eax]						; bottom-left
 	butterfly_1to16_sse xmm5, xmm6, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
 	mov dl, byte [ebx]						; bottom-right
 	butterfly_1to16_sse xmm6, xmm4, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
-	; for top & bottom expanding	
+	; for top & bottom expanding
 	mov ebx, [esp+32]						; kiWidth
-	exp_top_bottom_sse2	16	
-	
+	exp_top_bottom_sse2	16
+
 	; for both left and right border
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; pDst: left border pSrc
@ -516,14 +516,14 @@ ExpandPictureChromaAlign_sse2:
 	lea edi, [esi+eax]						; left border dst
 	dec ebx
 	lea ebx, [esi+ebx]						; right border pSrc, (p_dst + width - 1)
-	lea ebp, [ebx+1]						; right border dst	
+	lea ebp, [ebx+1]						; right border dst
 	; prepare for cross border pData: top-right with xmm4
 ;	xor eax, eax
 	mov al, byte [ebx]						; top-right
 	butterfly_1to16_sse xmm4, xmm0, a		; pDst, tmp, pSrc [generic register name: a/b/c/d]
 	; for left & right border expanding
 	exp_left_right_sse2	16, a
-	
+
 	; for cross border [top-left, top-right, bottom-left, bottom-right]
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; pDst
@ -533,9 +533,9 @@ ExpandPictureChromaAlign_sse2:
 	; have done xmm3,..,xmm6 cross pData initialization above, perform pading as below, To be continued..
 	mov eax, -16							; chroma=-16
 	neg ecx										; -stride
-	lea edi, [esi+eax]						
+	lea edi, [esi+eax]
 	lea edi, [edi+ecx]				; last line of top-left border
-	lea ebp, [esi+ebx]				
+	lea ebp, [esi+ebx]
 	lea ebp, [ebp+ecx]				; last line of top-right border
 	mov ecx, [esp+28]						; kiStride
 	add edx, 16							; height+16, luma=32, chroma=16
@ -545,15 +545,15 @@ ExpandPictureChromaAlign_sse2:
 	neg ecx										; -kiStride
 	; for left & right border expanding
 	exp_cross_sse2		16, a
-	
+
 ;	sfence									; commit cache write back memory
-	
+
 	pop ebp
 	pop edi
 	pop esi
 	pop edx
 	pop ebx
-	
+
 	ret

 ALIGN 16
@ -569,7 +569,7 @@ ExpandPictureChromaUnalign_sse2:
 	push esi
 	push edi
 	push ebp
-	
+
 	; for both top and bottom border
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; pDst
@ -581,10 +581,10 @@ ExpandPictureChromaUnalign_sse2:
 	mov cl, byte [esi]
 	butterfly_1to16_sse xmm3, xmm4, c		; pDst, tmp, pSrc [generic register name: a/b/c/d]
 	; load top border
-	mov ecx, edx							; kiStride	
+	mov ecx, edx							; kiStride
 	neg ecx 								; -kiStride
 	lea edi, [esi+ecx]						; last line of top border
-	; load bottom border 
+	; load bottom border
 	dec eax									; h-1
 	imul eax, edx 							; (h-1)*kiStride
 	lea eax, [esi+eax]						; last line of picture pData
@ -592,16 +592,16 @@ ExpandPictureChromaUnalign_sse2:
 	lea ebp, [eax+edx]						; last line of bottom border, (h-1)*kiStride + 16 * kiStride
 	; also prepare for cross border pData: bottom-left with xmm5, bottom-right xmm6
 	dec ebx									; kiWidth-1
-	lea ebx, [eax+ebx]						; dst[w-1][h-1]	
+	lea ebx, [eax+ebx]						; dst[w-1][h-1]
 ;	xor edx, edx
 	mov dl, byte [eax]						; bottom-left
 	butterfly_1to16_sse xmm5, xmm6, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
 	mov dl, byte [ebx]						; bottom-right
 	butterfly_1to16_sse xmm6, xmm4, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
-	; for top & bottom expanding	
+	; for top & bottom expanding
 	mov ebx, [esp+32]						; kiWidth
-	exp_top_bottom_sse2	16	
-	
+	exp_top_bottom_sse2	16
+
 	; for both left and right border
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; p_dst: left border pSrc
@ -613,14 +613,14 @@ ExpandPictureChromaUnalign_sse2:
 	lea edi, [esi+eax]						; left border dst
 	dec ebx
 	lea ebx, [esi+ebx]						; right border pSrc, (p_dst + width - 1)
-	lea ebp, [ebx+1]						; right border dst	
+	lea ebp, [ebx+1]						; right border dst
 	; prepare for cross border pData: top-right with xmm4
 ;	xor eax, eax
 	mov al, byte [ebx]						; top-right
 	butterfly_1to16_sse xmm4, xmm0, a		; dst, tmp, pSrc [generic register name: a/b/c/d]
 	; for left & right border expanding
 	exp_left_right_sse2	16, u
-	
+
 	; for cross border [top-left, top-right, bottom-left, bottom-right]
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; p_dst
@ -630,9 +630,9 @@ ExpandPictureChromaUnalign_sse2:
 	; have done xmm3,..,xmm6 cross pData initialization above, perform pading as below, To be continued..
 	neg ecx									; -kiStride
 	mov eax, -16							; chroma=-16
-	lea edi, [esi+eax]						
+	lea edi, [esi+eax]
 	lea edi, [edi+ecx]				; last line of top-left border
-	lea ebp, [esi+ebx]						
+	lea ebp, [esi+ebx]
 	lea ebp, [ebp+ecx]				; last line of top-right border
 	mov ecx, [esp+28]						; kiStride
 	add edx, 16							; kiHeight+16, luma=32, chroma=16
@ -642,14 +642,14 @@ ExpandPictureChromaUnalign_sse2:
 	neg ecx									; -kiStride
 	; for left & right border expanding
 	exp_cross_sse2		16, u
-	
+
 ;	sfence									; commit cache write back memory
-	
+
 	pop ebp
 	pop edi
 	pop esi
 	pop edx
 	pop ebx
-	
+
 	ret

--- a/codec/decoder/core/asm/intra_pred.asm
+++ b/codec/decoder/core/asm/intra_pred.asm
--- a/codec/decoder/core/asm/mb_copy.asm
+++ b/codec/decoder/core/asm/mb_copy.asm
@ -37,7 +37,7 @@
 ;*  History
 ;*      15/09/2009 Created
 ;*		12/28/2009 Modified with larger throughput
-;*		12/29/2011 Tuned WelsCopy16x16NotAligned_sse2, added UpdateMbMv_sse2 WelsCopy16x8NotAligned_sse2, 
+;*		12/29/2011 Tuned WelsCopy16x16NotAligned_sse2, added UpdateMbMv_sse2 WelsCopy16x8NotAligned_sse2,
 ;*				   WelsCopy16x8_mmx, WelsCopy8x16_mmx etc;
 ;*
 ;*
@ -84,7 +84,7 @@ ALIGN 16
 ;                           int iHeight );
 ;*******************************************************************************
 PixelAvgWidthEq4_mmx:
-   
+
    push        esi
    push        edi
    push        ebp
@ -102,7 +102,7 @@ ALIGN 4
 	movd        mm0, [ebp]
    pavgb       mm0, [esi]
    movd        [edi], mm0
-   
+
    dec         ebx
    lea         edi, [edi+eax]
    lea         esi, [esi+ecx]
@ -115,7 +115,7 @@ ALIGN 4
    pop         edi
    pop         esi
    ret
-                          
+
 ALIGN 16
 ;*******************************************************************************
 ; void_t PixelAvgWidthEq8_mmx( uint8_t *pDst,  int iDstStride,
@ -124,7 +124,7 @@ ALIGN 16
 ;                           int iHeight );
 ;*******************************************************************************
 PixelAvgWidthEq8_mmx:
-    
+
    push        esi
    push        edi
    push        ebp
@ -145,14 +145,14 @@ ALIGN 4
    movq        mm0, [esi+ecx]
    pavgb       mm0, [ebp+edx]
    movq		[edi+eax], mm0
-    
+
    lea			esi,  [esi+2*ecx]
    lea			ebp, [ebp+2*edx]
    lea			edi,  [edi+2*eax]
-    
+
    sub           ebx, 2
    jnz         .height_loop
-	
+
 	WELSEMMS
    pop         ebx
    pop         ebp
@ -174,7 +174,7 @@ PixelAvgWidthEq16_sse2:
    push        edi
    push        ebp
    push        ebx
-    
+

    mov         edi, [esp+20]       ; pDst
    mov         eax, [esp+24]       ; iDstStride
@ -188,28 +188,28 @@ ALIGN 4
 	movdqu      xmm0, [esi]
 	pavgb         xmm0, [ebp]
    movdqu      [edi], xmm0
-    
+
 	movdqu      xmm0, [esi+ecx]
 	pavgb         xmm0, [ebp+edx]
    movdqu      [edi+eax], xmm0
-	
+
 	movdqu      xmm0, [esi+2*ecx]
 	pavgb         xmm0, [ebp+2*edx]
    movdqu      [edi+2*eax], xmm0
-    
+
    lea              esi,  [esi+2*ecx]
    lea			   ebp, [ebp+2*edx]
    lea			   edi,  [edi+2*eax]
-     
+
 	movdqu      xmm0, [esi+ecx]
 	pavgb         xmm0, [ebp+edx]
    movdqu      [edi+eax], xmm0
-    
+
    lea              esi,  [esi+2*ecx]
    lea			   ebp, [ebp+2*edx]
    lea			   edi,  [edi+2*eax]
-	    
-    
+
+
    sub         ebx, 4
    jne         .height_loop

@ -232,7 +232,7 @@ McCopyWidthEq4_mmx:
    push    edi
    push    ebx

-    
+
    mov esi,  [esp+16]
    mov eax, [esp+20]
    mov edi,  [esp+24]
@ -242,12 +242,12 @@ ALIGN 4
 .height_loop:
 	mov ebx, [esi]
 	mov [edi], ebx
-	
+
 	add esi, eax
 	add edi, ecx
 	dec edx
 	jnz .height_loop
-	WELSEMMS   
+	WELSEMMS
 	pop	   ebx
    pop     edi
    pop     esi
@ -275,12 +275,12 @@ ALIGN 4
 	add edi, ecx
 	dec edx
 	jnz .height_loop
-	
-	WELSEMMS   
+
+	WELSEMMS
    pop     edi
    pop     esi
    ret
-	
+



@ -308,7 +308,7 @@ McCopyWidthEq16_sse2:
    push    edi

    mov     esi, [esp+12]       ; pSrc
-    mov     eax, [esp+16]       ; iSrcStride    
+    mov     eax, [esp+16]       ; iSrcStride
    mov     edi, [esp+20]       ; pDst
    mov     edx, [esp+24]       ; iDstStride
    mov     ecx, [esp+28]       ; iHeight
@ -324,7 +324,7 @@ ALIGN 4
    lea     esi, [esi+eax*2]
    lea     edi, [edi+edx*2]
    jnz     .height_loop
-  
+
    pop     edi
    pop     esi
    ret
--- a/codec/decoder/core/asm/mc_chroma.asm
+++ b/codec/decoder/core/asm/mc_chroma.asm
@ -1,317 +1,317 @@
-;*!
-;* \copy
-;*     Copyright (c)  2004-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  mc_chroma.asm
-;*
-;*  Abstract
-;*      mmx motion compensation for chroma
-;*
-;*  History
-;*      10/13/2004 Created
-;*
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-
-BITS 32
-
-;***********************************************************************
-; Local Data (Read Only)
-;***********************************************************************
-
-SECTION .rodata align=16
-
-;***********************************************************************
-; Various memory constants (trigonometric values or rounding values)
-;***********************************************************************
-
-ALIGN 16
-h264_d0x20_sse2:
-	dw 32,32,32,32,32,32,32,32
-ALIGN 16
-h264_d0x20_mmx:
-	dw 32,32,32,32
-
-
-;=============================================================================
-; Code
-;=============================================================================
-
-SECTION .text
-
-ALIGN 16
-;*******************************************************************************
-; void McChromaWidthEq4_mmx( uint8_t *src, 
-;							int32_t iSrcStride, 
-;							uint8_t *pDst, 
-;							int32_t iDstStride, 
-;							uint8_t *pABCD, 
-;							int32_t iHeigh );
-;*******************************************************************************
-WELS_EXTERN McChromaWidthEq4_mmx
-McChromaWidthEq4_mmx:
-	push esi
-	push edi
-	push ebx
-	
-	mov eax, [esp +12 + 20]
-	movd mm3, [eax]
-	WELS_Zero mm7
-	punpcklbw mm3, mm3
-	movq      mm4, mm3
-	punpcklwd mm3, mm3       
-	punpckhwd mm4, mm4		 
-	
-	movq	  mm5, mm3
-	punpcklbw mm3, mm7
-	punpckhbw mm5, mm7
-	
-	movq	  mm6, mm4
-	punpcklbw mm4, mm7
-	punpckhbw mm6, mm7
-	
-	mov esi, [esp +12+ 4]   
-	mov eax, [esp + 12 + 8]   
-	mov edi, [esp + 12 + 12]  
-	mov edx, [esp + 12 + 16]  
-    mov ecx, [esp + 12 + 24]   
-		
-	lea ebx, [esi + eax]
-	movd mm0, [esi]
-	movd mm1, [esi+1]
-	punpcklbw mm0, mm7
-	punpcklbw mm1, mm7
-.xloop:
-	
-	pmullw mm0, mm3
-	pmullw mm1, mm5
-	paddw  mm0, mm1
-	
-	movd  mm1, [ebx]
-	punpcklbw mm1, mm7
-	movq mm2, mm1
-	pmullw mm1, mm4
-	paddw mm0, mm1
-	
-	movd mm1, [ebx+1]
-	punpcklbw mm1, mm7
-	movq mm7, mm1
-	pmullw mm1,mm6
-	paddw mm0, mm1
-	movq mm1,mm7
-
-	paddw mm0, [h264_d0x20_mmx]
-	psrlw mm0, 6
-	
-	WELS_Zero mm7
-	packuswb mm0, mm7
-	movd [edi], mm0	
-
-	movq mm0, mm2
-	
-	lea edi, [edi +edx  ]
-	lea ebx, [ebx + eax]
-
-	dec ecx
-	jnz near .xloop
-	WELSEMMS
-	pop ebx
-	pop edi
-	pop esi
-	ret
-
-
-ALIGN 16
-;*******************************************************************************
-; void McChromaWidthEq8_sse2( uint8_t *pSrc, 
-;						int32_t iSrcStride, 
-;						uint8_t *pDst, 
-;						int32_t iDstStride, 
-;						uint8_t *pABCD, 
-;						int32_t iheigh );
-;*******************************************************************************
-WELS_EXTERN McChromaWidthEq8_sse2
-McChromaWidthEq8_sse2:
-	push esi
-	push edi
-	push ebx
-	
-	mov eax, [esp +12 + 20]
-	movd xmm3, [eax]
-	WELS_Zero xmm7
-	punpcklbw  xmm3, xmm3
-	punpcklwd  xmm3, xmm3
-	
-	movdqa	   xmm4, xmm3
-	punpckldq  xmm3, xmm3
-	punpckhdq  xmm4, xmm4
-	movdqa     xmm5, xmm3
-	movdqa	   xmm6, xmm4
-	
-	punpcklbw  xmm3, xmm7
-	punpckhbw  xmm5, xmm7
-	punpcklbw  xmm4, xmm7
-	punpckhbw  xmm6, xmm7
-	
-	mov esi, [esp +12+ 4]   
-	mov eax, [esp + 12 + 8]   
-	mov edi, [esp + 12 + 12]  
-	mov edx, [esp + 12 + 16]  
-    mov ecx, [esp + 12 + 24]   
-		
-	lea ebx, [esi + eax]
-	movq xmm0, [esi]
-	movq xmm1, [esi+1]
-	punpcklbw xmm0, xmm7
-	punpcklbw xmm1, xmm7
-.xloop:
-	
-	pmullw xmm0, xmm3
-	pmullw xmm1, xmm5
-	paddw  xmm0, xmm1
-	
-	movq  xmm1, [ebx]
-	punpcklbw xmm1, xmm7
-	movdqa xmm2, xmm1
-	pmullw xmm1, xmm4
-	paddw xmm0, xmm1
-	
-	movq xmm1, [ebx+1]
-	punpcklbw xmm1, xmm7
-	movdqa xmm7, xmm1
-	pmullw xmm1, xmm6
-	paddw xmm0, xmm1
-	movdqa xmm1,xmm7
-
-	paddw xmm0, [h264_d0x20_sse2]
-	psrlw xmm0, 6
-	
-	WELS_Zero xmm7
-	packuswb xmm0, xmm7
-	movq [edi], xmm0	
-
-	movdqa xmm0, xmm2
-	
-	lea edi, [edi +edx  ]
-	lea ebx, [ebx + eax]
-
-	dec ecx
-	jnz near .xloop
-	
-	pop ebx
-	pop edi
-	pop esi
-	ret
-
-
-
-
-ALIGN 16
-;***********************************************************************
-; void McChromaWidthEq8_ssse3( uint8_t *pSrc,
-;						 int32_t iSrcStride, 
-;                        uint8_t *pDst,  
-;                        int32_t iDstStride,
-;                        uint8_t *pABCD,
-;					     int32_t iHeigh);
-;***********************************************************************
-WELS_EXTERN McChromaWidthEq8_ssse3
-McChromaWidthEq8_ssse3:
-	push ebx
-	push esi
-	push edi
-		
-	mov eax, [esp + 12 + 20]
-
-    pxor      xmm7, xmm7
-    movd   xmm5, [eax]   
-    punpcklwd xmm5, xmm5  
-    punpckldq xmm5, xmm5 
-    movdqa    xmm6, xmm5
-    punpcklqdq xmm5, xmm5
-    punpckhqdq xmm6, xmm6    
-    
-	mov eax, [esp + 12 + 4]   
-	mov edx, [esp + 12 + 8]   
-	mov esi, [esp + 12 + 12]  
-	mov edi, [esp + 12 + 16]  
-    mov ecx, [esp + 12 + 24]   
-    
-    sub esi, edi
-    sub esi, edi
-	movdqa xmm7, [h264_d0x20_sse2]
-
-	movdqu xmm0, [eax]
-	movdqa xmm1, xmm0
-	psrldq xmm1, 1
-	punpcklbw xmm0, xmm1
-	
-.hloop_chroma:	
-	lea	esi, [esi+2*edi]
-	
-	movdqu xmm2, [eax+edx]
-	movdqa xmm3, xmm2
-	psrldq xmm3, 1
-	punpcklbw xmm2, xmm3
-	movdqa      xmm4, xmm2
-	
-    pmaddubsw  xmm0, xmm5
-    pmaddubsw  xmm2, xmm6
-    paddw      xmm0, xmm2
-    paddw      xmm0, xmm7
-	psrlw      xmm0, 6
-    packuswb   xmm0, xmm0
-    movq       [esi],xmm0	
-    
-    lea eax, [eax+2*edx]
-    movdqu xmm2, [eax]
-    movdqa xmm3, xmm2
-    psrldq xmm3, 1
-    punpcklbw xmm2, xmm3
-    movdqa      xmm0, xmm2
-    
-    pmaddubsw  xmm4, xmm5
-    pmaddubsw  xmm2, xmm6
-    paddw      xmm4, xmm2
-    paddw      xmm4, xmm7
-	psrlw      xmm4, 6
-    packuswb   xmm4, xmm4
-    movq       [esi+edi],xmm4	
-	
-	sub ecx, 2
-	jnz .hloop_chroma
-	pop edi
-	pop esi
-	pop ebx
-
-	ret
-
-
+;*!
+;* \copy
+;*     Copyright (c)  2004-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  mc_chroma.asm
+;*
+;*  Abstract
+;*      mmx motion compensation for chroma
+;*
+;*  History
+;*      10/13/2004 Created
+;*
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+
+BITS 32
+
+;***********************************************************************
+; Local Data (Read Only)
+;***********************************************************************
+
+SECTION .rodata align=16
+
+;***********************************************************************
+; Various memory constants (trigonometric values or rounding values)
+;***********************************************************************
+
+ALIGN 16
+h264_d0x20_sse2:
+	dw 32,32,32,32,32,32,32,32
+ALIGN 16
+h264_d0x20_mmx:
+	dw 32,32,32,32
+
+
+;=============================================================================
+; Code
+;=============================================================================
+
+SECTION .text
+
+ALIGN 16
+;*******************************************************************************
+; void McChromaWidthEq4_mmx( uint8_t *src,
+;							int32_t iSrcStride,
+;							uint8_t *pDst,
+;							int32_t iDstStride,
+;							uint8_t *pABCD,
+;							int32_t iHeight );
+;*******************************************************************************
+WELS_EXTERN McChromaWidthEq4_mmx	; 4-pixel-wide chroma MC: dst[x] = (A*s0[x] + B*s0[x+1] + C*s1[x] + D*s1[x+1] + 32) >> 6, with A..D the four bytes at pABCD
+McChromaWidthEq4_mmx:
+	push esi	; three registers saved => stack arguments start at [esp + 12 + 4]
+	push edi
+	push ebx
+
+	mov eax, [esp +12 + 20]	; 5th argument: pABCD
+	movd mm3, [eax]	; low dword of mm3 = bytes A,B,C,D
+	WELS_Zero mm7	; macro from asm_inc.asm - presumably clears mm7 (used as the zero operand for byte->word unpacking)
+	punpcklbw mm3, mm3	; bytes: A A B B C C D D
+	movq      mm4, mm3
+	punpcklwd mm3, mm3	; bytes: A A A A B B B B
+	punpckhwd mm4, mm4	; bytes: C C C C D D D D
+
+	movq	  mm5, mm3
+	punpcklbw mm3, mm7	; mm3 = four words of A
+	punpckhbw mm5, mm7	; mm5 = four words of B
+
+	movq	  mm6, mm4
+	punpcklbw mm4, mm7	; mm4 = four words of C
+	punpckhbw mm6, mm7	; mm6 = four words of D
+
+	mov esi, [esp +12+ 4]	; src
+	mov eax, [esp + 12 + 8]	; iSrcStride
+	mov edi, [esp + 12 + 12]	; pDst
+	mov edx, [esp + 12 + 16]	; iDstStride
+    mov ecx, [esp + 12 + 24]	; height (row counter)
+
+	lea ebx, [esi + eax]	; ebx -> source row below the current one
+	movd mm0, [esi]	; current row, pixels x..x+3
+	movd mm1, [esi+1]	; current row, pixels x+1..x+4 (5 source bytes are read per row)
+	punpcklbw mm0, mm7	; widen to words
+	punpcklbw mm1, mm7
+.xloop:	; one output row per iteration; on entry mm0/mm1 hold the upper row as words
+
+	pmullw mm0, mm3	; A * s0[x]
+	pmullw mm1, mm5	; B * s0[x+1]
+	paddw  mm0, mm1
+
+	movd  mm1, [ebx]	; lower row, pixels x..x+3
+	punpcklbw mm1, mm7
+	movq mm2, mm1	; keep unscaled lower row: it is the upper row of the next iteration
+	pmullw mm1, mm4	; C * s1[x]
+	paddw mm0, mm1
+
+	movd mm1, [ebx+1]	; lower row, pixels x+1..x+4
+	punpcklbw mm1, mm7
+	movq mm7, mm1	; mm7 temporarily stops being zero: used as scratch for the unscaled copy
+	pmullw mm1,mm6	; D * s1[x+1]
+	paddw mm0, mm1
+	movq mm1,mm7	; mm1 = unscaled s1[x+1] for the next iteration
+
+	paddw mm0, [h264_d0x20_mmx]	; + 32 rounding term
+	psrlw mm0, 6	; / 64
+
+	WELS_Zero mm7	; restore mm7 after its use as scratch above
+	packuswb mm0, mm7	; words -> bytes with unsigned saturation
+	movd [edi], mm0	; store 4 output pixels
+
+	movq mm0, mm2	; lower row becomes the upper row
+
+	lea edi, [edi +edx  ]	; next destination row
+	lea ebx, [ebx + eax]	; next source row
+
+	dec ecx
+	jnz near .xloop	; loops until the counter is exactly 0, so height must be >= 1
+	WELSEMMS	; macro from asm_inc.asm - presumably emits emms to leave MMX state; TODO confirm
+	pop ebx
+	pop edi
+	pop esi
+	ret
+
+
+ALIGN 16
+;*******************************************************************************
+; void McChromaWidthEq8_sse2( uint8_t *pSrc,
+;						int32_t iSrcStride,
+;						uint8_t *pDst,
+;						int32_t iDstStride,
+;						uint8_t *pABCD,
+;						int32_t iHeight );
+;*******************************************************************************
+WELS_EXTERN McChromaWidthEq8_sse2	; 8-pixel-wide chroma MC: dst[x] = (A*s0[x] + B*s0[x+1] + C*s1[x] + D*s1[x+1] + 32) >> 6, with A..D the four bytes at pABCD
+McChromaWidthEq8_sse2:
+	push esi	; three registers saved => stack arguments start at [esp + 12 + 4]
+	push edi
+	push ebx
+
+	mov eax, [esp +12 + 20]	; 5th argument: pABCD
+	movd xmm3, [eax]	; low dword of xmm3 = bytes A,B,C,D
+	WELS_Zero xmm7	; macro from asm_inc.asm - presumably clears xmm7 (used as the zero operand for byte->word unpacking)
+	punpcklbw  xmm3, xmm3	; low 8 bytes: A A B B C C D D
+	punpcklwd  xmm3, xmm3	; 16 bytes: 4xA 4xB 4xC 4xD
+
+	movdqa	   xmm4, xmm3
+	punpckldq  xmm3, xmm3	; bytes: 8xA 8xB
+	punpckhdq  xmm4, xmm4	; bytes: 8xC 8xD
+	movdqa     xmm5, xmm3
+	movdqa	   xmm6, xmm4
+
+	punpcklbw  xmm3, xmm7	; xmm3 = eight words of A
+	punpckhbw  xmm5, xmm7	; xmm5 = eight words of B
+	punpcklbw  xmm4, xmm7	; xmm4 = eight words of C
+	punpckhbw  xmm6, xmm7	; xmm6 = eight words of D
+
+	mov esi, [esp +12+ 4]	; pSrc
+	mov eax, [esp + 12 + 8]	; iSrcStride
+	mov edi, [esp + 12 + 12]	; pDst
+	mov edx, [esp + 12 + 16]	; iDstStride
+    mov ecx, [esp + 12 + 24]	; height (row counter)
+
+	lea ebx, [esi + eax]	; ebx -> source row below the current one
+	movq xmm0, [esi]	; current row, pixels x..x+7
+	movq xmm1, [esi+1]	; current row, pixels x+1..x+8 (9 source bytes are read per row)
+	punpcklbw xmm0, xmm7	; widen to words
+	punpcklbw xmm1, xmm7
+.xloop:	; one output row per iteration; on entry xmm0/xmm1 hold the upper row as words
+
+	pmullw xmm0, xmm3	; A * s0[x]
+	pmullw xmm1, xmm5	; B * s0[x+1]
+	paddw  xmm0, xmm1
+
+	movq  xmm1, [ebx]	; lower row, pixels x..x+7
+	punpcklbw xmm1, xmm7
+	movdqa xmm2, xmm1	; keep unscaled lower row: it is the upper row of the next iteration
+	pmullw xmm1, xmm4	; C * s1[x]
+	paddw xmm0, xmm1
+
+	movq xmm1, [ebx+1]	; lower row, pixels x+1..x+8
+	punpcklbw xmm1, xmm7
+	movdqa xmm7, xmm1	; xmm7 temporarily stops being zero: used as scratch for the unscaled copy
+	pmullw xmm1, xmm6	; D * s1[x+1]
+	paddw xmm0, xmm1
+	movdqa xmm1,xmm7	; xmm1 = unscaled s1[x+1] for the next iteration
+
+	paddw xmm0, [h264_d0x20_sse2]	; + 32 rounding term
+	psrlw xmm0, 6	; / 64
+
+	WELS_Zero xmm7	; restore xmm7 after its use as scratch above
+	packuswb xmm0, xmm7	; words -> bytes with unsigned saturation
+	movq [edi], xmm0	; store 8 output pixels
+
+	movdqa xmm0, xmm2	; lower row becomes the upper row
+
+	lea edi, [edi +edx  ]	; next destination row
+	lea ebx, [ebx + eax]	; next source row
+
+	dec ecx
+	jnz near .xloop	; loops until the counter is exactly 0, so height must be >= 1
+
+	pop ebx
+	pop edi
+	pop esi
+	ret
+
+
+
+
+ALIGN 16
+;***********************************************************************
+; void McChromaWidthEq8_ssse3( uint8_t *pSrc,
+;						 int32_t iSrcStride,
+;                        uint8_t *pDst,
+;                        int32_t iDstStride,
+;                        uint8_t *pABCD,
+;					     int32_t iHeight);
+;***********************************************************************
+WELS_EXTERN McChromaWidthEq8_ssse3	; 8-pixel-wide chroma MC via pmaddubsw: dst[x] = (A*s0[x] + B*s0[x+1] + C*s1[x] + D*s1[x+1] + 32) >> 6, two rows per loop pass
+McChromaWidthEq8_ssse3:
+	push ebx	; saved/restored although ebx is not otherwise touched in this routine
+	push esi
+	push edi
+
+	mov eax, [esp + 12 + 20]	; 5th argument: pABCD
+
+    pxor      xmm7, xmm7	; cleared here, but overwritten with the rounding constant before it is read
+    movd   xmm5, [eax]	; low dword = bytes A,B,C,D
+    punpcklwd xmm5, xmm5	; bytes: A B A B C D C D
+    punpckldq xmm5, xmm5	; dwords: ABAB ABAB CDCD CDCD
+    movdqa    xmm6, xmm5
+    punpcklqdq xmm5, xmm5	; xmm5 = eight (A,B) byte pairs
+    punpckhqdq xmm6, xmm6	; xmm6 = eight (C,D) byte pairs
+
+	mov eax, [esp + 12 + 4]	; pSrc
+	mov edx, [esp + 12 + 8]	; iSrcStride
+	mov esi, [esp + 12 + 12]	; pDst
+	mov edi, [esp + 12 + 16]	; iDstStride
+    mov ecx, [esp + 12 + 24]	; height (row counter)
+
+    sub esi, edi	; pre-bias pDst by -2*iDstStride: the loop advances esi before storing
+    sub esi, edi
+	movdqa xmm7, [h264_d0x20_sse2]	; eight words of 32 (rounding term)
+
+	movdqu xmm0, [eax]	; 16-byte unaligned load of row 0 (only the first 9 bytes contribute)
+	movdqa xmm1, xmm0
+	psrldq xmm1, 1	; row shifted by one pixel
+	punpcklbw xmm0, xmm1	; xmm0 = (s[x], s[x+1]) byte pairs for x = 0..7
+
+.hloop_chroma:	; on entry xmm0 holds the interleaved upper row for the first output row
+	lea	esi, [esi+2*edi]	; advance destination by two rows
+
+	movdqu xmm2, [eax+edx]	; next source row
+	movdqa xmm3, xmm2
+	psrldq xmm3, 1
+	punpcklbw xmm2, xmm3	; interleave into (s[x], s[x+1]) pairs
+	movdqa      xmm4, xmm2	; keep a copy: it is the upper row of the second output row
+
+    pmaddubsw  xmm0, xmm5	; A*s0[x] + B*s0[x+1] (pixels unsigned, weights taken as signed bytes)
+    pmaddubsw  xmm2, xmm6	; C*s1[x] + D*s1[x+1]
+    paddw      xmm0, xmm2
+    paddw      xmm0, xmm7	; + 32
+	psrlw      xmm0, 6	; / 64
+    packuswb   xmm0, xmm0	; words -> bytes with unsigned saturation
+    movq       [esi],xmm0	; store first output row (8 pixels)
+
+    lea eax, [eax+2*edx]	; advance source by two rows
+    movdqu xmm2, [eax]	; row below the one kept in xmm4
+    movdqa xmm3, xmm2
+    psrldq xmm3, 1
+    punpcklbw xmm2, xmm3
+    movdqa      xmm0, xmm2	; keep a copy: upper row for the next loop pass
+
+    pmaddubsw  xmm4, xmm5	; A/B terms from the kept row
+    pmaddubsw  xmm2, xmm6	; C/D terms from the row just loaded
+    paddw      xmm4, xmm2
+    paddw      xmm4, xmm7	; + 32
+	psrlw      xmm4, 6	; / 64
+    packuswb   xmm4, xmm4
+    movq       [esi+edi],xmm4	; store second output row (8 pixels)
+
+	sub ecx, 2	; two rows produced per pass
+	jnz .hloop_chroma	; exits only when the counter hits exactly 0, so height must be a positive even number
+	pop edi
+	pop esi
+	pop ebx
+
+	ret
+
+
--- a/codec/decoder/core/asm/mc_luma.asm
+++ b/codec/decoder/core/asm/mc_luma.asm
@ -69,16 +69,16 @@ WELS_EXTERN McHorVer20WidthEq4_mmx

 ALIGN 16
 ;*******************************************************************************
-; void_t McHorVer20WidthEq4_mmx( uint8_t *pSrc, 
-;                       int iSrcStride, 
-;						uint8_t *pDst, 
-;						int iDstStride, 
+; void_t McHorVer20WidthEq4_mmx( uint8_t *pSrc,
+;                       int iSrcStride,
+;						uint8_t *pDst,
+;						int iDstStride,
 ;						int iHeight)
 ;*******************************************************************************
 McHorVer20WidthEq4_mmx:
 	push esi
 	push edi
-	
+
 	mov  esi, [esp+12]
 	mov eax, [esp+16]
 	mov edi, [esp+20]
@ -100,7 +100,7 @@ McHorVer20WidthEq4_mmx:
 	punpcklbw mm4, mm7
 	movd mm5, [esi+3]
 	punpcklbw mm5, mm7
-	
+
 	paddw mm2, mm3
 	paddw mm4, mm5
 	psllw mm4, 2
@ -113,12 +113,12 @@ McHorVer20WidthEq4_mmx:
 	psraw mm0, 5
 	packuswb mm0, mm7
 	movd [edi], mm0
-	
+
 	add esi, eax
 	add edi, ecx
 	dec edx
 	jnz .height_loop
-	
+
 	WELSEMMS
 	pop edi
 	pop esi
@ -181,8 +181,8 @@ WELS_EXTERN McHorVer20WidthEq16_sse2

 ALIGN 16
 ;***********************************************************************
-; void_t McHorVer22Width8HorFirst_sse2(int16_t *pSrc, 
-;                       int16_t iSrcStride, 
+; void_t McHorVer22Width8HorFirst_sse2(int16_t *pSrc,
+;                       int16_t iSrcStride,
 ;						uint8_t *pDst,
 ;						int32_t iDstStride
 ;						int32_t iHeight
@ -197,11 +197,11 @@ McHorVer22Width8HorFirst_sse2:
 	mov edi, [esp+24]		;pDst
 	mov edx, [esp+28]	;iDstStride
 	mov ebx, [esp+32]	;iHeight
-	pxor xmm7, xmm7	
-	
+	pxor xmm7, xmm7
+
 	sub esi, eax				;;;;;;;;need more 5 lines.
 	sub esi, eax
-		
+
 .yloop_width_8:
 	movq xmm0, [esi]
 	punpcklbw xmm0, xmm7
@ -215,7 +215,7 @@ McHorVer22Width8HorFirst_sse2:
 	punpcklbw xmm4, xmm7
 	movq xmm5, [esi+3]
 	punpcklbw xmm5, xmm7
-	
+
 	paddw xmm2, xmm3
 	paddw xmm4, xmm5
 	psllw xmm4, 2
@ -225,7 +225,7 @@ McHorVer22Width8HorFirst_sse2:
 	psllw xmm4, 2
 	paddw xmm0, xmm4
 	movdqa [edi], xmm0
-		
+
 	add esi, eax
 	add edi, edx
 	dec ebx
@ -238,8 +238,8 @@ McHorVer22Width8HorFirst_sse2:
 ALIGN 16
 ;***********************************************************************
 ;void_t McHorVer22VerLast_sse2(
-;											uint8_t *pSrc, 
-;											int32_t pSrcStride, 
+;											uint8_t *pSrc,
+;											int32_t pSrcStride,
 ;											uint8_t * pDst,
 ;											int32_t iDstStride,
 ;											int32_t iWidth,
@ -250,17 +250,17 @@ ALIGN 16
 	paddw  %1, %6
 	movdqa %7, %2
 	movdqa %8, %3
-	
-	
+
+
 	paddw %7, %5
 	paddw %8, %4
-	
-	psubw  %1, %7   
-	psraw   %1, 2	  
-	paddw  %1, %8   
-	psubw  %1, %7 
-	psraw   %1, 2	
-	paddw  %8, %1   
+
+	psubw  %1, %7
+	psraw   %1, 2
+	paddw  %1, %8
+	psubw  %1, %7
+	psraw   %1, 2
+	paddw  %8, %1
 	paddw  %8, [h264_mc_hc_32]
 	psraw   %8, 6
 	packuswb %8, %8
@ -272,15 +272,15 @@ McHorVer22VerLast_sse2:
 	push edi
 	push ebx
 	push ebp
-	
+
 	mov esi, [esp+20]
 	mov eax, [esp+24]
 	mov edi, [esp+28]
 	mov edx, [esp+32]
 	mov ebx, [esp+36]
-	mov ecx, [esp+40]	
-	shr ebx, 3	
-	
+	mov ecx, [esp+40]
+	shr ebx, 3
+
 .width_loop:
 	movdqa xmm0, [esi]
 	movdqa xmm1, [esi+eax]
@ -290,73 +290,73 @@ McHorVer22VerLast_sse2:
 	lea esi, [esi+2*eax]
 	movdqa xmm4, [esi]
 	movdqa xmm5, [esi+eax]
-	
+
 	FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
 	dec ecx
 	lea esi, [esi+2*eax]
 	movdqa xmm6, [esi]
-	
+
 	movdqa xmm0, xmm1
 	movdqa xmm1, xmm2
 	movdqa xmm2, xmm3
 	movdqa xmm3, xmm4
 	movdqa xmm4, xmm5
 	movdqa xmm5, xmm6
-	
+
 	add edi, edx
-	sub esi, eax		
-	
+	sub esi, eax
+
 .start:
 	FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea esi, [esi+2*eax]
 	movdqa xmm6, [esi]
 	FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[edi+edx]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea edi, [edi+2*edx]
 	movdqa xmm7, [esi+eax]
 	FILTER_VER  xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [edi]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea esi, [esi+2*eax]
 	movdqa xmm0, [esi]
 	FILTER_VER  xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[edi+edx]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea edi, [edi+2*edx]
 	movdqa xmm1, [esi+eax]
 	FILTER_VER  xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[edi]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea esi, [esi+2*eax]
 	movdqa xmm2, [esi]
 	FILTER_VER  xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[edi+edx]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea edi, [edi+2*edx]
 	movdqa xmm3, [esi+eax]
 	FILTER_VER  xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[edi]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea esi, [esi+2*eax]
 	movdqa xmm4, [esi]
 	FILTER_VER  xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [edi+edx]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea edi, [edi+2*edx]
 	movdqa xmm5, [esi+eax]
 	jmp near .start
-	
+
 .x_loop_dec:
 	dec ebx
 	jz near .exit
@ -366,9 +366,9 @@ McHorVer22VerLast_sse2:
 	add esi, 16
 	add edi, 8
 	jmp .width_loop
-	
-	
-	
+
+
+
 .exit:
 	pop ebp
 	pop ebx
@ -379,28 +379,28 @@ McHorVer22VerLast_sse2:

 ALIGN 16
 ;*******************************************************************************
-; void_t McHorVer20WidthEq8_sse2(  uint8_t *pSrc, 
-;                       int iSrcStride, 
-;												uint8_t *pDst, 
-;												int iDstStride, 
+; void_t McHorVer20WidthEq8_sse2(  uint8_t *pSrc,
+;                       int iSrcStride,
+;												uint8_t *pDst,
+;												int iDstStride,
 ;												int iHeight,
 ;                      );
 ;*******************************************************************************
 McHorVer20WidthEq8_sse2:
 	push	esi
 	push	edi
-	
+
 	mov esi, [esp + 12]         ;pSrc
 	mov eax, [esp + 16]         ;iSrcStride
 	mov edi, [esp + 20]         ;pDst
 	mov ecx, [esp + 28]         ;iHeight
 	mov edx, [esp + 24]			;iDstStride
-	
+
 	lea esi, [esi-2]            ;pSrc -= 2;
-	
+
 	pxor xmm7, xmm7
 	movdqa xmm6, [h264_w0x10_1]
-.y_loop:	
+.y_loop:
 	movq xmm0, [esi]
 	punpcklbw xmm0, xmm7
 	movq xmm1, [esi+5]
@ -413,7 +413,7 @@ McHorVer20WidthEq8_sse2:
 	punpcklbw xmm4, xmm7
 	movq xmm5, [esi+3]
 	punpcklbw xmm5, xmm7
-	
+
 	paddw xmm2, xmm3
 	paddw xmm4, xmm5
 	psllw xmm4, 2
@ -424,7 +424,7 @@ McHorVer20WidthEq8_sse2:
 	paddw xmm0, xmm4
 	paddw xmm0, xmm6
 	psraw xmm0, 5
-	
+
 	packuswb xmm0, xmm7
 	movq [edi], xmm0

@ -432,37 +432,37 @@ McHorVer20WidthEq8_sse2:
 	lea esi, [esi+eax]
 	dec ecx
 	jnz near .y_loop
-	
+
 	pop edi
 	pop esi
 	ret
-	
+
 ALIGN 16
 ;*******************************************************************************
-; void_t McHorVer20WidthEq16_sse2(  uint8_t *pSrc, 
-;                       int iSrcStride, 
-;												uint8_t *pDst, 
-;												int iDstStride, 
+; void_t McHorVer20WidthEq16_sse2(  uint8_t *pSrc,
+;                       int iSrcStride,
+;												uint8_t *pDst,
+;												int iDstStride,
 ;												int iHeight,
 ;                      );
 ;*******************************************************************************
 McHorVer20WidthEq16_sse2:
 	push	esi
 	push	edi
-	
+

 	mov esi, [esp + 12]         ;pSrc
 	mov eax, [esp + 16]         ;iSrcStride
 	mov edi, [esp + 20]         ;pDst
 	mov ecx, [esp + 28]         ;iHeight
 	mov edx, [esp + 24]			;iDstStride
-	
+
 	lea esi, [esi-2]            ;pSrc -= 2;
-	
+
 	pxor xmm7, xmm7
 	movdqa xmm6, [h264_w0x10_1]
 .y_loop:
-	
+
 	movq xmm0, [esi]
 	punpcklbw xmm0, xmm7
 	movq xmm1, [esi+5]
@ -475,7 +475,7 @@ McHorVer20WidthEq16_sse2:
 	punpcklbw xmm4, xmm7
 	movq xmm5, [esi+3]
 	punpcklbw xmm5, xmm7
-	
+
 	paddw xmm2, xmm3
 	paddw xmm4, xmm5
 	psllw xmm4, 2
@ -501,7 +501,7 @@ McHorVer20WidthEq16_sse2:
 	punpcklbw xmm4, xmm7
 	movq xmm5, [esi+3+8]
 	punpcklbw xmm5, xmm7
-	
+
 	paddw xmm2, xmm3
 	paddw xmm4, xmm5
 	psllw xmm4, 2
@ -514,9 +514,9 @@ McHorVer20WidthEq16_sse2:
 	psraw xmm0, 5
 	packuswb xmm0, xmm7
 	movq [edi+8], xmm0
-	
-	lea edi, [edi+edx]	
-	lea esi, [esi+eax]	
+
+	lea edi, [edi+edx]
+	lea esi, [esi+eax]
 	dec ecx
 	jnz near .y_loop
 	pop edi
@ -525,17 +525,17 @@ McHorVer20WidthEq16_sse2:


 ;*******************************************************************************
-; void_t McHorVer02WidthEq8_sse2( uint8_t *pSrc, 
-;                       int iSrcStride, 
-;                       uint8_t *pDst, 
-;                       int iDstStride, 
+; void_t McHorVer02WidthEq8_sse2( uint8_t *pSrc,
+;                       int iSrcStride,
+;                       uint8_t *pDst,
+;                       int iDstStride,
 ;                       int iHeight )
 ;*******************************************************************************
 ALIGN 16
 McHorVer02WidthEq8_sse2:
 	push esi
 	push edi
-	
+
 	mov esi, [esp + 12]           ;pSrc
 	mov edx, [esp + 16]	          ;iSrcStride
 	mov edi, [esp + 20]           ;pDst
@ -546,7 +546,7 @@ McHorVer02WidthEq8_sse2:
 	sub esi, edx

 	WELS_Zero xmm7
-			
+
 	SSE_LOAD_8P xmm0, xmm7, [esi]
 	SSE_LOAD_8P xmm1, xmm7, [esi+edx]
 	lea esi, [esi+2*edx]
@ -555,8 +555,8 @@ McHorVer02WidthEq8_sse2:
 	lea esi, [esi+2*edx]
 	SSE_LOAD_8P xmm4, xmm7, [esi]
 	SSE_LOAD_8P xmm5, xmm7, [esi+edx]
-	
-.start:	
+
+.start:
 	FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
 	dec ecx
 	jz near .xx_exit
@ -566,7 +566,7 @@ McHorVer02WidthEq8_sse2:
 	FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [edi+eax]
 	dec ecx
 	jz near .xx_exit
-	
+
 	lea edi, [edi+2*eax]
 	SSE_LOAD_8P xmm7, xmm0, [esi+edx]
 	FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [edi]
--- a/codec/decoder/core/asm/memzero.asm
+++ b/codec/decoder/core/asm/memzero.asm
@ -32,7 +32,7 @@
 ;*  memzero.asm
 ;*
 ;*  Abstract
-;*      
+;*
 ;*
 ;*  History
 ;*      9/16/2009 Created
@ -47,8 +47,8 @@ BITS 32
 ; Code
 ;***********************************************************************

-SECTION .text			
-		
+SECTION .text
+
 ALIGN 16
 ;***********************************************************************
 ;_inline void __cdecl WelsPrefetchZero_mmx(int8_t const*_A);
@ -57,7 +57,7 @@ WELS_EXTERN WelsPrefetchZero_mmx
 WelsPrefetchZero_mmx:
 	mov  eax,[esp+4]
 	prefetchnta [eax]
-	ret 			
+	ret


 ALIGN 16
@ -69,7 +69,7 @@ WelsSetMemZeroAligned64_sse2:
 		mov		eax,	[esp + 4]          ; dst
 		mov		ecx,	[esp + 8]
 		neg		ecx
-			
+
 		pxor	xmm0,		xmm0
 .memzeroa64_sse2_loops:
 		movdqa	[eax],		xmm0
@ -77,11 +77,11 @@ WelsSetMemZeroAligned64_sse2:
 		movdqa	[eax+32],	xmm0
 		movdqa	[eax+48],	xmm0
 		add		eax, 0x40
-		
+
 		add ecx, 0x40
 		jnz near .memzeroa64_sse2_loops
-			
-		ret	
+
+		ret

 ALIGN 16
 ;***********************************************************************
@ -92,7 +92,7 @@ WelsSetMemZeroSize64_mmx:
 		mov		eax,	[esp + 4]          ; dst
 		mov		ecx,	[esp + 8]
 		neg		ecx
-			
+
 		pxor	mm0,		mm0
 .memzero64_mmx_loops:
 		movq	[eax],		mm0
@ -102,16 +102,16 @@ WelsSetMemZeroSize64_mmx:
 		movq	[eax+32],	mm0
 		movq	[eax+40],	mm0
 		movq	[eax+48],	mm0
-		movq	[eax+56],	mm0		
+		movq	[eax+56],	mm0
 		add		eax,		0x40
-		
+
 		add ecx, 0x40
 		jnz near .memzero64_mmx_loops
-			
-		WELSEMMS	
-		ret	
-	
-ALIGN 16		
+
+		WELSEMMS
+		ret
+
+ALIGN 16
 ;***********************************************************************
 ;   void WelsSetMemZeroSize8_mmx(void *dst, int32_t size)
 ;***********************************************************************
@ -119,17 +119,17 @@ WELS_EXTERN WelsSetMemZeroSize8_mmx
 WelsSetMemZeroSize8_mmx:
 		mov		eax,	[esp + 4]		; dst
 		mov		ecx,	[esp + 8]		; size
-		neg		ecx			
+		neg		ecx
 		pxor	mm0,		mm0
-		
+
 .memzero8_mmx_loops:
 		movq	[eax],		mm0
 		add		eax,		0x08
-	
+
 		add		ecx,		0x08
 		jnz near .memzero8_mmx_loops
-		
-		WELSEMMS	
-		ret	

-							
+		WELSEMMS
+		ret
+
+
--- a/codec/decoder/plus/res/welsdec.rc
+++ b/codec/decoder/plus/res/welsdec.rc
@ -27,18 +27,18 @@ LANGUAGE LANG_CHINESE, SUBLANG_CHINESE_SIMPLIFIED
 // TEXTINCLUDE
 //

-1 TEXTINCLUDE 
+1 TEXTINCLUDE
 BEGIN
    "resource.h\0"
 END

-2 TEXTINCLUDE 
+2 TEXTINCLUDE
 BEGIN
    "#include ""windows.h""\r\n"
    "\0"
 END

-3 TEXTINCLUDE 
+3 TEXTINCLUDE
 BEGIN
    "\r\n"
    "\0"
--- a/codec/encoder/core/asm/asm_inc.asm
+++ b/codec/encoder/core/asm/asm_inc.asm
@ -43,7 +43,7 @@
 ; Options, for DEBUG
 ;***********************************************************************

-%if 1 
+%if 1
 	%define MOVDQ movdqa
 %else
 	%define MOVDQ movdqu
@ -58,7 +58,7 @@
 BITS 32

 ;***********************************************************************
-; Macros 
+; Macros
 ;***********************************************************************

 %macro WELS_EXTERN 1
@ -74,7 +74,7 @@ BITS 32
 	pxor        %2, %2
    psubw       %2, %1
    pmaxsw      %1, %2
-%endmacro 	
+%endmacro

 %macro MMX_XSwap  4
    movq		%4, %2
@ -105,7 +105,7 @@ BITS 32
    SSE2_XSawp qdq, %5, %2, %3
 %endmacro

-;in: xmm0, xmm1, xmm2, xmm3  pOut:  xmm0, xmm1, xmm3, xmm4 
+;in: xmm0, xmm1, xmm2, xmm3  pOut:  xmm0, xmm1, xmm3, xmm4
 %macro SSE2_TransTwo4x4W 5
    SSE2_XSawp wd,  %1, %2, %5
    SSE2_XSawp wd,  %3, %4, %2
@ -125,26 +125,26 @@ BITS 32
 	movdqa	%6, %9
 	movdqa	%9, %4
 	SSE2_XSawp bw,  %7, %6, %4
-	
-	SSE2_XSawp wd,  %1, %3, %6	
+
+	SSE2_XSawp wd,  %1, %3, %6
 	SSE2_XSawp wd,  %8, %2, %3
 	SSE2_XSawp wd,  %5, %7, %2
 	movdqa	%7, %9
-	movdqa	%9, %3	
+	movdqa	%9, %3
 	SSE2_XSawp wd,  %7, %4, %3
-	
-	SSE2_XSawp dq,  %1, %5, %4	
+
+	SSE2_XSawp dq,  %1, %5, %4
 	SSE2_XSawp dq,  %6, %2, %5
 	SSE2_XSawp dq,  %8, %7, %2
 	movdqa	%7, %9
-	movdqa	%9, %5		
+	movdqa	%9, %5
 	SSE2_XSawp dq,  %7, %3, %5
-	
+
 	SSE2_XSawp qdq,  %1, %8, %3
 	SSE2_XSawp qdq,  %4, %2, %8
 	SSE2_XSawp qdq,  %6, %7, %2
 	movdqa	%7, %9
-	movdqa	%9, %1		
+	movdqa	%9, %1
 	SSE2_XSawp qdq,  %7, %5, %1
 	movdqa	%5, %9
 %endmacro
@ -170,9 +170,9 @@ BITS 32
 %macro butterfly_1to16_sse	3	; xmm? for dst, xmm? for tmp, one byte for pSrc [generic register name: a/b/c/d]
 	mov %3h, %3l
 	movd %1, e%3x		; i.e., %1 = eax (=b0)
-	pshuflw %2, %1, 00h	; ..., b0 b0 b0 b0 b0 b0 b0 b0	
-	pshufd %1, %2, 00h	; b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0	
-%endmacro  
+	pshuflw %2, %1, 00h	; ..., b0 b0 b0 b0 b0 b0 b0 b0
+	pshufd %1, %2, 00h	; b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0
+%endmacro

 ;copy a dw into an xmm register 8 times
 %macro  SSE2_Copy8Times 2
--- a/codec/encoder/core/asm/coeff.asm
+++ b/codec/encoder/core/asm/coeff.asm
@ -318,25 +318,25 @@ byte_1pos_table:
 SECTION .text


-	
+
 ;***********************************************************************
-;int32_t CavlcParamCal_sse2(int16_t*coffLevel, uint8_t* run, int16_t *Level, int32_t* total_coeffs , int32_t endIdx); 
+;int32_t CavlcParamCal_sse2(int16_t*coffLevel, uint8_t* run, int16_t *Level, int32_t* total_coeffs , int32_t endIdx);
 ;***********************************************************************
 WELS_EXTERN CavlcParamCal_sse2
 CavlcParamCal_sse2:
 	push ebx
 	push edi
 	push esi
-	
+
 	mov			eax,	[esp+16]	;coffLevel
 	mov			edi,	[esp+24]	;Level
 	mov			ebx,	[esp+32]	;endIdx
 	cmp			ebx,	3
-	jne			.Level16	
+	jne			.Level16
 	pxor		xmm1,	xmm1
 	movq		xmm0,	[eax]	; removed QWORD
-	jmp			.Cal_begin		
-.Level16:	
+	jmp			.Cal_begin
+.Level16:
 	movdqa		xmm0,	[eax]
 	movdqa		xmm1,	[eax+16]
 .Cal_begin:
@ -354,7 +354,7 @@ CavlcParamCal_sse2:
 	pcmpeqw		xmm7,	xmm7	;generate -1
    mov			ebx,	0xff
    ;pinsrw		xmm6,	ebx,	3
-   
+
    mov       bl,   dh

 	lea       ebx,  [byte_1pos_table+8*ebx]
@ -362,7 +362,7 @@ CavlcParamCal_sse2:
 	pextrw    ecx,  xmm0, 3
 	shr       ecx,  8
    mov       dh,   cl
- 
+
 .loopHighFind0:
    cmp       ecx,   0
    je        .loopHighFind0End
@ -372,7 +372,7 @@ CavlcParamCal_sse2:
    add       esi, 8
    mov       esi, [eax+2*esi]
    mov       [edi], si
-    add       edi,   2 
+    add       edi,   2
    ;add       ebx,   1
    inc		  ebx
    dec       ecx
@ -403,8 +403,8 @@ CavlcParamCal_sse2:
 	;and       edx, 0xff
 	movzx	  edx,	byte [ebx]
 	mov       edx, [eax+2*edx]
-	mov       [edi], dx 
-	add       edi,   2 
+	mov       [edi], dx
+	add       edi,   2
 	;add       ebx,   1
 	inc		  ebx
    dec       esi
@ -436,8 +436,8 @@ CavlcParamCal_sse2:
    psllq    xmm0, xmm3
    psrlq    xmm0, xmm3
    movdqa   xmm4, xmm1
-    psllq    xmm1, xmm2 
-    psrlq    xmm4, xmm3 
+    psllq    xmm1, xmm2
+    psrlq    xmm4, xmm3
    punpcklqdq xmm1, xmm4
    por      xmm0,  xmm1

--- a/codec/encoder/core/asm/cpuid.asm
+++ b/codec/encoder/core/asm/cpuid.asm
@ -84,12 +84,12 @@ ALIGN 16
 ;   void WelsCPUId( int32_t uiIndex, int32_t *pFeatureA, int32_t *pFeatureB, int32_t *pFeatureC, int32_t *pFeatureD )
 ;****************************************************************************************************
 WelsCPUId:
-	push	ebx	
+	push	ebx
 	push	edi
-	
+
 	mov     eax, [esp+12]	; operating index
    cpuid					; cpuid
-	
+
 	; processing various information return
 	mov     edi, [esp+16]
    mov     [edi], eax
@ -100,10 +100,10 @@ WelsCPUId:
    mov     edi, [esp+28]
    mov     [edi], edx

-	pop		edi	
+	pop		edi
    pop     ebx
 	ret
-	
+
 WELS_EXTERN WelsCPUSupportAVX
 ; must be called after cpuid with index 1, once its eax and ecx flags have been obtained
 ALIGN 16
@ -139,7 +139,7 @@ ALIGN 16
 WelsCPUSupportFMA:
 	mov eax, [esp+4]
 	mov ecx, [esp+8]
-	
+
 	; follows the FMA detection procedure described in the Intel AVX manual
 	and ecx, 018001000H
 	cmp ecx, 018001000H		; check OSXSAVE, AVX, FMA feature flags
@ -153,7 +153,7 @@ WelsCPUSupportFMA:
 	mov eax, 1
 	ret
 fma_not_supported:
-	mov eax, 0	
+	mov eax, 0
 	ret

 WELS_EXTERN WelsEmms
--- a/codec/encoder/core/asm/dct.asm
+++ b/codec/encoder/core/asm/dct.asm
@ -48,26 +48,26 @@ SECTION .rodata align=16

 ;***********************************************************************
 ; Constant
-;***********************************************************************		
-			
+;***********************************************************************
+
 align 16
-SSE2_DeQuant8 dw  10, 13, 10, 13, 13, 16, 13, 16, 
+SSE2_DeQuant8 dw  10, 13, 10, 13, 13, 16, 13, 16,
 			dw	10, 13, 10, 13, 13, 16, 13, 16,
-            dw  11, 14, 11, 14, 14, 18, 14, 18, 
+            dw  11, 14, 11, 14, 14, 18, 14, 18,
 			dw  11, 14, 11, 14, 14, 18, 14, 18,
-			dw  13, 16, 13, 16, 16, 20, 16, 20, 
 			dw  13, 16, 13, 16, 16, 20, 16, 20,
-            dw  14, 18, 14, 18, 18, 23, 18, 23, 
+			dw  13, 16, 13, 16, 16, 20, 16, 20,
+            dw  14, 18, 14, 18, 18, 23, 18, 23,
 			dw  14, 18, 14, 18, 18, 23, 18, 23,
-			dw  16, 20, 16, 20, 20, 25, 20, 25, 
 			dw  16, 20, 16, 20, 20, 25, 20, 25,
-            dw  18, 23, 18, 23, 23, 29, 23, 29, 
+			dw  16, 20, 16, 20, 20, 25, 20, 25,
+            dw  18, 23, 18, 23, 23, 29, 23, 29,
 			dw  18, 23, 18, 23, 23, 29, 23, 29
-			
+

 ;***********************************************************************
 ; MMX functions
-;***********************************************************************			
+;***********************************************************************

 %macro MMX_LoadDiff4P 5
 	movd        %1, [%3]
@ -112,7 +112,7 @@ SSE2_DeQuant8 dw  10, 13, 10, 13, 13, 16, 13, 16,
    MMX_SumSub		%4, %1, %6
    MMX_SumSub		%3, %2, %6
    MMX_SumSub		%3, %4, %6
-    MMX_SumSubMul2  %1, %2, %5  
+    MMX_SumSubMul2  %1, %2, %5
 %endmacro

 %macro MMX_IDCT 6
@ -145,13 +145,13 @@ WelsDctT4_mmx:
    mov     edx, [esp+24]   ; i_pix2

    WELS_Zero    mm7
-    
+
    MMX_LoadDiff4x4P mm1, mm2, mm3, mm4, eax, ebx, ecx, edx, mm0, mm7

-    MMX_DCT			mm1, mm2, mm3 ,mm4, mm5, mm6           
+    MMX_DCT			mm1, mm2, mm3 ,mm4, mm5, mm6
    MMX_Trans4x4W	mm3, mm1, mm4, mm5, mm2
-    
-    MMX_DCT			mm3, mm5, mm2 ,mm4, mm1, mm6                    
+
+    MMX_DCT			mm3, mm5, mm2 ,mm4, mm1, mm6
    MMX_Trans4x4W	mm2, mm3, mm4, mm1, mm5

    mov     eax, [esp+ 8]   ; pDct
@ -178,15 +178,15 @@ WelsIDctT4Rec_mmx:
 %define     i_pred      esp+pushsize+16
 %define     pDct        esp+pushsize+20

-	mov     eax, [pDct   ] 
+	mov     eax, [pDct   ]
    movq    mm0, [eax+ 0]
    movq    mm1, [eax+ 8]
    movq    mm2, [eax+16]
    movq    mm3, [eax+24]
-    mov     edx, [p_dst ]   
-    mov     ecx, [i_dst ]   
+    mov     edx, [p_dst ]
+    mov     ecx, [i_dst ]
    mov     eax, [p_pred]
-    mov     ebx, [i_pred]     
+    mov     ebx, [i_pred]

 	MMX_Trans4x4W		mm0, mm1, mm2, mm3, mm4
 	MMX_IDCT			mm1, mm2, mm3, mm4, mm0, mm6
@ -195,14 +195,14 @@ WelsIDctT4Rec_mmx:

    WELS_Zero			mm7
    WELS_DW32			mm6
-    
+
    MMX_StoreDiff4P		mm3, mm0, mm6, mm7, [edx], [eax]
    MMX_StoreDiff4P		mm4, mm0, mm6, mm7, [edx+ecx], [eax+ebx]
    lea     edx, [edx+2*ecx]
    lea     eax, [eax+2*ebx]
    MMX_StoreDiff4P		mm1, mm0, mm6, mm7, [edx], [eax]
    MMX_StoreDiff4P		mm2, mm0, mm6, mm7, [edx+ecx], [eax+ebx]
-    
+
 	WELSEMMS
 %undef	pushsize
 %undef  p_dst
@ -220,17 +220,17 @@ WelsIDctT4Rec_mmx:
 %macro SSE2_Store4x8p 6
 	SSE2_XSawp qdq, %2, %3, %6
 	SSE2_XSawp qdq, %4, %5, %3
-	MOVDQ    [%1+0x00], %2 
-	MOVDQ    [%1+0x10], %4 
-	MOVDQ    [%1+0x20], %6 
-	MOVDQ    [%1+0x30], %3 
+	MOVDQ    [%1+0x00], %2
+	MOVDQ    [%1+0x10], %4
+	MOVDQ    [%1+0x20], %6
+	MOVDQ    [%1+0x30], %3
 %endmacro

 %macro SSE2_Load4x8p 6
 	MOVDQ    %2,	[%1+0x00]
-	MOVDQ    %4,	[%1+0x10]  
-	MOVDQ    %6,	[%1+0x20]  
-	MOVDQ    %3,	[%1+0x30]  
+	MOVDQ    %4,	[%1+0x10]
+	MOVDQ    %6,	[%1+0x20]
+	MOVDQ    %3,	[%1+0x30]
 	SSE2_XSawp qdq, %4, %3, %5
 	SSE2_XSawp qdq, %2, %6, %3
 %endmacro
@ -271,40 +271,40 @@ WelsIDctT4Rec_mmx:
 %endmacro

 %macro SSE2_Load8DC	6
-	movdqa		%1,		%6		; %1 = dc0 dc1	
+	movdqa		%1,		%6		; %1 = dc0 dc1
 	paddw       %1,		%5
-    psraw       %1,		$6		; (dc + 32) >> 6	
-    
+    psraw       %1,		$6		; (dc + 32) >> 6
+
    movdqa		%2,		%1
    psrldq		%2,		4
 	punpcklwd	%2,		%2
-	punpckldq	%2,		%2		; %2 = dc2 dc2 dc2 dc2 dc3 dc3 dc3 dc3	   
+	punpckldq	%2,		%2		; %2 = dc2 dc2 dc2 dc2 dc3 dc3 dc3 dc3

    movdqa		%3,		%1
    psrldq		%3,		8
 	punpcklwd	%3,		%3
 	punpckldq	%3,		%3		; %3 = dc4 dc4 dc4 dc4 dc5 dc5 dc5 dc5
-	
+
 	movdqa		%4,		%1
    psrldq		%4,		12
 	punpcklwd	%4,		%4
 	punpckldq	%4,		%4		; %4 = dc6 dc6 dc6 dc6 dc7 dc7 dc7 dc7
-	    	
+
 	punpcklwd	%1,		%1
-	punpckldq	%1,		%1		; %1 = dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1	
+	punpckldq	%1,		%1		; %1 = dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1
 %endmacro

 %macro SSE2_DCT 6
-    SSE2_SumSub		%6, %3,	%5						
-	SSE2_SumSub		%1, %2, %5																		
-	SSE2_SumSub		%3, %2, %5					
-	SSE2_SumSubMul2		%6, %1, %4               	
+    SSE2_SumSub		%6, %3,	%5
+	SSE2_SumSub		%1, %2, %5
+	SSE2_SumSub		%3, %2, %5
+	SSE2_SumSubMul2		%6, %1, %4
 %endmacro

 %macro SSE2_IDCT 7
-    SSE2_SumSub       %7, %2, %6					
-    SSE2_SumSubDiv2     %1, %3, %5, %4              
-    SSE2_SumSub	     %2, %1, %5 
+    SSE2_SumSub       %7, %2, %6
+    SSE2_SumSubDiv2     %1, %3, %5, %4
+    SSE2_SumSub	     %2, %1, %5
    SSE2_SumSub		 %7, %4, %5
 %endmacro

@ -316,12 +316,12 @@ ALIGN 16
 WelsDctFourT4_sse2:
    push    ebx
    push	esi
-    mov		esi, [esp+12] 
+    mov		esi, [esp+12]
    mov     eax, [esp+16]   ; pix1
    mov     ebx, [esp+20]   ; i_pix1
    mov     ecx, [esp+24]   ; pix2
-    mov     edx, [esp+28]   ; i_pix2    
-    
+    mov     edx, [esp+28]   ; i_pix2
+
    pxor    xmm7, xmm7

 	;Load 4x8
@ -331,33 +331,33 @@ WelsDctFourT4_sse2:
 	lea		ecx, [ecx + 2 * edx]
 	SSE2_LoadDiff8P    xmm2, xmm6, xmm7, [eax], [ecx]
    SSE2_LoadDiff8P    xmm3, xmm6, xmm7, [eax+ebx], [ecx+edx]
-	
+
 	SSE2_DCT			xmm1, xmm2, xmm3, xmm4, xmm5, xmm0
 	SSE2_TransTwo4x4W	xmm2, xmm0, xmm3, xmm4, xmm1
-	SSE2_DCT			xmm0, xmm4, xmm1, xmm3, xmm5, xmm2             		
+	SSE2_DCT			xmm0, xmm4, xmm1, xmm3, xmm5, xmm2
 	SSE2_TransTwo4x4W	xmm4, xmm2, xmm1, xmm3, xmm0
-	
-	SSE2_Store4x8p esi, xmm4, xmm2, xmm3, xmm0, xmm5  
-	
+
+	SSE2_Store4x8p esi, xmm4, xmm2, xmm3, xmm0, xmm5
+
 	lea		eax, [eax + 2 * ebx]
 	lea		ecx, [ecx + 2 * edx]
-    
+
 	;Load 4x8
 	SSE2_LoadDiff8P    xmm0, xmm6, xmm7, [eax      ], [ecx    ]
    SSE2_LoadDiff8P    xmm1, xmm6, xmm7, [eax+ebx  ], [ecx+edx]
 	lea		eax, [eax + 2 * ebx]
-	lea		ecx, [ecx + 2 * edx]	
+	lea		ecx, [ecx + 2 * edx]
    SSE2_LoadDiff8P    xmm2, xmm6, xmm7, [eax], [ecx]
    SSE2_LoadDiff8P    xmm3, xmm6, xmm7, [eax+ebx], [ecx+edx]
-	
+
 	SSE2_DCT			xmm1, xmm2, xmm3, xmm4, xmm5, xmm0
-	SSE2_TransTwo4x4W	xmm2, xmm0, xmm3, xmm4, xmm1		
-    SSE2_DCT			xmm0, xmm4, xmm1, xmm3, xmm5, xmm2              		
+	SSE2_TransTwo4x4W	xmm2, xmm0, xmm3, xmm4, xmm1
+    SSE2_DCT			xmm0, xmm4, xmm1, xmm3, xmm5, xmm2
 	SSE2_TransTwo4x4W	xmm4, xmm2, xmm1, xmm3, xmm0
-	
+
 	lea		esi, [esi+64]
-	SSE2_Store4x8p esi, xmm4, xmm2, xmm3, xmm0, xmm5 
-	
+	SSE2_Store4x8p esi, xmm4, xmm2, xmm3, xmm0, xmm5
+
    pop esi
    pop ebx
    ret
@ -377,62 +377,62 @@ WelsIDctFourT4Rec_sse2:
 %define	pushsize	8
    push		ebx
    push		esi
-    
-    mov			eax,		[rec]   
-    mov			ebx,		[stride]   
-    mov			ecx,		[pred]  
-    mov			edx,		[pred_stride]   
-    mov			esi,		[rs]  
+
+    mov			eax,		[rec]
+    mov			ebx,		[stride]
+    mov			ecx,		[pred]
+    mov			edx,		[pred_stride]
+    mov			esi,		[rs]

 	;Load 4x8
-	SSE2_Load4x8p  esi, xmm0, xmm1, xmm4, xmm2, xmm5 	
-	
+	SSE2_Load4x8p  esi, xmm0, xmm1, xmm4, xmm2, xmm5
+
 	SSE2_TransTwo4x4W	xmm0, xmm1, xmm4, xmm2, xmm3
  	SSE2_IDCT			xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0
    SSE2_TransTwo4x4W	xmm1, xmm4, xmm0, xmm2, xmm3
    SSE2_IDCT			xmm4, xmm2, xmm3, xmm0, xmm5, xmm6, xmm1
-    
+
 	WELS_Zero			xmm7
    WELS_DW32			xmm6

 	SSE2_StoreDiff8p   xmm4, xmm5, xmm6, xmm7, [eax		],	[ecx]
 	SSE2_StoreDiff8p   xmm0, xmm5, xmm6, xmm7, [eax + ebx	],	[ecx + edx]
 	lea		eax, [eax + 2 * ebx]
-	lea		ecx, [ecx + 2 * edx]	
+	lea		ecx, [ecx + 2 * edx]
 	SSE2_StoreDiff8p   xmm1, xmm5, xmm6, xmm7, [eax],			[ecx]
 	SSE2_StoreDiff8p   xmm2, xmm5, xmm6, xmm7, [eax + ebx	],	[ecx + edx]
-   
+
    add		esi, 64
 	lea		eax, [eax + 2 * ebx]
 	lea		ecx, [ecx + 2 * edx]
-   	SSE2_Load4x8p  esi, xmm0, xmm1, xmm4, xmm2, xmm5 	
-	
+   	SSE2_Load4x8p  esi, xmm0, xmm1, xmm4, xmm2, xmm5
+
 	SSE2_TransTwo4x4W   xmm0, xmm1, xmm4, xmm2, xmm3
-	SSE2_IDCT			xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0           
+	SSE2_IDCT			xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0
    SSE2_TransTwo4x4W   xmm1, xmm4, xmm0, xmm2, xmm3
 	SSE2_IDCT			xmm4, xmm2, xmm3, xmm0, xmm5, xmm6, xmm1

 	WELS_Zero			xmm7
    WELS_DW32			xmm6
-    
+
 	SSE2_StoreDiff8p   xmm4, xmm5, xmm6, xmm7, [eax		],	[ecx]
 	SSE2_StoreDiff8p   xmm0, xmm5, xmm6, xmm7, [eax + ebx	],	[ecx + edx]
 	lea		eax, [eax + 2 * ebx]
-	lea		ecx, [ecx + 2 * edx]	
+	lea		ecx, [ecx + 2 * edx]
 	SSE2_StoreDiff8p   xmm1, xmm5, xmm6, xmm7, [eax],			[ecx]
-	SSE2_StoreDiff8p   xmm2, xmm5, xmm6, xmm7, [eax + ebx],	[ecx + edx] 
+	SSE2_StoreDiff8p   xmm2, xmm5, xmm6, xmm7, [eax + ebx],	[ecx + edx]

    pop		esi
    pop		ebx
    ret
-    
+
  %macro SSE2_StoreDiff4x8p 8
   	SSE2_StoreDiff8p    %1, %3, %4, [%5],			[%6]
-	SSE2_StoreDiff8p    %1, %3, %4, [%5 + %7],		[%6 + %8]	
+	SSE2_StoreDiff8p    %1, %3, %4, [%5 + %7],		[%6 + %8]
 	SSE2_StoreDiff8p    %2, %3, %4, [%5 + 8],		[%6 + 8]
-	SSE2_StoreDiff8p    %2, %3, %4, [%5 + %7 + 8],	[%6 + %8 + 8]	
+	SSE2_StoreDiff8p    %2, %3, %4, [%5 + %7 + 8],	[%6 + %8 + 8]
 %endmacro
- 
+
 ;***********************************************************************
 ; void WelsIDctRecI16x16Dc_sse2(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *dct_dc)
 ;***********************************************************************
@ -443,47 +443,47 @@ ALIGN 16
 WelsIDctRecI16x16Dc_sse2:
    push		esi
    push		edi
-    
+
 	mov			ecx,		[luma_dc]
-    mov			eax,		[rec]	
-    mov			edx,		[stride]	
-    mov			esi,		[pred]	
-    mov			edi,		[pred_stride]	    	
+    mov			eax,		[rec]
+    mov			edx,		[stride]
+    mov			esi,		[pred]
+    mov			edi,		[pred_stride]
 	pxor		xmm7,		xmm7
    WELS_DW32	xmm6
-    
+
 	SSE2_Load8DC			xmm0, xmm1, xmm2, xmm3, xmm6, [ecx]
 	SSE2_StoreDiff4x8p		xmm0, xmm1, xmm5, xmm7, eax, esi, edx, edi
-	
+
 	lea			eax,		[eax + 2 * edx]
-	lea			esi,		[esi + 2 * edi]	
-	SSE2_StoreDiff4x8p		xmm0, xmm1, xmm5, xmm7, eax, esi, edx, edi	
-	  
-	lea			eax,		[eax + 2 * edx]
-	lea			esi,		[esi + 2 * edi]	
-	SSE2_StoreDiff4x8p		xmm2, xmm3, xmm5, xmm7, eax, esi, edx, edi
-	
-	lea			eax,		[eax + 2 * edx]
-	lea			esi,		[esi + 2 * edi]		
-	SSE2_StoreDiff4x8p		xmm2, xmm3, xmm5, xmm7, eax, esi, edx, edi
-	
-	SSE2_Load8DC			xmm0, xmm1, xmm2, xmm3, xmm6, [ecx + 16]	
-	lea			eax,		[eax + 2 * edx]
-	lea			esi,		[esi + 2 * edi]		
+	lea			esi,		[esi + 2 * edi]
 	SSE2_StoreDiff4x8p		xmm0, xmm1, xmm5, xmm7, eax, esi, edx, edi
-	
+
 	lea			eax,		[eax + 2 * edx]
-	lea			esi,		[esi + 2 * edi]	
-	SSE2_StoreDiff4x8p		xmm0, xmm1, xmm5, xmm7, eax, esi, edx, edi	
-	  
-	lea			eax,		[eax + 2 * edx]
-	lea			esi,		[esi + 2 * edi]	 
+	lea			esi,		[esi + 2 * edi]
 	SSE2_StoreDiff4x8p		xmm2, xmm3, xmm5, xmm7, eax, esi, edx, edi
-	
+
 	lea			eax,		[eax + 2 * edx]
-	lea			esi,		[esi + 2 * edi]		
+	lea			esi,		[esi + 2 * edi]
 	SSE2_StoreDiff4x8p		xmm2, xmm3, xmm5, xmm7, eax, esi, edx, edi
-		
+
+	SSE2_Load8DC			xmm0, xmm1, xmm2, xmm3, xmm6, [ecx + 16]
+	lea			eax,		[eax + 2 * edx]
+	lea			esi,		[esi + 2 * edi]
+	SSE2_StoreDiff4x8p		xmm0, xmm1, xmm5, xmm7, eax, esi, edx, edi
+
+	lea			eax,		[eax + 2 * edx]
+	lea			esi,		[esi + 2 * edi]
+	SSE2_StoreDiff4x8p		xmm0, xmm1, xmm5, xmm7, eax, esi, edx, edi
+
+	lea			eax,		[eax + 2 * edx]
+	lea			esi,		[esi + 2 * edi]
+	SSE2_StoreDiff4x8p		xmm2, xmm3, xmm5, xmm7, eax, esi, edx, edi
+
+	lea			eax,		[eax + 2 * edx]
+	lea			esi,		[esi + 2 * edi]
+	SSE2_StoreDiff4x8p		xmm2, xmm3, xmm5, xmm7, eax, esi, edx, edi
+
    pop		edi
    pop		esi
    ret
@ -517,7 +517,7 @@ WelsIDctRecI16x16Dc_sse2:
 	punpckldq	%3,			%4
 	punpcklqdq	%1,			%3
 %endmacro
- 
+
 ;***********************************************************************
 ;void WelsHadamardT4Dc_sse2( int16_t *luma_dc, int16_t *pDct)
 ;***********************************************************************
@ -525,23 +525,23 @@ WELS_EXTERN WelsHadamardT4Dc_sse2
 WelsHadamardT4Dc_sse2:
 		mov			eax,		[esp + 4]	; luma_dc
 		mov			ecx,		[esp + 8]	; pDct
-		
+
 		SSE2_Load4Col	    xmm1, xmm5, xmm6, xmm0, ecx
 		SSE2_Load4Col	    xmm2, xmm5, xmm6, xmm0, ecx + 0x40
 		SSE2_Load4Col	    xmm3, xmm5, xmm6, xmm0, ecx + 0x100
 		SSE2_Load4Col	    xmm4, xmm5, xmm6, xmm0, ecx + 0x140
-		
+
 		SSE2_SumSubD		xmm1, xmm2, xmm7
 		SSE2_SumSubD		xmm3, xmm4, xmm7
 		SSE2_SumSubD		xmm2, xmm4, xmm7
-		SSE2_SumSubD		xmm1, xmm3, xmm7	
+		SSE2_SumSubD		xmm1, xmm3, xmm7

 		SSE2_Trans4x4D		xmm4, xmm2, xmm1, xmm3, xmm5	; pOut: xmm4,xmm3,xmm5,xmm1
-	
+
 		SSE2_SumSubD		xmm4, xmm3, xmm7
 		SSE2_SumSubD		xmm5, xmm1, xmm7

-		WELS_DD1 xmm6      
+		WELS_DD1 xmm6
 		SSE2_SumSubDiv2D	xmm3, xmm1, xmm6, xmm0			; pOut: xmm3 = (xmm3+xmm1+1)/2, xmm0 = (xmm3-xmm1+1)/2
 		SSE2_SumSubDiv2D	xmm4, xmm5, xmm6, xmm1			; pOut: xmm4 = (xmm4+xmm5+1)/2, xmm1 = (xmm4-xmm5+1)/2
        SSE2_Trans4x4D		xmm3, xmm0, xmm1, xmm4, xmm2	; pOut: xmm3,xmm4,xmm2,xmm1
@ -550,7 +550,7 @@ WelsHadamardT4Dc_sse2:
 		packssdw	xmm2,	xmm1
 		movdqa	[eax+ 0],   xmm3
 		movdqa	[eax+16],   xmm2
-		
-		ret	
+
+		ret


--- a/codec/encoder/core/asm/deblock.asm
+++ b/codec/encoder/core/asm/deblock.asm
--- a/codec/encoder/core/asm/expand_picture.asm
+++ b/codec/encoder/core/asm/expand_picture.asm
@ -153,11 +153,11 @@ WELS_EXTERN ExpandPictureChromaUnalign_sse2	; for chroma unalignment
 	lea %1, [%1+%2]
 %endmacro

-%macro exp_top_bottom_sse2	1	; iPaddingSize [luma(32)/chroma(16)]		
+%macro exp_top_bottom_sse2	1	; iPaddingSize [luma(32)/chroma(16)]
 	; ebx [width/16(8)]
 	; esi [pSrc+0], edi [pSrc-1], ecx [-stride], 32(16)		; top
 	; eax [pSrc+(h-1)*stride], ebp [pSrc+(h+31)*stride], 32(16)	; bottom
-		
+
 %if %1 == 32		; for luma
 	sar ebx, 04h 	; width / 16(8) pixels
 .top_bottom_loops:
@ -171,7 +171,7 @@ WELS_EXTERN ExpandPictureChromaUnalign_sse2	; for chroma unalignment
 	mov_line_16x4_sse2 edi, ecx, xmm0, a
 	mov_line_16x4_sse2 edi, ecx, xmm0, a
 	mov_line_end16x4_sse2 edi, ecx, xmm0, a
-	
+
 	; bottom
 	movdqa xmm1, [eax] 		; last line of picture pData
 	mov_line_16x4_sse2 ebp, ecx, xmm1, a	; dst, stride, xmm?
@ -182,15 +182,15 @@ WELS_EXTERN ExpandPictureChromaUnalign_sse2	; for chroma unalignment
 	mov_line_16x4_sse2 ebp, ecx, xmm1, a
 	mov_line_16x4_sse2 ebp, ecx, xmm1, a
 	mov_line_end16x4_sse2 ebp, ecx, xmm1, a
-		
+
 	lea esi, [esi+16]		; top pSrc
 	lea edi, [edi+16]		; top dst
 	lea eax, [eax+16]		; bottom pSrc
 	lea ebp, [ebp+16]		; bottom dst
-	neg ecx 			; positive/negative stride need for next loop?	
-	
+	neg ecx 			; positive/negative stride need for next loop?
+
 	dec ebx
-	jnz near .top_bottom_loops		
+	jnz near .top_bottom_loops
 %elif %1 == 16	; for chroma ??
 	mov edx, ebx
 	sar ebx, 04h 	; (width / 16) pixels
@ -200,21 +200,21 @@ WELS_EXTERN ExpandPictureChromaUnalign_sse2	; for chroma unalignment
 	mov_line_16x4_sse2 edi, ecx, xmm0, a	; dst, stride, xmm?
 	mov_line_16x4_sse2 edi, ecx, xmm0, a
 	mov_line_16x4_sse2 edi, ecx, xmm0, a
-	mov_line_end16x4_sse2 edi, ecx, xmm0, a	
-	
+	mov_line_end16x4_sse2 edi, ecx, xmm0, a
+
 	; bottom
 	movdqa xmm1, [eax] 		; last line of picture pData
 	mov_line_16x4_sse2 ebp, ecx, xmm1, a	; dst, stride, xmm?
 	mov_line_16x4_sse2 ebp, ecx, xmm1, a
 	mov_line_16x4_sse2 ebp, ecx, xmm1, a
-	mov_line_end16x4_sse2 ebp, ecx, xmm1, a	
-		
+	mov_line_end16x4_sse2 ebp, ecx, xmm1, a
+
 	lea esi, [esi+16]		; top pSrc
 	lea edi, [edi+16]		; top dst
 	lea eax, [eax+16]		; bottom pSrc
 	lea ebp, [ebp+16]		; bottom dst
-	neg ecx 			; positive/negative stride need for next loop?	
-	
+	neg ecx 			; positive/negative stride need for next loop?
+
 	dec ebx
 	jnz near .top_bottom_loops

@ -241,50 +241,50 @@ WELS_EXTERN ExpandPictureChromaUnalign_sse2	; for chroma unalignment
 %endif
 %endmacro

-%macro exp_left_right_sse2	2	; iPaddingSize [luma(32)/chroma(16)], u/a	
+%macro exp_left_right_sse2	2	; iPaddingSize [luma(32)/chroma(16)], u/a
 	; ecx [height]
 	; esi [pSrc+0], 	   edi [pSrc-32], edx [stride], 32(16)	; left
 	; ebx [pSrc+(w-1)], ebp [pSrc+w], 32(16)			; right
 ;	xor eax, eax 	; for pixel pData (uint8_t)		; make sure eax=0 at least high 24 bits of eax = 0
-	
-%if %1 == 32		; for luma	
+
+%if %1 == 32		; for luma
 .left_right_loops:
 	; left
 	mov al, byte [esi]		; pixel pData for left border
 	butterfly_1to16_sse	xmm0, xmm1, a				; dst, tmp, pSrc [generic register name: a/b/c/d]
 	movdqa [edi], xmm0
 	movdqa [edi+16], xmm0
-	
+
 	; right
 	mov al, byte [ebx]
 	butterfly_1to16_sse	xmm1, xmm2, a				; dst, tmp, pSrc [generic register name: a/b/c/d]
 	movdqa [ebp], xmm1
 	movdqa [ebp+16], xmm1
-	
+
 	lea esi, [esi+edx]		; left pSrc
 	lea edi, [edi+edx]		; left dst
 	lea ebx, [ebx+edx]		; right pSrc
-	lea ebp, [ebp+edx]		; right dst	
-	
+	lea ebp, [ebp+edx]		; right dst
+
 	dec ecx
-	jnz near .left_right_loops		
-%elif %1 == 16	; for chroma ??	
+	jnz near .left_right_loops
+%elif %1 == 16	; for chroma ??
 .left_right_loops:
 	; left
 	mov al, byte [esi]		; pixel pData for left border
 	butterfly_1to16_sse	xmm0, xmm1, a				; dst, tmp, pSrc [generic register name: a/b/c/d]
-	movdqa [edi], xmm0	
-	
+	movdqa [edi], xmm0
+
 	; right
 	mov al, byte [ebx]
 	butterfly_1to16_sse	xmm1, xmm2, a				; dst, tmp, pSrc [generic register name: a/b/c/d]
 	movdq%2 [ebp], xmm1								; might not be aligned 16 bytes in case chroma planes
-	
+
 	lea esi, [esi+edx]		; left pSrc
 	lea edi, [edi+edx]		; left dst
 	lea ebx, [ebx+edx]		; right pSrc
-	lea ebp, [ebp+edx]		; right dst	
-	
+	lea ebp, [ebp+edx]		; right dst
+
 	dec ecx
 	jnz near .left_right_loops
 %endif
@ -337,25 +337,25 @@ WELS_EXTERN ExpandPictureChromaUnalign_sse2	; for chroma unalignment
 	; TL
 	mov_line_16x4_sse2	edi, ecx, xmm3, a	; dst, stride, xmm?
 	mov_line_16x4_sse2	edi, ecx, xmm3, a	; dst, stride, xmm?
-	mov_line_16x4_sse2	edi, ecx, xmm3, a	; dst, stride, xmm?	
+	mov_line_16x4_sse2	edi, ecx, xmm3, a	; dst, stride, xmm?
 	mov_line_end16x4_sse2	edi, ecx, xmm3, a	; dst, stride, xmm?

 	; TR
 	mov_line_16x4_sse2	ebp, ecx, xmm4, %2	; dst, stride, xmm?
 	mov_line_16x4_sse2	ebp, ecx, xmm4, %2	; dst, stride, xmm?
-	mov_line_16x4_sse2	ebp, ecx, xmm4, %2	; dst, stride, xmm?	
+	mov_line_16x4_sse2	ebp, ecx, xmm4, %2	; dst, stride, xmm?
 	mov_line_end16x4_sse2 ebp, ecx, xmm4, %2	; dst, stride, xmm?

 	; BL
 	mov_line_16x4_sse2	eax, ecx, xmm5, a	; dst, stride, xmm?
 	mov_line_16x4_sse2	eax, ecx, xmm5, a	; dst, stride, xmm?
-	mov_line_16x4_sse2	eax, ecx, xmm5, a	; dst, stride, xmm?	
+	mov_line_16x4_sse2	eax, ecx, xmm5, a	; dst, stride, xmm?
 	mov_line_end16x4_sse2	eax, ecx, xmm5, a	; dst, stride, xmm?

 	; BR
 	mov_line_16x4_sse2	ebx, ecx, xmm6, %2	; dst, stride, xmm?
 	mov_line_16x4_sse2	ebx, ecx, xmm6, %2	; dst, stride, xmm?
-	mov_line_16x4_sse2	ebx, ecx, xmm6, %2	; dst, stride, xmm?	
+	mov_line_16x4_sse2	ebx, ecx, xmm6, %2	; dst, stride, xmm?
 	mov_line_end16x4_sse2	ebx, ecx, xmm6, %2	; dst, stride, xmm?
 %endif
 %endmacro
@ -373,7 +373,7 @@ ExpandPictureLuma_sse2:
 	push esi
 	push edi
 	push ebp
-	
+
 	; for both top and bottom border
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; p_dst
@ -385,10 +385,10 @@ ExpandPictureLuma_sse2:
 	mov cl, byte [esi]
 	butterfly_1to16_sse xmm3, xmm4, c		; dst, tmp, pSrc [generic register name: a/b/c/d]
 	; load top border
-	mov ecx, edx							; stride	
+	mov ecx, edx							; stride
 	neg ecx 								; -stride
 	lea edi, [esi+ecx]						; last line of top border
-	; load bottom border 
+	; load bottom border
 	dec eax									; h-1
 	imul eax, edx 							; (h-1)*stride
 	lea eax, [esi+eax]						; last line of picture pData
@ -396,16 +396,16 @@ ExpandPictureLuma_sse2:
 	lea ebp, [eax+edx]						; last line of bottom border, (h-1)*stride + 32 * stride
 	; also prepare for cross border pData: bottom-left with xmm5, bottom-right xmm6
 	dec ebx									; width-1
-	lea ebx, [eax+ebx]						; dst[w-1][h-1]	
+	lea ebx, [eax+ebx]						; dst[w-1][h-1]
 ;	xor edx, edx
 	mov dl, byte [eax]						; bottom-left
 	butterfly_1to16_sse xmm5, xmm6, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
 	mov dl, byte [ebx]						; bottom-right
 	butterfly_1to16_sse xmm6, xmm4, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
-	; for top & bottom expanding	
+	; for top & bottom expanding
 	mov ebx, [esp+32]						; width
-	exp_top_bottom_sse2	32	
-	
+	exp_top_bottom_sse2	32
+
 	; for both left and right border
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; p_dst: left border pSrc
@ -417,14 +417,14 @@ ExpandPictureLuma_sse2:
 	lea edi, [esi+eax]						; left border dst
 	dec ebx
 	lea ebx, [esi+ebx]						; right border pSrc, (p_dst + width - 1)
-	lea ebp, [ebx+1]						; right border dst	
+	lea ebp, [ebx+1]						; right border dst
 	; prepare for cross border pData: top-right with xmm4
 ;	xor eax, eax
 	mov al, byte [ebx]						; top-right
 	butterfly_1to16_sse xmm4, xmm0, a		; dst, tmp, pSrc [generic register name: a/b/c/d]
 	; for left & right border expanding
 	exp_left_right_sse2	32, a
-	
+
 	; for cross border [top-left, top-right, bottom-left, bottom-right]
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; p_dst
@ -434,7 +434,7 @@ ExpandPictureLuma_sse2:
 	; xmm3,..,xmm6 cross pData were initialized above; perform padding as below, to be continued..
 	mov eax, -32							; luma=-32, chroma=-16
 	neg ecx										; -stride
-	lea edi, [esi+eax]						
+	lea edi, [esi+eax]
 	lea edi, [edi+ecx]				; last line of top-left border
 	lea ebp, [esi+ebx]
 	lea ebp, [ebp+ecx]				; last line of top-right border
@ -442,19 +442,19 @@ ExpandPictureLuma_sse2:
 	mov ecx, [esp+28]					; stride
 	imul edx, ecx							; (height+32(16)) * stride
 	lea eax, [edi+edx]						; last line of bottom-left border
-	lea ebx, [ebp+edx]						; last line of bottom-right border	
+	lea ebx, [ebp+edx]						; last line of bottom-right border
 	neg ecx										; -stride
 	; for left & right border expanding
-	exp_cross_sse2		32, a	
-	
+	exp_cross_sse2		32, a
+
 ;	sfence									; commit cache write back memory
-	
+
 	pop ebp
 	pop edi
 	pop esi
 	pop edx
 	pop ebx
-	
+
 	ret

 ALIGN 16
@ -470,7 +470,7 @@ ExpandPictureChromaAlign_sse2:
 	push esi
 	push edi
 	push ebp
-	
+
 	; for both top and bottom border
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; p_dst
@ -482,10 +482,10 @@ ExpandPictureChromaAlign_sse2:
 	mov cl, byte [esi]
 	butterfly_1to16_sse xmm3, xmm4, c		; dst, tmp, pSrc [generic register name: a/b/c/d]
 	; load top border
-	mov ecx, edx							; stride	
+	mov ecx, edx							; stride
 	neg ecx 								; -stride
 	lea edi, [esi+ecx]						; last line of top border
-	; load bottom border 
+	; load bottom border
 	dec eax									; h-1
 	imul eax, edx 							; (h-1)*stride
 	lea eax, [esi+eax]						; last line of picture pData
@ -493,16 +493,16 @@ ExpandPictureChromaAlign_sse2:
 	lea ebp, [eax+edx]						; last line of bottom border, (h-1)*stride + 16 * stride
 	; also prepare for cross border pData: bottom-left with xmm5, bottom-right xmm6
 	dec ebx									; width-1
-	lea ebx, [eax+ebx]						; dst[w-1][h-1]	
+	lea ebx, [eax+ebx]						; dst[w-1][h-1]
 ;	xor edx, edx
 	mov dl, byte [eax]						; bottom-left
 	butterfly_1to16_sse xmm5, xmm6, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
 	mov dl, byte [ebx]						; bottom-right
 	butterfly_1to16_sse xmm6, xmm4, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
-	; for top & bottom expanding	
+	; for top & bottom expanding
 	mov ebx, [esp+32]						; width
-	exp_top_bottom_sse2	16	
-	
+	exp_top_bottom_sse2	16
+
 	; for both left and right border
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; p_dst: left border pSrc
@ -514,14 +514,14 @@ ExpandPictureChromaAlign_sse2:
 	lea edi, [esi+eax]						; left border dst
 	dec ebx
 	lea ebx, [esi+ebx]						; right border pSrc, (p_dst + width - 1)
-	lea ebp, [ebx+1]						; right border dst	
+	lea ebp, [ebx+1]						; right border dst
 	; prepare for cross border pData: top-right with xmm4
 ;	xor eax, eax
 	mov al, byte [ebx]						; top-right
 	butterfly_1to16_sse xmm4, xmm0, a		; dst, tmp, pSrc [generic register name: a/b/c/d]
 	; for left & right border expanding
 	exp_left_right_sse2	16, a
-	
+
 	; for cross border [top-left, top-right, bottom-left, bottom-right]
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; p_dst
@ -531,9 +531,9 @@ ExpandPictureChromaAlign_sse2:
 	; xmm3,..,xmm6 cross pData were initialized above; perform padding as below, to be continued..
 	mov eax, -16							; chroma=-16
 	neg ecx										; -stride
-	lea edi, [esi+eax]						
+	lea edi, [esi+eax]
 	lea edi, [edi+ecx]				; last line of top-left border
-	lea ebp, [esi+ebx]				
+	lea ebp, [esi+ebx]
 	lea ebp, [ebp+ecx]				; last line of top-right border
 	mov ecx, [esp+28]						; stride
 	add edx, 16							; height+16, luma=32, chroma=16
@ -543,15 +543,15 @@ ExpandPictureChromaAlign_sse2:
 	neg ecx										; -stride
 	; for left & right border expanding
 	exp_cross_sse2		16, a
-	
+
 ;	sfence									; commit cache write back memory
-	
+
 	pop ebp
 	pop edi
 	pop esi
 	pop edx
 	pop ebx
-	
+
 	ret

 ALIGN 16
@ -567,7 +567,7 @@ ExpandPictureChromaUnalign_sse2:
 	push esi
 	push edi
 	push ebp
-	
+
 	; for both top and bottom border
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; p_dst
@ -579,10 +579,10 @@ ExpandPictureChromaUnalign_sse2:
 	mov cl, byte [esi]
 	butterfly_1to16_sse xmm3, xmm4, c		; dst, tmp, pSrc [generic register name: a/b/c/d]
 	; load top border
-	mov ecx, edx							; stride	
+	mov ecx, edx							; stride
 	neg ecx 								; -stride
 	lea edi, [esi+ecx]						; last line of top border
-	; load bottom border 
+	; load bottom border
 	dec eax									; h-1
 	imul eax, edx 							; (h-1)*stride
 	lea eax, [esi+eax]						; last line of picture pData
@ -590,16 +590,16 @@ ExpandPictureChromaUnalign_sse2:
 	lea ebp, [eax+edx]						; last line of bottom border, (h-1)*stride + 16 * stride
 	; also prepare for cross border pData: bottom-left with xmm5, bottom-right xmm6
 	dec ebx									; width-1
-	lea ebx, [eax+ebx]						; dst[w-1][h-1]	
+	lea ebx, [eax+ebx]						; dst[w-1][h-1]
 ;	xor edx, edx
 	mov dl, byte [eax]						; bottom-left
 	butterfly_1to16_sse xmm5, xmm6, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
 	mov dl, byte [ebx]						; bottom-right
 	butterfly_1to16_sse xmm6, xmm4, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
-	; for top & bottom expanding	
+	; for top & bottom expanding
 	mov ebx, [esp+32]						; width
-	exp_top_bottom_sse2	16	
-	
+	exp_top_bottom_sse2	16
+
 	; for both left and right border
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; p_dst: left border pSrc
@ -611,14 +611,14 @@ ExpandPictureChromaUnalign_sse2:
 	lea edi, [esi+eax]						; left border dst
 	dec ebx
 	lea ebx, [esi+ebx]						; right border pSrc, (p_dst + width - 1)
-	lea ebp, [ebx+1]						; right border dst	
+	lea ebp, [ebx+1]						; right border dst
 	; prepare for cross border pData: top-right with xmm4
 ;	xor eax, eax
 	mov al, byte [ebx]						; top-right
 	butterfly_1to16_sse xmm4, xmm0, a		; dst, tmp, pSrc [generic register name: a/b/c/d]
 	; for left & right border expanding
 	exp_left_right_sse2	16, u
-	
+
 	; for cross border [top-left, top-right, bottom-left, bottom-right]
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; p_dst
@ -628,9 +628,9 @@ ExpandPictureChromaUnalign_sse2:
 	; xmm3,..,xmm6 cross pData were initialized above; perform padding as below, to be continued..
 	neg ecx									; -stride
 	mov eax, -16							; chroma=-16
-	lea edi, [esi+eax]						
+	lea edi, [esi+eax]
 	lea edi, [edi+ecx]				; last line of top-left border
-	lea ebp, [esi+ebx]						
+	lea ebp, [esi+ebx]
 	lea ebp, [ebp+ecx]				; last line of top-right border
 	mov ecx, [esp+28]						; stride
 	add edx, 16							; height+16, luma=32, chroma=16
@ -640,14 +640,14 @@ ExpandPictureChromaUnalign_sse2:
 	neg ecx									; -stride
 	; for left & right border expanding
 	exp_cross_sse2		16, u
-	
+
 ;	sfence									; commit cache write back memory
-	
+
 	pop ebp
 	pop edi
 	pop esi
 	pop edx
 	pop ebx
-	
+
 	ret

--- a/codec/encoder/core/asm/intra_pred.asm
+++ b/codec/encoder/core/asm/intra_pred.asm
--- a/codec/encoder/core/asm/intra_pred_util.asm
+++ b/codec/encoder/core/asm/intra_pred_util.asm
@ -32,7 +32,7 @@
 ;*  intra_pred_util.asm
 ;*
 ;*  Abstract
-;*      mmxext/sse for WelsFillingPred8to16, WelsFillingPred8x2to16 and 
+;*      mmxext/sse for WelsFillingPred8to16, WelsFillingPred8x2to16 and
 ;*		WelsFillingPred1to16 etc.
 ;*
 ;*  History
@ -84,7 +84,7 @@ WelsFillingPred8to16_mmx:
 	movq mm0, [ecx]
 	movq [eax  ], mm0
 	movq [eax+8], mm0
-	
+
 	WELSEMMS
 	ret

@ -100,16 +100,16 @@ WelsFillingPred8x2to16_mmx:
 	movq mm1, [ecx+8]
 	movq [eax  ], mm0
 	movq [eax+8], mm1
-	
+
 	WELSEMMS

 	ret

 %macro butterfly_1to8_mmx	3	; %1 = mm? dst, %2 = mm? tmp, %3 = generic register letter (a/b/c/d) with the source byte in %3l; fills all 8 bytes of %1 with that byte (overwrites %3h and %2)
-	mov %3h, %3l	
-	movd %2, e%3x		; %2 low dword = e%3x; its low word is b0:b0 after the mov above	
-	pshufw %1, %2, 00h	; replicate word 0 into all four words: %1 = b0 b0 b0 b0, b0 b0 b0 b0	
-%endmacro 
+	mov %3h, %3l
+	movd %2, e%3x		; %2 low dword = e%3x; its low word is b0:b0 after the mov above
+	pshufw %1, %2, 00h	; replicate word 0 into all four words: %1 = b0 b0 b0 b0, b0 b0 b0 b0
+%endmacro

 ALIGN 16
 ;***********************************************************************----------------
@ -120,10 +120,10 @@ WelsFillingPred1to16_mmx:

 	mov cl, byte [esp+8]	; v
 	butterfly_1to8_mmx	mm0, mm1, c	; mm? for dst, mm? for tmp, one byte for pSrc [generic register name: a/b/c/d]
-	
+
 	movq [eax  ], mm0
 	movq [eax+8], mm0
-	
+
 	WELSEMMS

 	ret
@ -136,9 +136,9 @@ WelsFillingPred8x2to16_sse2:
 	mov eax, [esp+4]	; pred
 	mov ecx, [esp+8]	; v

-	movdqa xmm0, [ecx]	
-	movdqa [eax], xmm0	
-	
+	movdqa xmm0, [ecx]
+	movdqa [eax], xmm0
+
 	ret

 ALIGN 16
@ -150,7 +150,7 @@ WelsFillingPred1to16_sse2:

 	mov cl, byte [esp+8]	; v
 	butterfly_1to16_sse	xmm0, xmm1, c		; dst, tmp, pSrc [generic register name: a/b/c/d]
-	
+
 	movdqa [eax], xmm0
-	
+
 	ret
--- a/codec/encoder/core/asm/mb_copy.asm
+++ b/codec/encoder/core/asm/mb_copy.asm
@ -32,7 +32,7 @@
 ;*  mb_copy.asm
 ;*
 ;*  Abstract
-;*      mb_copy 
+;*      mb_copy
 ;*
 ;*
 ;*********************************************************************************************/
@ -52,9 +52,9 @@ SECTION .text
 WELS_EXTERN WelsCopy16x16_sse2
 WELS_EXTERN WelsCopy16x16NotAligned_sse2
 WELS_EXTERN WelsCopy8x8_mmx
-WELS_EXTERN WelsCopy16x8NotAligned_sse2	; 
-WELS_EXTERN WelsCopy8x16_mmx		; 
-WELS_EXTERN UpdateMbMv_sse2		; 
+WELS_EXTERN WelsCopy16x8NotAligned_sse2	;
+WELS_EXTERN WelsCopy8x16_mmx		;
+WELS_EXTERN UpdateMbMv_sse2		;

 ;***********************************************************************
 ; void WelsCopy16x16_sse2(	uint8_t* Dst,
@ -66,7 +66,7 @@ ALIGN 16
 WelsCopy16x16_sse2:
 	push esi
 	push edi
-	push ebx	
+	push ebx

 	mov edi, [esp+16]	; Dst
 	mov eax, [esp+20]	; iStrideD
@ -107,7 +107,7 @@ WelsCopy16x16_sse2:
 	movdqa xmm5, [esi+ecx]
 	movdqa xmm6, [esi+2*ecx]
 	movdqa xmm7, [esi+edx]
-	
+
 	movdqa [edi], xmm0
 	movdqa [edi+eax], xmm1
 	movdqa [edi+2*eax], xmm2
@ -116,7 +116,7 @@ WelsCopy16x16_sse2:
 	movdqa [edi], xmm4
 	movdqa [edi+eax], xmm5
 	movdqa [edi+2*eax], xmm6
-	movdqa [edi+ebx], xmm7	
+	movdqa [edi+ebx], xmm7

 	pop ebx
 	pop edi
@ -134,7 +134,7 @@ ALIGN 16
 WelsCopy16x16NotAligned_sse2:
 	push esi
 	push edi
-	push ebx	
+	push ebx

 	mov edi, [esp+16]	; Dst
 	mov eax, [esp+20]	; iStrideD
@ -175,7 +175,7 @@ WelsCopy16x16NotAligned_sse2:
 	movdqu xmm5, [esi+ecx]
 	movdqu xmm6, [esi+2*ecx]
 	movdqu xmm7, [esi+edx]
-	
+
 	movdqa [edi], xmm0
 	movdqa [edi+eax], xmm1
 	movdqa [edi+2*eax], xmm2
@ -184,8 +184,8 @@ WelsCopy16x16NotAligned_sse2:
 	movdqa [edi], xmm4
 	movdqa [edi+eax], xmm5
 	movdqa [edi+2*eax], xmm6
-	movdqa [edi+ebx], xmm7	
-	
+	movdqa [edi+ebx], xmm7
+
 	pop ebx
 	pop edi
 	pop esi
@ -202,7 +202,7 @@ ALIGN 16
 WelsCopy16x8NotAligned_sse2:
 	push esi
 	push edi
-	push ebx	
+	push ebx

 	mov edi, [esp+16]	; Dst
 	mov eax, [esp+20]	; iStrideD
@ -220,7 +220,7 @@ WelsCopy16x8NotAligned_sse2:
 	movdqu xmm4, [esi]
 	movdqu xmm5, [esi+ecx]
 	movdqu xmm6, [esi+2*ecx]
-	movdqu xmm7, [esi+edx]	
+	movdqu xmm7, [esi+edx]

 	movdqa [edi], xmm0
 	movdqa [edi+eax], xmm1
@ -231,7 +231,7 @@ WelsCopy16x8NotAligned_sse2:
 	movdqa [edi+eax], xmm5
 	movdqa [edi+2*eax], xmm6
 	movdqa [edi+ebx], xmm7
-	
+
 	pop ebx
 	pop edi
 	pop esi
@ -245,7 +245,7 @@ WelsCopy16x8NotAligned_sse2:
 ;                       int32_t  iStrideS )
 ;***********************************************************************
 ALIGN 16
-WelsCopy8x16_mmx:	
+WelsCopy8x16_mmx:
 	push ebx

 	mov eax, [esp + 8 ]           ;Dst
@ -253,60 +253,60 @@ WelsCopy8x16_mmx:
 	mov ebx, [esp + 16]           ;Src
 	mov edx, [esp + 20]           ;iStrideS

-	movq mm0, [ebx]	
-	movq mm1, [ebx+edx]	
+	movq mm0, [ebx]
+	movq mm1, [ebx+edx]
 	lea ebx, [ebx+2*edx]
-	movq mm2, [ebx]	
-	movq mm3, [ebx+edx]	
+	movq mm2, [ebx]
+	movq mm3, [ebx+edx]
 	lea ebx, [ebx+2*edx]
-	movq mm4, [ebx]	
-	movq mm5, [ebx+edx]	
+	movq mm4, [ebx]
+	movq mm5, [ebx+edx]
 	lea ebx, [ebx+2*edx]
-	movq mm6, [ebx]	
-	movq mm7, [ebx+edx]	
+	movq mm6, [ebx]
+	movq mm7, [ebx+edx]
 	lea ebx, [ebx+2*edx]
-	
-	movq [eax], mm0	
-	movq [eax+ecx], mm1	
+
+	movq [eax], mm0
+	movq [eax+ecx], mm1
 	lea eax, [eax+2*ecx]
-	movq [eax], mm2	
+	movq [eax], mm2
 	movq [eax+ecx], mm3
 	lea eax, [eax+2*ecx]
-	movq [eax], mm4	
+	movq [eax], mm4
 	movq [eax+ecx], mm5
 	lea eax, [eax+2*ecx]
-	movq [eax], mm6	
+	movq [eax], mm6
 	movq [eax+ecx], mm7
 	lea eax, [eax+2*ecx]

-	movq mm0, [ebx]	
-	movq mm1, [ebx+edx]	
+	movq mm0, [ebx]
+	movq mm1, [ebx+edx]
 	lea ebx, [ebx+2*edx]
-	movq mm2, [ebx]	
-	movq mm3, [ebx+edx]	
+	movq mm2, [ebx]
+	movq mm3, [ebx+edx]
 	lea ebx, [ebx+2*edx]
-	movq mm4, [ebx]	
-	movq mm5, [ebx+edx]	
+	movq mm4, [ebx]
+	movq mm5, [ebx+edx]
 	lea ebx, [ebx+2*edx]
-	movq mm6, [ebx]	
-	movq mm7, [ebx+edx]		
-	
-	movq [eax], mm0	
-	movq [eax+ecx], mm1	
+	movq mm6, [ebx]
+	movq mm7, [ebx+edx]
+
+	movq [eax], mm0
+	movq [eax+ecx], mm1
 	lea eax, [eax+2*ecx]
-	movq [eax], mm2	
+	movq [eax], mm2
 	movq [eax+ecx], mm3
 	lea eax, [eax+2*ecx]
-	movq [eax], mm4	
+	movq [eax], mm4
 	movq [eax+ecx], mm5
 	lea eax, [eax+2*ecx]
-	movq [eax], mm6	
-	movq [eax+ecx], mm7	
+	movq [eax], mm6
+	movq [eax+ecx], mm7

 	WELSEMMS
-	pop ebx	
+	pop ebx
 	ret
-	
+
 ;***********************************************************************
 ; void WelsCopy8x8_mmx(  uint8_t* Dst,
 ;                        int32_t  iStrideD,
@ -314,7 +314,7 @@ WelsCopy8x16_mmx:
 ;                        int32_t  iStrideS )
 ;***********************************************************************
 ALIGN 16
-WelsCopy8x8_mmx:	
+WelsCopy8x8_mmx:
 	push ebx
 	push esi
 	mov eax, [esp + 12]           ;Dst
@ -343,7 +343,7 @@ WelsCopy8x8_mmx:
 	lea esi, [esi+2*ebx]
 	movq mm6, [esi]
 	movq mm7, [esi+ebx]
-	
+
 	movq [eax], mm0
 	movq [eax+ecx], mm1
 	lea eax, [eax+2*ecx]
@ -355,12 +355,12 @@ WelsCopy8x8_mmx:
 	lea eax, [eax+2*ecx]
 	movq [eax], mm6
 	movq [eax+ecx], mm7
-		
+
 	WELSEMMS
-	pop esi	
+	pop esi
 	pop ebx
 	ret
-	
+
 ; (dunhuang@cisco), 12/21/2011
 ;***********************************************************************
 ; void UpdateMbMv_sse2( SMVUnitXY *pMvBuffer, const SMVUnitXY sMv )
@ -417,7 +417,7 @@ WELS_EXTERN PixelAvgWidthEq16_sse2
 WELS_EXTERN McCopyWidthEq4_mmx
 WELS_EXTERN McCopyWidthEq8_mmx
 WELS_EXTERN McCopyWidthEq16_sse2
-                          
+

 ALIGN 16
 ;***********************************************************************
@ -432,38 +432,38 @@ PixelAvgWidthEq8_mmx:
    push        esi
    push        edi

-    mov         edi, [esp+20]       
-    mov         esi, [esp+28]       
-    mov         edx, [esp+36]       
-    mov         ebp, [esp+24]       
-    mov         eax, [esp+32]       
-    mov         ebx, [esp+40]       
-    mov         ecx, [esp+44]       
+    mov         edi, [esp+20]
+    mov         esi, [esp+28]
+    mov         edx, [esp+36]
+    mov         ebp, [esp+24]
+    mov         eax, [esp+32]
+    mov         ebx, [esp+40]
+    mov         ecx, [esp+44]
 	sar			ecx, 2
 .height_loop:
-	movq        mm0, [esi]	
+	movq        mm0, [esi]
    pavgb       mm0, [edx]
    movq        [edi], mm0
-	movq		mm1, [esi+eax]		
+	movq		mm1, [esi+eax]
 	pavgb		mm1, [edx+ebx]
 	movq		[edi+ebp], mm1
 	lea         edi, [edi+2*ebp]
 	lea         esi, [esi+2*eax]
 	lea         edx, [edx+2*ebx]

-	movq        mm2, [esi]	
+	movq        mm2, [esi]
 	pavgb       mm2, [edx]
    movq        [edi], mm2
-	movq		mm3, [esi+eax]	
+	movq		mm3, [esi+eax]
 	pavgb		mm3, [edx+ebx]
 	movq		[edi+ebp], mm3
 	lea         edi, [edi+2*ebp]
 	lea         esi, [esi+2*eax]
 	lea         edx, [edx+2*ebx]
-	
+
 	dec         ecx
    jne         .height_loop
-	
+
 	WELSEMMS
    pop         edi
    pop         esi
@ -485,42 +485,42 @@ PixelAvgWidthEq16_sse2:
    push        esi
    push        edi

-    mov         edi, [esp+20]       
-    mov         esi, [esp+28]       
-    mov         edx, [esp+36]       
-    mov         ebp, [esp+24]       
-    mov         eax, [esp+32]       
-    mov         ebx, [esp+40]       
-    mov         ecx, [esp+44]       
+    mov         edi, [esp+20]
+    mov         esi, [esp+28]
+    mov         edx, [esp+36]
+    mov         ebp, [esp+24]
+    mov         eax, [esp+32]
+    mov         ebx, [esp+40]
+    mov         ecx, [esp+44]
 	sar			ecx, 2
 .height_loop:
 	movdqu      xmm0, [esi]
 	movdqu      xmm1, [edx]
 	movdqu      xmm2, [esi+eax]
-	movdqu      xmm3, [edx+ebx]	
+	movdqu      xmm3, [edx+ebx]
 	pavgb       xmm0, xmm1
 	pavgb       xmm2, xmm3
 	movdqu      [edi], xmm0
 	movdqu      [edi+ebp], xmm2
 	lea			edi, [edi+2*ebp]
 	lea			esi, [esi+2*eax]
-	lea			edx, [edx+2*ebx]	
+	lea			edx, [edx+2*ebx]

 	movdqu      xmm4, [esi]
 	movdqu      xmm5, [edx]
 	movdqu      xmm6, [esi+eax]
-	movdqu      xmm7, [edx+ebx]	
+	movdqu      xmm7, [edx+ebx]
 	pavgb       xmm4, xmm5
 	pavgb       xmm6, xmm7
 	movdqu      [edi], xmm4
 	movdqu      [edi+ebp], xmm6
 	lea         edi, [edi+2*ebp]
 	lea         esi, [esi+2*eax]
-    lea         edx, [edx+2*ebx]	
-    
+    lea         edx, [edx+2*ebx]
+
 	dec         ecx
 	jne         .height_loop
-	
+
    pop         edi
    pop         esi
    pop         ebx
@ -540,7 +540,7 @@ avg_w16_align_0_ssse3:
    dec    dword [esp+4]
    jg     avg_w16_align_0_ssse3
    ret
-    
+
    ALIGN 64
 avg_w16_align_1_ssse3:
    movdqa  xmm1, [ebx+16]
@ -555,7 +555,7 @@ avg_w16_align_1_ssse3:
    jg     avg_w16_align_1_ssse3
    ret

-  
+
 ALIGN 16
 ;***********************************************************************
 ; void PixelAvgWidthEq16_ssse3(uint8_t *pDst,  int32_t iDstStride,
@ -574,7 +574,7 @@ PixelAvgWidthEq16_ssse3:
    mov         ebx, [esp+28]       ; src1
    mov         ecx, [esp+36]       ; src2
    mov         esi, [esp+24]       ; i_dst_stride
-    
+
     %define avg_w16_offset (avg_w16_align_1_ssse3-avg_w16_align_0_ssse3)
    mov edx, ebx
    and edx, 0x01
@ -582,11 +582,11 @@ PixelAvgWidthEq16_ssse3:
    lea ebp, [avg_w16_offset]
    imul ebp, edx
    lea edx, [ebp+eax]
-    
-    mov eax, [esp+32]  
-    mov ebp, [esp+44] 
+
+    mov eax, [esp+32]
+    mov ebp, [esp+44]
    push ebp
-    mov ebp, [esp+44]	
+    mov ebp, [esp+44]
    and ebx, 0xfffffff0
    call edx
 	pop		   ebp
@ -607,7 +607,7 @@ McCopyWidthEq4_mmx:
    push    edi
    push    ebx

-    
+
    mov esi,  [esp+16]
    mov eax, [esp+20]
    mov edi,  [esp+24]
@ -617,12 +617,12 @@ ALIGN 4
 .height_loop:
 	mov ebx, [esi]
 	mov [edi], ebx
-	
+
 	add esi, eax
 	add edi, ecx
 	dec edx
 	jnz .height_loop
-	WELSEMMS   
+	WELSEMMS
 	pop	   ebx
    pop     edi
    pop     esi
@ -650,12 +650,12 @@ ALIGN 4
 	add edi, ecx
 	dec edx
 	jnz .height_loop
-	
-	WELSEMMS   
+
+	WELSEMMS
    pop     edi
    pop     esi
    ret
-	
+
 ALIGN 16
 ;***********************************************************************
 ;   void McCopyWidthEq16_sse2( uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride, int32_t iHeight )
@ -664,11 +664,11 @@ McCopyWidthEq16_sse2:
    push    esi
    push    edi

-    mov     esi, [esp+12]       
-    mov     eax, [esp+16]       
-    mov     edi, [esp+20]       
-    mov     edx, [esp+24]       
-    mov     ecx, [esp+28]       
+    mov     esi, [esp+12]
+    mov     eax, [esp+16]
+    mov     edi, [esp+20]
+    mov     edx, [esp+24]
+    mov     ecx, [esp+28]

 ALIGN 4
 .height_loop:
@ -681,7 +681,7 @@ ALIGN 4
    lea     esi, [esi+eax*2]
    lea     edi, [edi+edx*2]
    jnz     .height_loop
-  
+
    pop     edi
    pop     esi
    ret
--- a/codec/encoder/core/asm/mc_chroma.asm
+++ b/codec/encoder/core/asm/mc_chroma.asm
@ -1,317 +1,317 @@
-;*!
-;* \copy
-;*     Copyright (c)  2004-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  mc_chroma.asm
-;*
-;*  Abstract
-;*      mmx motion compensation for chroma
-;*
-;*  History
-;*      10/13/2004 Created
-;*
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-
-BITS 32
-
-;***********************************************************************
-; Local Data (Read Only)
-;***********************************************************************
-
-SECTION .rodata align=16
-
-;***********************************************************************
-; Various memory constants (trigonometric values or rounding values)
-;***********************************************************************
-
-ALIGN 16
-h264_d0x20_sse2:
-	dw 32,32,32,32,32,32,32,32
-ALIGN 16
-h264_d0x20_mmx:
-	dw 32,32,32,32
-
-
-;=============================================================================
-; Code
-;=============================================================================
-
-SECTION .text
-
-ALIGN 16
-;*******************************************************************************
-; void McChromaWidthEq4_mmx( uint8_t *src, 
-;							int32_t iSrcStride, 
-;							uint8_t *pDst, 
-;							int32_t iDstStride, 
-;							uint8_t *pABCD, 
-;							int32_t iHeigh );
-;*******************************************************************************
-WELS_EXTERN McChromaWidthEq4_mmx
-McChromaWidthEq4_mmx:
-	push esi
-	push edi
-	push ebx
-	
-	mov eax, [esp +12 + 20]
-	movd mm3, [eax]
-	WELS_Zero mm7
-	punpcklbw mm3, mm3
-	movq      mm4, mm3
-	punpcklwd mm3, mm3       
-	punpckhwd mm4, mm4		 
-	
-	movq	  mm5, mm3
-	punpcklbw mm3, mm7
-	punpckhbw mm5, mm7
-	
-	movq	  mm6, mm4
-	punpcklbw mm4, mm7
-	punpckhbw mm6, mm7
-	
-	mov esi, [esp +12+ 4]   
-	mov eax, [esp + 12 + 8]   
-	mov edi, [esp + 12 + 12]  
-	mov edx, [esp + 12 + 16]  
-    mov ecx, [esp + 12 + 24]   
-		
-	lea ebx, [esi + eax]
-	movd mm0, [esi]
-	movd mm1, [esi+1]
-	punpcklbw mm0, mm7
-	punpcklbw mm1, mm7
-.xloop:
-	
-	pmullw mm0, mm3
-	pmullw mm1, mm5
-	paddw  mm0, mm1
-	
-	movd  mm1, [ebx]
-	punpcklbw mm1, mm7
-	movq mm2, mm1
-	pmullw mm1, mm4
-	paddw mm0, mm1
-	
-	movd mm1, [ebx+1]
-	punpcklbw mm1, mm7
-	movq mm7, mm1
-	pmullw mm1,mm6
-	paddw mm0, mm1
-	movq mm1,mm7
-
-	paddw mm0, [h264_d0x20_mmx]
-	psrlw mm0, 6
-	
-	WELS_Zero mm7
-	packuswb mm0, mm7
-	movd [edi], mm0	
-
-	movq mm0, mm2
-	
-	lea edi, [edi +edx  ]
-	lea ebx, [ebx + eax]
-
-	dec ecx
-	jnz near .xloop
-	WELSEMMS
-	pop ebx
-	pop edi
-	pop esi
-	ret
-
-
-ALIGN 16
-;*******************************************************************************
-; void McChromaWidthEq8_sse2( uint8_t *pSrc, 
-;						int32_t iSrcStride, 
-;						uint8_t *pDst, 
-;						int32_t iDstStride, 
-;						uint8_t *pABCD, 
-;						int32_t iheigh );
-;*******************************************************************************
-WELS_EXTERN McChromaWidthEq8_sse2
-McChromaWidthEq8_sse2:
-	push esi
-	push edi
-	push ebx
-	
-	mov eax, [esp +12 + 20]
-	movd xmm3, [eax]
-	WELS_Zero xmm7
-	punpcklbw  xmm3, xmm3
-	punpcklwd  xmm3, xmm3
-	
-	movdqa	   xmm4, xmm3
-	punpckldq  xmm3, xmm3
-	punpckhdq  xmm4, xmm4
-	movdqa     xmm5, xmm3
-	movdqa	   xmm6, xmm4
-	
-	punpcklbw  xmm3, xmm7
-	punpckhbw  xmm5, xmm7
-	punpcklbw  xmm4, xmm7
-	punpckhbw  xmm6, xmm7
-	
-	mov esi, [esp +12+ 4]   
-	mov eax, [esp + 12 + 8]   
-	mov edi, [esp + 12 + 12]  
-	mov edx, [esp + 12 + 16]  
-    mov ecx, [esp + 12 + 24]   
-		
-	lea ebx, [esi + eax]
-	movq xmm0, [esi]
-	movq xmm1, [esi+1]
-	punpcklbw xmm0, xmm7
-	punpcklbw xmm1, xmm7
-.xloop:
-	
-	pmullw xmm0, xmm3
-	pmullw xmm1, xmm5
-	paddw  xmm0, xmm1
-	
-	movq  xmm1, [ebx]
-	punpcklbw xmm1, xmm7
-	movdqa xmm2, xmm1
-	pmullw xmm1, xmm4
-	paddw xmm0, xmm1
-	
-	movq xmm1, [ebx+1]
-	punpcklbw xmm1, xmm7
-	movdqa xmm7, xmm1
-	pmullw xmm1, xmm6
-	paddw xmm0, xmm1
-	movdqa xmm1,xmm7
-
-	paddw xmm0, [h264_d0x20_sse2]
-	psrlw xmm0, 6
-	
-	WELS_Zero xmm7
-	packuswb xmm0, xmm7
-	movq [edi], xmm0	
-
-	movdqa xmm0, xmm2
-	
-	lea edi, [edi +edx  ]
-	lea ebx, [ebx + eax]
-
-	dec ecx
-	jnz near .xloop
-	
-	pop ebx
-	pop edi
-	pop esi
-	ret
-
-
-
-
-ALIGN 16
-;***********************************************************************
-; void McChromaWidthEq8_ssse3( uint8_t *pSrc,
-;						 int32_t iSrcStride, 
-;                        uint8_t *pDst,  
-;                        int32_t iDstStride,
-;                        uint8_t *pABCD,
-;					     int32_t iHeigh);
-;***********************************************************************
-WELS_EXTERN McChromaWidthEq8_ssse3
-McChromaWidthEq8_ssse3:
-	push ebx
-	push esi
-	push edi
-		
-	mov eax, [esp + 12 + 20]
-
-    pxor      xmm7, xmm7
-    movd   xmm5, [eax]   
-    punpcklwd xmm5, xmm5  
-    punpckldq xmm5, xmm5 
-    movdqa    xmm6, xmm5
-    punpcklqdq xmm5, xmm5
-    punpckhqdq xmm6, xmm6    
-    
-	mov eax, [esp + 12 + 4]   
-	mov edx, [esp + 12 + 8]   
-	mov esi, [esp + 12 + 12]  
-	mov edi, [esp + 12 + 16]  
-    mov ecx, [esp + 12 + 24]   
-    
-    sub esi, edi
-    sub esi, edi
-	movdqa xmm7, [h264_d0x20_sse2]
-
-	movdqu xmm0, [eax]
-	movdqa xmm1, xmm0
-	psrldq xmm1, 1
-	punpcklbw xmm0, xmm1
-	
-.hloop_chroma:	
-	lea	esi, [esi+2*edi]
-	
-	movdqu xmm2, [eax+edx]
-	movdqa xmm3, xmm2
-	psrldq xmm3, 1
-	punpcklbw xmm2, xmm3
-	movdqa      xmm4, xmm2
-	
-    pmaddubsw  xmm0, xmm5
-    pmaddubsw  xmm2, xmm6
-    paddw      xmm0, xmm2
-    paddw      xmm0, xmm7
-	psrlw      xmm0, 6
-    packuswb   xmm0, xmm0
-    movq       [esi],xmm0	
-    
-    lea eax, [eax+2*edx]
-    movdqu xmm2, [eax]
-    movdqa xmm3, xmm2
-    psrldq xmm3, 1
-    punpcklbw xmm2, xmm3
-    movdqa      xmm0, xmm2
-    
-    pmaddubsw  xmm4, xmm5
-    pmaddubsw  xmm2, xmm6
-    paddw      xmm4, xmm2
-    paddw      xmm4, xmm7
-	psrlw      xmm4, 6
-    packuswb   xmm4, xmm4
-    movq       [esi+edi],xmm4	
-	
-	sub ecx, 2
-	jnz .hloop_chroma
-	pop edi
-	pop esi
-	pop ebx
-
-	ret
-
-
+;*!
+;* \copy
+;*     Copyright (c)  2004-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  mc_chroma.asm
+;*
+;*  Abstract
+;*      mmx motion compensation for chroma
+;*
+;*  History
+;*      10/13/2004 Created
+;*
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+
+BITS 32
+
+;***********************************************************************
+; Local Data (Read Only)
+;***********************************************************************
+
+SECTION .rodata align=16
+
+;***********************************************************************
+; Rounding constants: 32 is added before the >> 6 in the chroma filters below
+;***********************************************************************
+
+ALIGN 16
+h264_d0x20_sse2:
+	dw 32,32,32,32,32,32,32,32
+ALIGN 16
+h264_d0x20_mmx:
+	dw 32,32,32,32
+
+
+;=============================================================================
+; Code
+;=============================================================================
+
+SECTION .text
+
+ALIGN 16
+;*******************************************************************************
+; void McChromaWidthEq4_mmx( uint8_t *src,		; source, top-left sample of the block
+;							int32_t iSrcStride,	; byte step between source rows
+;							uint8_t *pDst,		; destination, 4 bytes written per row
+;							int32_t iDstStride,	; byte step between destination rows
+;							uint8_t *pABCD,		; 4 byte weights A,B,C,D (see formula on the label)
+;							int32_t iHeight );	; rows to produce; loop is do-while, so must be > 0
+;*******************************************************************************
+WELS_EXTERN McChromaWidthEq4_mmx
+McChromaWidthEq4_mmx:	; dst[x] = (A*s[x] + B*s[x+1] + C*s[x+stride] + D*s[x+stride+1] + 32) >> 6, x = 0..3
+	push esi
+	push edi
+	push ebx
+	; arguments now live at [esp + 12 + 4*n] because of the three pushes above
+	mov eax, [esp +12 + 20]	; eax = pABCD
+	movd mm3, [eax]	; low 4 bytes of mm3 = A B C D
+	WELS_Zero mm7	; macro from asm_inc.asm, presumably clears mm7; used as the zero half for byte->word unpacking
+	punpcklbw mm3, mm3	; bytes: A A B B C C D D
+	movq      mm4, mm3
+	punpcklwd mm3, mm3	; bytes: A A A A B B B B
+	punpckhwd mm4, mm4	; bytes: C C C C D D D D
+
+	movq	  mm5, mm3
+	punpcklbw mm3, mm7	; mm3 = four words of A
+	punpckhbw mm5, mm7	; mm5 = four words of B
+
+	movq	  mm6, mm4
+	punpcklbw mm4, mm7	; mm4 = four words of C
+	punpckhbw mm6, mm7	; mm6 = four words of D
+
+	mov esi, [esp +12+ 4]	; esi = src
+	mov eax, [esp + 12 + 8]	; eax = iSrcStride
+	mov edi, [esp + 12 + 12]	; edi = pDst
+	mov edx, [esp + 12 + 16]	; edx = iDstStride
+    mov ecx, [esp + 12 + 24]	; ecx = row counter
+	; prime the pipeline with the first source row, widened to words
+	lea ebx, [esi + eax]	; ebx -> source row below the current one
+	movd mm0, [esi]	; upper row, samples x..x+3
+	movd mm1, [esi+1]	; upper row, samples x+1..x+4
+	punpcklbw mm0, mm7
+	punpcklbw mm1, mm7
+.xloop:
+	; on entry: mm0/mm1 = upper row (x and x+1 samples) as words
+	pmullw mm0, mm3	; A * upper[x]
+	pmullw mm1, mm5	; B * upper[x+1]
+	paddw  mm0, mm1
+
+	movd  mm1, [ebx]	; lower row, samples x..x+3
+	punpcklbw mm1, mm7
+	movq mm2, mm1	; keep unscaled copy: it is the next iteration's upper[x]
+	pmullw mm1, mm4	; C * lower[x]
+	paddw mm0, mm1
+
+	movd mm1, [ebx+1]	; lower row, samples x+1..x+4
+	punpcklbw mm1, mm7
+	movq mm7, mm1	; borrow mm7 as scratch for the unscaled copy (zero is restored below)
+	pmullw mm1,mm6	; D * lower[x+1]
+	paddw mm0, mm1
+	movq mm1,mm7	; mm1 = unscaled lower[x+1], i.e. next iteration's upper[x+1]
+
+	paddw mm0, [h264_d0x20_mmx]	; + 32 (rounding)
+	psrlw mm0, 6	; >> 6
+
+	WELS_Zero mm7	; mm7 back to zero for packing and for the next iteration's unpacks
+	packuswb mm0, mm7	; words -> unsigned-saturated bytes, result in the low 4 bytes
+	movd [edi], mm0	; store 4 output pixels
+
+	movq mm0, mm2	; lower row becomes the upper row
+
+	lea edi, [edi +edx  ]	; next destination row
+	lea ebx, [ebx + eax]	; next source row
+
+	dec ecx
+	jnz near .xloop
+	WELSEMMS	; macro from asm_inc.asm, presumably emms so x87 code can run after the MMX use
+	pop ebx
+	pop edi
+	pop esi
+	ret
+
+
+ALIGN 16
+;*******************************************************************************
+; void McChromaWidthEq8_sse2( uint8_t *pSrc,		; source, top-left sample of the block
+;						int32_t iSrcStride,	; byte step between source rows
+;						uint8_t *pDst,		; destination, 8 bytes written per row
+;						int32_t iDstStride,	; byte step between destination rows
+;						uint8_t *pABCD,		; 4 byte weights A,B,C,D (see formula on the label)
+;						int32_t iHeight );	; rows to produce; loop is do-while, so must be > 0
+;*******************************************************************************
+WELS_EXTERN McChromaWidthEq8_sse2
+McChromaWidthEq8_sse2:	; dst[x] = (A*s[x] + B*s[x+1] + C*s[x+stride] + D*s[x+stride+1] + 32) >> 6, x = 0..7
+	push esi
+	push edi
+	push ebx
+	; arguments now live at [esp + 12 + 4*n] because of the three pushes above
+	mov eax, [esp +12 + 20]	; eax = pABCD
+	movd xmm3, [eax]	; low 4 bytes of xmm3 = A B C D
+	WELS_Zero xmm7	; macro from asm_inc.asm, presumably clears xmm7; used as the zero half for byte->word unpacking
+	punpcklbw  xmm3, xmm3	; low bytes: A A B B C C D D
+	punpcklwd  xmm3, xmm3	; bytes: A x4, B x4, C x4, D x4
+
+	movdqa	   xmm4, xmm3
+	punpckldq  xmm3, xmm3	; bytes: A x8, B x8
+	punpckhdq  xmm4, xmm4	; bytes: C x8, D x8
+	movdqa     xmm5, xmm3
+	movdqa	   xmm6, xmm4
+
+	punpcklbw  xmm3, xmm7	; xmm3 = eight words of A
+	punpckhbw  xmm5, xmm7	; xmm5 = eight words of B
+	punpcklbw  xmm4, xmm7	; xmm4 = eight words of C
+	punpckhbw  xmm6, xmm7	; xmm6 = eight words of D
+
+	mov esi, [esp +12+ 4]	; esi = pSrc
+	mov eax, [esp + 12 + 8]	; eax = iSrcStride
+	mov edi, [esp + 12 + 12]	; edi = pDst
+	mov edx, [esp + 12 + 16]	; edx = iDstStride
+    mov ecx, [esp + 12 + 24]	; ecx = row counter
+	; prime the pipeline with the first source row, widened to words
+	lea ebx, [esi + eax]	; ebx -> source row below the current one
+	movq xmm0, [esi]	; upper row, samples x..x+7
+	movq xmm1, [esi+1]	; upper row, samples x+1..x+8
+	punpcklbw xmm0, xmm7
+	punpcklbw xmm1, xmm7
+.xloop:
+	; on entry: xmm0/xmm1 = upper row (x and x+1 samples) as words
+	pmullw xmm0, xmm3	; A * upper[x]
+	pmullw xmm1, xmm5	; B * upper[x+1]
+	paddw  xmm0, xmm1
+
+	movq  xmm1, [ebx]	; lower row, samples x..x+7
+	punpcklbw xmm1, xmm7
+	movdqa xmm2, xmm1	; keep unscaled copy: it is the next iteration's upper[x]
+	pmullw xmm1, xmm4	; C * lower[x]
+	paddw xmm0, xmm1
+
+	movq xmm1, [ebx+1]	; lower row, samples x+1..x+8
+	punpcklbw xmm1, xmm7
+	movdqa xmm7, xmm1	; borrow xmm7 as scratch for the unscaled copy (zero is restored below)
+	pmullw xmm1, xmm6	; D * lower[x+1]
+	paddw xmm0, xmm1
+	movdqa xmm1,xmm7	; xmm1 = unscaled lower[x+1], i.e. next iteration's upper[x+1]
+
+	paddw xmm0, [h264_d0x20_sse2]	; + 32 (rounding); the constant is declared after ALIGN 16 in .rodata
+	psrlw xmm0, 6	; >> 6
+
+	WELS_Zero xmm7	; xmm7 back to zero for packing and for the next iteration's unpacks
+	packuswb xmm0, xmm7	; words -> unsigned-saturated bytes, result in the low 8 bytes
+	movq [edi], xmm0	; store 8 output pixels
+
+	movdqa xmm0, xmm2	; lower row becomes the upper row
+
+	lea edi, [edi +edx  ]	; next destination row
+	lea ebx, [ebx + eax]	; next source row
+
+	dec ecx
+	jnz near .xloop
+
+	pop ebx
+	pop edi
+	pop esi
+	ret
+
+
+
+
+ALIGN 16
+;***********************************************************************
+; void McChromaWidthEq8_ssse3( uint8_t *pSrc,		; source, top-left sample of the block
+;						 int32_t iSrcStride,	; byte step between source rows
+;                        uint8_t *pDst,			; destination, 8 bytes written per row
+;                        int32_t iDstStride,	; byte step between destination rows
+;                        uint8_t *pABCD,		; 4 byte weights A,B,C,D (see formula on the label)
+;					     int32_t iHeight);		; rows to produce; two per iteration, so must be even and > 0
+;***********************************************************************
+WELS_EXTERN McChromaWidthEq8_ssse3
+McChromaWidthEq8_ssse3:	; dst[x] = (A*s[x] + B*s[x+1] + C*s[x+stride] + D*s[x+stride+1] + 32) >> 6, x = 0..7
+	push ebx	; saved and restored, although ebx is not otherwise used in this routine
+	push esi
+	push edi
+	; arguments now live at [esp + 12 + 4*n] because of the three pushes above
+	mov eax, [esp + 12 + 20]	; eax = pABCD
+
+    pxor      xmm7, xmm7	; value is never read: xmm7 is reloaded with the rounding constant below
+    movd   xmm5, [eax]	; low 4 bytes of xmm5 = A B C D
+    punpcklwd xmm5, xmm5	; words: AB AB CD CD
+    punpckldq xmm5, xmm5	; low qword = AB x4, high qword = CD x4
+    movdqa    xmm6, xmm5
+    punpcklqdq xmm5, xmm5	; xmm5 = (A,B) byte pair x8, weights for the upper row
+    punpckhqdq xmm6, xmm6	; xmm6 = (C,D) byte pair x8, weights for the lower row
+
+	mov eax, [esp + 12 + 4]	; eax = pSrc
+	mov edx, [esp + 12 + 8]	; edx = iSrcStride
+	mov esi, [esp + 12 + 12]	; esi = pDst
+	mov edi, [esp + 12 + 16]	; edi = iDstStride
+    mov ecx, [esp + 12 + 24]	; ecx = row counter
+	; pre-bias pDst by two rows: the loop advances esi by 2*stride before its first store
+    sub esi, edi
+    sub esi, edi
+	movdqa xmm7, [h264_d0x20_sse2]	; xmm7 = eight words of 32 (rounding)
+	; prime the pipeline: interleave row 0 as byte pairs (s[x], s[x+1])
+	movdqu xmm0, [eax]	; unaligned 16-byte load of row 0 (only the low 9 bytes are consumed)
+	movdqa xmm1, xmm0
+	psrldq xmm1, 1	; row 0 shifted by one sample
+	punpcklbw xmm0, xmm1	; xmm0 = (s[x], s[x+1]) pairs for x = 0..7
+
+.hloop_chroma:
+	lea	esi, [esi+2*edi]	; esi -> first of the two destination rows written this iteration
+	; first output row: xmm0 = row above (from previous step), load the row below
+	movdqu xmm2, [eax+edx]
+	movdqa xmm3, xmm2
+	psrldq xmm3, 1
+	punpcklbw xmm2, xmm3	; interleaved pairs of the lower row
+	movdqa      xmm4, xmm2	; keep a copy: it is the upper row of the second output row
+	; pmaddubsw: unsigned bytes of the destination times signed bytes of the source, adjacent products summed
+    pmaddubsw  xmm0, xmm5	; A*s[x] + B*s[x+1]
+    pmaddubsw  xmm2, xmm6	; C*s[x+stride] + D*s[x+stride+1]
+    paddw      xmm0, xmm2
+    paddw      xmm0, xmm7	; + 32
+	psrlw      xmm0, 6	; >> 6
+    packuswb   xmm0, xmm0	; words -> unsigned-saturated bytes
+    movq       [esi],xmm0	; store 8 output pixels
+	; second output row: xmm4 = row above, load the next source row
+    lea eax, [eax+2*edx]	; advance source by two rows
+    movdqu xmm2, [eax]
+    movdqa xmm3, xmm2
+    psrldq xmm3, 1
+    punpcklbw xmm2, xmm3	; interleaved pairs of the new lower row
+    movdqa      xmm0, xmm2	; carried into the next iteration as its upper row
+
+    pmaddubsw  xmm4, xmm5	; A*s[x] + B*s[x+1]
+    pmaddubsw  xmm2, xmm6	; C*s[x+stride] + D*s[x+stride+1]
+    paddw      xmm4, xmm2
+    paddw      xmm4, xmm7	; + 32
+	psrlw      xmm4, 6	; >> 6
+    packuswb   xmm4, xmm4
+    movq       [esi+edi],xmm4	; store 8 output pixels one destination row further down
+
+	sub ecx, 2	; two rows done; jnz exits only on exactly 0, hence the even-height requirement
+	jnz .hloop_chroma
+	pop edi
+	pop esi
+	pop ebx
+
+	ret
+
+
--- a/codec/encoder/core/asm/mc_luma.asm
+++ b/codec/encoder/core/asm/mc_luma.asm
@ -91,29 +91,29 @@ WELS_EXTERN McHorVer20WidthEq16_sse2

 ALIGN 16
 ;***********************************************************************
-; void McHorVer20WidthEq16_sse2(  uint8_t *pSrc, 
-;								int32_t iSrcStride, 
-;								uint8_t *pDst, 
-;								int32_t iDstStride, 
+; void McHorVer20WidthEq16_sse2(  uint8_t *pSrc,
+;								int32_t iSrcStride,
+;								uint8_t *pDst,
+;								int32_t iDstStride,
 ;								int32_t iHeight,
 ;                      );
 ;***********************************************************************
 McHorVer20WidthEq16_sse2:
 	push	esi
 	push	edi
-	

-	mov esi, [esp + 12]         
-	mov eax, [esp + 16]         
-	mov edi, [esp + 20]         
-	mov ecx, [esp + 28]         
-	mov edx, [esp + 24]			
-	sub esi, 2                  
-	
+
+	mov esi, [esp + 12]
+	mov eax, [esp + 16]
+	mov edi, [esp + 20]
+	mov ecx, [esp + 28]
+	mov edx, [esp + 24]
+	sub esi, 2
+
 	WELS_Zero  xmm7
 	movdqa xmm6, [h264_w0x10_1]
 .y_loop:
-	
+
 	movq xmm0, [esi]
 	punpcklbw xmm0, xmm7
 	movq xmm1, [esi+5]
@ -126,7 +126,7 @@ McHorVer20WidthEq16_sse2:
 	punpcklbw xmm4, xmm7
 	movq xmm5, [esi+3]
 	punpcklbw xmm5, xmm7
-	
+
 	paddw xmm2, xmm3
 	paddw xmm4, xmm5
 	psllw xmm4, 2
@ -152,7 +152,7 @@ McHorVer20WidthEq16_sse2:
 	punpcklbw xmm4, xmm7
 	movq xmm5, [esi+3+8]
 	punpcklbw xmm5, xmm7
-	
+
 	paddw xmm2, xmm3
 	paddw xmm4, xmm5
 	psllw xmm4, 2
@ -165,8 +165,8 @@ McHorVer20WidthEq16_sse2:
 	psraw xmm0, 5
 	packuswb xmm0, xmm7
 	movq [edi+8], xmm0
-	
-	
+
+
 	add esi, eax
 	add edi, edx
 	dec ecx
@ -178,9 +178,9 @@ McHorVer20WidthEq16_sse2:

 ALIGN 16
 ;***********************************************************************
-; void McHorVer22Width8HorFirst_sse2( uint8_t*pSrc, 
-;									int32_t iSrcStride, 
-;									uint8_t* pTap,	
+; void McHorVer22Width8HorFirst_sse2( uint8_t*pSrc,
+;									int32_t iSrcStride,
+;									uint8_t* pTap,
 ;									int32_t iTapStride,
 ;									int32_t iHeight);
 ;***********************************************************************
@ -193,11 +193,11 @@ McHorVer22Width8HorFirst_sse2:
 	mov edi, [esp+24]		;tap
 	mov edx, [esp+28]	;tap_stride
 	mov ebx, [esp+32]	;i_height
-	pxor xmm7, xmm7	
-	
+	pxor xmm7, xmm7
+
 	sub esi, eax				;;;;;;;;need more 5 lines.
 	sub esi, eax
-		
+
 .yloop_width_8:
 	movq xmm0, [esi]
 	punpcklbw xmm0, xmm7
@ -211,7 +211,7 @@ McHorVer22Width8HorFirst_sse2:
 	punpcklbw xmm4, xmm7
 	movq xmm5, [esi+3]
 	punpcklbw xmm5, xmm7
-	
+
 	paddw xmm2, xmm3
 	paddw xmm4, xmm5
 	psllw xmm4, 2
@ -221,7 +221,7 @@ McHorVer22Width8HorFirst_sse2:
 	psllw xmm4, 2
 	paddw xmm0, xmm4
 	movdqa [edi], xmm0
-		
+
 	add esi, eax
 	add edi, edx
 	dec ebx
@ -230,30 +230,30 @@ McHorVer22Width8HorFirst_sse2:
 	pop edi
 	pop esi
 	ret
-	
+
 ;***********************************************************************
-; void McHorVer02WidthEq8_sse2( uint8_t *pSrc, 
-;                       int32_t iSrcStride, 
-;                       uint8_t *pDst, 
-;                       int32_t iDstStride, 
+; void McHorVer02WidthEq8_sse2( uint8_t *pSrc,
+;                       int32_t iSrcStride,
+;                       uint8_t *pDst,
+;                       int32_t iDstStride,
 ;                       int32_t iHeight )
 ;***********************************************************************
 ALIGN 16
 McHorVer02WidthEq8_sse2:
 	push esi
 	push edi
-	
-	mov esi, [esp + 12]           
-	mov edx, [esp + 16]	          
-	mov edi, [esp + 20]           
-	mov eax, [esp + 24]           
-	mov ecx, [esp + 28]           
+
+	mov esi, [esp + 12]
+	mov edx, [esp + 16]
+	mov edi, [esp + 20]
+	mov eax, [esp + 24]
+	mov ecx, [esp + 28]

 	sub esi, edx
 	sub esi, edx

 	WELS_Zero xmm7
-			
+
 	SSE_LOAD_8P xmm0, xmm7, [esi]
 	SSE_LOAD_8P xmm1, xmm7, [esi+edx]
 	lea esi, [esi+2*edx]
@ -262,8 +262,8 @@ McHorVer02WidthEq8_sse2:
 	lea esi, [esi+2*edx]
 	SSE_LOAD_8P xmm4, xmm7, [esi]
 	SSE_LOAD_8P xmm5, xmm7, [esi+edx]
-	
-.start:	
+
+.start:
 	FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
 	dec ecx
 	jz near .xx_exit
@ -273,7 +273,7 @@ McHorVer02WidthEq8_sse2:
 	FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [edi+eax]
 	dec ecx
 	jz near .xx_exit
-	
+
 	lea edi, [edi+2*eax]
 	SSE_LOAD_8P xmm7, xmm0, [esi+edx]
 	FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [edi]
@ -356,11 +356,11 @@ WELS_EXTERN McHorVer22HorFirst_sse2


 ;***********************************************************************
-; void McHorVer02_sse2(	uint8_t *pSrc, 
-;                       int32_t iSrcStride, 
-;                       uint8_t *pDst, 
+; void McHorVer02_sse2(	uint8_t *pSrc,
+;                       int32_t iSrcStride,
+;                       uint8_t *pDst,
 ;                       int32_t iDstStride,
-;						int32_t iWidth, 
+;						int32_t iWidth,
 ;                       int32_t iHeight )
 ;***********************************************************************
 ALIGN 16
@ -368,19 +368,19 @@ McHorVer02_sse2:
 	push esi
 	push edi
 	push ebx
-	
-	mov esi, [esp + 16]           
-	mov edx, [esp + 20]	          
-	mov edi, [esp + 24]           
-	mov eax, [esp + 28]           
-	mov ecx, [esp + 36]           
-	mov ebx, [esp + 32]			  
+
+	mov esi, [esp + 16]
+	mov edx, [esp + 20]
+	mov edi, [esp + 24]
+	mov eax, [esp + 28]
+	mov ecx, [esp + 36]
+	mov ebx, [esp + 32]
 	shr ebx, 3
 	sub esi, edx
 	sub esi, edx
-	
-.xloop:	
-	WELS_Zero xmm7			
+
+.xloop:
+	WELS_Zero xmm7
 	SSE_LOAD_8P xmm0, xmm7, [esi]
 	SSE_LOAD_8P xmm1, xmm7, [esi+edx]
 	lea esi, [esi+2*edx]
@ -389,7 +389,7 @@ McHorVer02_sse2:
 	lea esi, [esi+2*edx]
 	SSE_LOAD_8P xmm4, xmm7, [esi]
 	SSE_LOAD_8P xmm5, xmm7, [esi+edx]
-	
+
 	FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
 	dec ecx
 	lea esi, [esi+2*edx]
@ -402,8 +402,8 @@ McHorVer02_sse2:
 	movdqa xmm5,xmm6
 	add edi, eax
 	sub esi, edx
-	
-.start:	
+
+.start:
 	FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
 	dec ecx
 	jz near .x_loop_dec
@ -413,7 +413,7 @@ McHorVer02_sse2:
 	FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [edi+eax]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea edi, [edi+2*eax]
 	SSE_LOAD_8P xmm7, xmm0, [esi+edx]
 	FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [edi]
@ -454,16 +454,16 @@ McHorVer02_sse2:
 	SSE_LOAD_8P xmm5, xmm6, [esi+edx]
 	jmp near .start

-.x_loop_dec:	
+.x_loop_dec:
 	dec ebx
 	jz  near .xx_exit
-	mov esi, [esp + 16]           
-	mov edi, [esp + 24]           
+	mov esi, [esp + 16]
+	mov edi, [esp + 24]
 	sub esi, edx
 	sub esi, edx
 	add esi, 8
 	add edi, 8
-	mov ecx, [esp + 36] 
+	mov ecx, [esp + 36]
 	jmp near .xloop

 .xx_exit:
@ -473,12 +473,12 @@ McHorVer02_sse2:
 	ret


-ALIGN 16                  
+ALIGN 16
 ;***********************************************************************
-; void McHorVer20_sse2(		uint8_t *pSrc, 
-;                       int32_t iSrcStride, 
-;						uint8_t *pDst, 
-;						int32_t iDstStride, 
+; void McHorVer20_sse2(		uint8_t *pSrc,
+;                       int32_t iSrcStride,
+;						uint8_t *pDst,
+;						int32_t iDstStride,
 ;						int32_t iWidth,
 ;						int32_t iHeight
 ;                      );
@ -487,19 +487,19 @@ McHorVer20_sse2:
 	push esi
 	push edi
 	push ebx
-	mov esi, [esp+16]     
-	mov eax, [esp+20]	
-	mov edi, [esp+24]	
-	mov edx, [esp+28]	
-	mov ecx, [esp+32]	
-	mov ebx, [esp+36]	
+	mov esi, [esp+16]
+	mov eax, [esp+20]
+	mov edi, [esp+24]
+	mov edx, [esp+28]
+	mov ecx, [esp+32]
+	mov ebx, [esp+36]
 	sub esi, 2
-	pxor xmm7, xmm7	
-	
+	pxor xmm7, xmm7
+
 	cmp ecx, 9
-	jne near .width_17	
-	
-.yloop_width_9:	
+	jne near .width_17
+
+.yloop_width_9:
 	movq xmm0, [esi]
 	punpcklbw xmm0, xmm7
 	movq xmm1, [esi+5]
@ -512,7 +512,7 @@ McHorVer20_sse2:
 	punpcklbw xmm4, xmm7
 	movq xmm5, [esi+3]
 	punpcklbw xmm5, xmm7
-	
+
 	movdqa xmm7, xmm2
 	paddw   xmm7, xmm3
 	movdqa xmm6, xmm4
@ -526,12 +526,12 @@ McHorVer20_sse2:
 	paddw xmm0, [h264_w0x10_1]
 	psraw  xmm0, 5
 	packuswb xmm0, xmm0
-	movd [edi], xmm0	
-	
+	movd [edi], xmm0
+
 	pxor  xmm7, xmm7
 	movq xmm0, [esi+6]
 	punpcklbw xmm0, xmm7
-	
+
 	paddw xmm4, xmm1
 	paddw xmm5, xmm3
 	psllw xmm5, 2
@ -543,8 +543,8 @@ McHorVer20_sse2:
 	paddw xmm2, [h264_w0x10_1]
 	psraw  xmm2, 5
 	packuswb xmm2, xmm2
-	movq [edi+1], xmm2	
-		
+	movq [edi+1], xmm2
+
 	add esi, eax
 	add edi, edx
 	dec ebx
@ -553,8 +553,8 @@ McHorVer20_sse2:
 	pop edi
 	pop esi
 	ret
-	
-	
+
+
 .width_17:
 .yloop_width_17:
 	movq xmm0, [esi]
@ -569,7 +569,7 @@ McHorVer20_sse2:
 	punpcklbw xmm4, xmm7
 	movq xmm5, [esi+3]
 	punpcklbw xmm5, xmm7
-	
+
 	paddw xmm2, xmm3
 	paddw xmm4, xmm5
 	psllw xmm4, 2
@ -582,7 +582,7 @@ McHorVer20_sse2:
 	psraw  xmm0, 5
 	packuswb xmm0, xmm0
 	movq [edi], xmm0
-		
+
 	movq xmm0, [esi+8]
 	punpcklbw xmm0, xmm7
 	movq xmm1, [esi+5+8]
@ -595,7 +595,7 @@ McHorVer20_sse2:
 	punpcklbw xmm4, xmm7
 	movq xmm5, [esi+3+8]
 	punpcklbw xmm5, xmm7
-	
+
 	movdqa xmm7, xmm2
 	paddw   xmm7, xmm3
 	movdqa xmm6, xmm4
@ -610,12 +610,12 @@ McHorVer20_sse2:
 	psraw  xmm0, 5
 	packuswb xmm0, xmm0
 	movd [edi+8], xmm0
-	
-	
+
+
 	pxor  xmm7, xmm7
 	movq xmm0, [esi+6+8]
 	punpcklbw xmm0, xmm7
-	
+
 	paddw xmm4, xmm1
 	paddw xmm5, xmm3
 	psllw xmm5, 2
@ -627,7 +627,7 @@ McHorVer20_sse2:
 	paddw xmm2, [h264_w0x10_1]
 	psraw  xmm2, 5
 	packuswb xmm2, xmm2
-	movq [edi+9], xmm2		
+	movq [edi+9], xmm2
 	add esi, eax
 	add edi, edx
 	dec ebx
@ -636,14 +636,14 @@ McHorVer20_sse2:
 	pop edi
 	pop esi
 	ret
-	
-	
+
+

 ALIGN 16
 ;***********************************************************************
 ;void McHorVer22HorFirst_sse2
-;							(uint8_t *pSrc, 
-;							int32_t iSrcStride, 
+;							(uint8_t *pSrc,
+;							int32_t iSrcStride,
 ;							uint8_t * pTap,
 ;							int32_t iTapStride,
 ;							int32_t iWidth,int32_t iHeight);
@ -652,21 +652,21 @@ McHorVer22HorFirst_sse2:
 	push esi
 	push edi
 	push ebx
-	mov esi, [esp+16]     
-	mov eax, [esp+20]	
-	mov edi, [esp+24]	
-	mov edx, [esp+28]	
-	mov ecx, [esp+32]	
-	mov ebx, [esp+36]	
-	pxor xmm7, xmm7	
-	
+	mov esi, [esp+16]
+	mov eax, [esp+20]
+	mov edi, [esp+24]
+	mov edx, [esp+28]
+	mov ecx, [esp+32]
+	mov ebx, [esp+36]
+	pxor xmm7, xmm7
+
 	sub esi, eax				;;;;;;;;need more 5 lines.
 	sub esi, eax
-	
+
 	cmp ecx, 9
-	jne near .width_17	
-	
-.yloop_width_9:	
+	jne near .width_17
+
+.yloop_width_9:
 	movq xmm0, [esi]
 	punpcklbw xmm0, xmm7
 	movq xmm1, [esi+5]
@ -679,7 +679,7 @@ McHorVer22HorFirst_sse2:
 	punpcklbw xmm4, xmm7
 	movq xmm5, [esi+3]
 	punpcklbw xmm5, xmm7
-	
+
 	movdqa xmm7, xmm2
 	paddw   xmm7, xmm3
 	movdqa xmm6, xmm4
@ -690,12 +690,12 @@ McHorVer22HorFirst_sse2:
 	paddw xmm0, xmm6
 	psllw xmm6, 2
 	paddw xmm0, xmm6
-	movd [edi], xmm0	
-	
+	movd [edi], xmm0
+
 	pxor  xmm7, xmm7
 	movq xmm0, [esi+6]
 	punpcklbw xmm0, xmm7
-	
+
 	paddw xmm4, xmm1
 	paddw xmm5, xmm3
 	psllw xmm5, 2
@ -704,9 +704,9 @@ McHorVer22HorFirst_sse2:
 	paddw xmm2, xmm5
 	psllw xmm5, 2
 	paddw xmm2, xmm5
-	movq [edi+2], xmm2	
-	movhps [edi+2+8], xmm2	
-	
+	movq [edi+2], xmm2
+	movhps [edi+2+8], xmm2
+
 	add esi, eax
 	add edi, edx
 	dec ebx
@ -715,8 +715,8 @@ McHorVer22HorFirst_sse2:
 	pop edi
 	pop esi
 	ret
-	
-	
+
+
 .width_17:
 .yloop_width_17:
 	movq xmm0, [esi]
@ -731,7 +731,7 @@ McHorVer22HorFirst_sse2:
 	punpcklbw xmm4, xmm7
 	movq xmm5, [esi+3]
 	punpcklbw xmm5, xmm7
-	
+
 	paddw xmm2, xmm3
 	paddw xmm4, xmm5
 	psllw xmm4, 2
@ -741,7 +741,7 @@ McHorVer22HorFirst_sse2:
 	psllw xmm4, 2
 	paddw xmm0, xmm4
 	movdqa [edi], xmm0
-		
+
 	movq xmm0, [esi+8]
 	punpcklbw xmm0, xmm7
 	movq xmm1, [esi+5+8]
@ -754,7 +754,7 @@ McHorVer22HorFirst_sse2:
 	punpcklbw xmm4, xmm7
 	movq xmm5, [esi+3+8]
 	punpcklbw xmm5, xmm7
-	
+
 	movdqa xmm7, xmm2
 	paddw   xmm7, xmm3
 	movdqa xmm6, xmm4
@ -766,12 +766,12 @@ McHorVer22HorFirst_sse2:
 	psllw xmm6, 2
 	paddw xmm0, xmm6
 	movd [edi+16], xmm0
-	
-	
+
+
 	pxor  xmm7, xmm7
 	movq xmm0, [esi+6+8]
 	punpcklbw xmm0, xmm7
-	
+
 	paddw xmm4, xmm1
 	paddw xmm5, xmm3
 	psllw xmm5, 2
@ -780,9 +780,9 @@ McHorVer22HorFirst_sse2:
 	paddw xmm2, xmm5
 	psllw xmm5, 2
 	paddw xmm2, xmm5
-	movq [edi+18], xmm2	
-	movhps [edi+18+8], xmm2	
-	
+	movq [edi+18], xmm2
+	movhps [edi+18+8], xmm2
+
 	add esi, eax
 	add edi, edx
 	dec ebx
@ -791,23 +791,23 @@ McHorVer22HorFirst_sse2:
 	pop edi
 	pop esi
 	ret
-	
-	
+
+
 %macro FILTER_VER 9
 	paddw  %1, %6
 	movdqa %7, %2
 	movdqa %8, %3
-	
-	
+
+
 	paddw %7, %5
 	paddw %8, %4
-	
-	psubw  %1, %7   
-	psraw   %1, 2	  
-	paddw  %1, %8   
-	psubw  %1, %7 
-	psraw   %1, 2	
-	paddw  %8, %1   
+
+	psubw  %1, %7
+	psraw   %1, 2
+	paddw  %1, %8
+	psubw  %1, %7
+	psraw   %1, 2
+	paddw  %8, %1
 	paddw  %8, [h264_mc_hc_32]
 	psraw   %8, 6
 	packuswb %8, %8
@ -815,8 +815,8 @@ McHorVer22HorFirst_sse2:
 %endmacro
 ;***********************************************************************
 ;void McHorVer22VerLastAlign_sse2(
-;											uint8_t *pTap, 
-;											int32_t iTapStride, 
+;											uint8_t *pTap,
+;											int32_t iTapStride,
 ;											uint8_t * pDst,
 ;											int32_t iDstStride,
 ;											int32_t iWidth,
@ -828,15 +828,15 @@ McHorVer22HorFirst_sse2:
 	push edi
 	push ebx
 	push ebp
-	
+
 	mov esi, [esp+20]
 	mov eax, [esp+24]
 	mov edi, [esp+28]
 	mov edx, [esp+32]
 	mov ebx, [esp+36]
-	mov ecx, [esp+40]	
-	shr ebx, 3	
-	
+	mov ecx, [esp+40]
+	shr ebx, 3
+
 .width_loop:
 	movdqa xmm0, [esi]
 	movdqa xmm1, [esi+eax]
@ -846,73 +846,73 @@ McHorVer22HorFirst_sse2:
 	lea esi, [esi+2*eax]
 	movdqa xmm4, [esi]
 	movdqa xmm5, [esi+eax]
-	
+
 	FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
 	dec ecx
 	lea esi, [esi+2*eax]
 	movdqa xmm6, [esi]
-	
+
 	movdqa xmm0, xmm1
 	movdqa xmm1, xmm2
 	movdqa xmm2, xmm3
 	movdqa xmm3, xmm4
 	movdqa xmm4, xmm5
 	movdqa xmm5, xmm6
-	
+
 	add edi, edx
-	sub esi, eax		
-	
+	sub esi, eax
+
 .start:
 	FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea esi, [esi+2*eax]
 	movdqa xmm6, [esi]
 	FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[edi+edx]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea edi, [edi+2*edx]
 	movdqa xmm7, [esi+eax]
 	FILTER_VER  xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [edi]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea esi, [esi+2*eax]
 	movdqa xmm0, [esi]
 	FILTER_VER  xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[edi+edx]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea edi, [edi+2*edx]
 	movdqa xmm1, [esi+eax]
 	FILTER_VER  xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[edi]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea esi, [esi+2*eax]
 	movdqa xmm2, [esi]
 	FILTER_VER  xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[edi+edx]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea edi, [edi+2*edx]
 	movdqa xmm3, [esi+eax]
 	FILTER_VER  xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[edi]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea esi, [esi+2*eax]
 	movdqa xmm4, [esi]
 	FILTER_VER  xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [edi+edx]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea edi, [edi+2*edx]
 	movdqa xmm5, [esi+eax]
 	jmp near .start
-	
+
 .x_loop_dec:
 	dec ebx
 	jz near .exit
@ -922,9 +922,9 @@ McHorVer22HorFirst_sse2:
 	add esi, 16
 	add edi, 8
 	jmp .width_loop
-	
-	
-	
+
+
+
 .exit:
 	pop ebp
 	pop ebx
@ -934,8 +934,8 @@ McHorVer22HorFirst_sse2:

 ;***********************************************************************
 ;void McHorVer22VerLastUnAlign_sse2(
-;											uint8_t *pTap, 
-;											int32_t iTapStride, 
+;											uint8_t *pTap,
+;											int32_t iTapStride,
 ;											uint8_t * pDst,
 ;											int32_t iDstStride,
 ;											int32_t iWidth,
@ -947,15 +947,15 @@ McHorVer22HorFirst_sse2:
 	push edi
 	push ebx
 	push ebp
-	
+
 	mov esi, [esp+20]
 	mov eax, [esp+24]
 	mov edi, [esp+28]
 	mov edx, [esp+32]
 	mov ebx, [esp+36]
-	mov ecx, [esp+40]	
-	shr ebx, 3	
-	
+	mov ecx, [esp+40]
+	shr ebx, 3
+
 .width_loop:
 	movdqu xmm0, [esi]
 	movdqu xmm1, [esi+eax]
@ -965,73 +965,73 @@ McHorVer22HorFirst_sse2:
 	lea esi, [esi+2*eax]
 	movdqu xmm4, [esi]
 	movdqu xmm5, [esi+eax]
-	
+
 	FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
 	dec ecx
 	lea esi, [esi+2*eax]
 	movdqu xmm6, [esi]
-	
+
 	movdqa xmm0, xmm1
 	movdqa xmm1, xmm2
 	movdqa xmm2, xmm3
 	movdqa xmm3, xmm4
 	movdqa xmm4, xmm5
 	movdqa xmm5, xmm6
-	
+
 	add edi, edx
-	sub esi, eax		
-	
+	sub esi, eax
+
 .start:
 	FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea esi, [esi+2*eax]
 	movdqu xmm6, [esi]
 	FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[edi+edx]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea edi, [edi+2*edx]
 	movdqu xmm7, [esi+eax]
 	FILTER_VER  xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [edi]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea esi, [esi+2*eax]
 	movdqu xmm0, [esi]
 	FILTER_VER  xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[edi+edx]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea edi, [edi+2*edx]
 	movdqu xmm1, [esi+eax]
 	FILTER_VER  xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[edi]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea esi, [esi+2*eax]
 	movdqu xmm2, [esi]
 	FILTER_VER  xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[edi+edx]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea edi, [edi+2*edx]
 	movdqu xmm3, [esi+eax]
 	FILTER_VER  xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[edi]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea esi, [esi+2*eax]
 	movdqu xmm4, [esi]
 	FILTER_VER  xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [edi+edx]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea edi, [edi+2*edx]
 	movdqu xmm5, [esi+eax]
 	jmp near .start
-	
+
 .x_loop_dec:
 	dec ebx
 	jz near .exit
@ -1041,9 +1041,9 @@ McHorVer22HorFirst_sse2:
 	add esi, 16
 	add edi, 8
 	jmp .width_loop
-	
-	
-	
+
+
+
 .exit:
 	pop ebp
 	pop ebx
--- a/codec/encoder/core/asm/memzero.asm
+++ b/codec/encoder/core/asm/memzero.asm
@ -32,7 +32,7 @@
 ;*  memzero.asm
 ;*
 ;*  Abstract
-;*      
+;*
 ;*
 ;*  History
 ;*      9/16/2009 Created
@ -47,8 +47,8 @@ BITS 32
 ; Code
 ;***********************************************************************

-SECTION .text			
-		
+SECTION .text
+
 ALIGN 16
 ;***********************************************************************
 ;_inline void __cdecl WelsPrefetchZero_mmx(int8_t const*_A);
@ -57,7 +57,7 @@ WELS_EXTERN WelsPrefetchZero_mmx
 WelsPrefetchZero_mmx:
 	mov  eax,[esp+4]
 	prefetchnta [eax]
-	ret 			
+	ret


 ALIGN 16
@ -69,7 +69,7 @@ WelsSetMemZeroAligned64_sse2:
 		mov		eax,	[esp + 4]          ; dst
 		mov		ecx,	[esp + 8]
 		neg		ecx
-			
+
 		pxor	xmm0,		xmm0
 .memzeroa64_sse2_loops:
 		movdqa	[eax],		xmm0
@ -77,11 +77,11 @@ WelsSetMemZeroAligned64_sse2:
 		movdqa	[eax+32],	xmm0
 		movdqa	[eax+48],	xmm0
 		add		eax, 0x40
-		
+
 		add ecx, 0x40
 		jnz near .memzeroa64_sse2_loops
-			
-		ret	
+
+		ret

 ALIGN 16
 ;***********************************************************************
@ -92,7 +92,7 @@ WelsSetMemZeroSize64_mmx:
 		mov		eax,	[esp + 4]          ; dst
 		mov		ecx,	[esp + 8]
 		neg		ecx
-			
+
 		pxor	mm0,		mm0
 .memzero64_mmx_loops:
 		movq	[eax],		mm0
@ -102,16 +102,16 @@ WelsSetMemZeroSize64_mmx:
 		movq	[eax+32],	mm0
 		movq	[eax+40],	mm0
 		movq	[eax+48],	mm0
-		movq	[eax+56],	mm0		
+		movq	[eax+56],	mm0
 		add		eax,		0x40
-		
+
 		add ecx, 0x40
 		jnz near .memzero64_mmx_loops
-			
-		WELSEMMS	
-		ret	
-	
-ALIGN 16		
+
+		WELSEMMS
+		ret
+
+ALIGN 16
 ;***********************************************************************
 ;   void WelsSetMemZeroSize8_mmx(void *dst, int32_t size)
 ;***********************************************************************
@ -119,17 +119,17 @@ WELS_EXTERN WelsSetMemZeroSize8_mmx
 WelsSetMemZeroSize8_mmx:
 		mov		eax,	[esp + 4]		; dst
 		mov		ecx,	[esp + 8]		; size
-		neg		ecx			
+		neg		ecx
 		pxor	mm0,		mm0
-		
+
 .memzero8_mmx_loops:
 		movq	[eax],		mm0
 		add		eax,		0x08
-	
+
 		add		ecx,		0x08
 		jnz near .memzero8_mmx_loops
-		
-		WELSEMMS	
-		ret	

-							
+		WELSEMMS
+		ret
+
+
--- a/codec/encoder/core/asm/quant.asm
+++ b/codec/encoder/core/asm/quant.asm
@ -44,17 +44,17 @@

 BITS 32

-SECTION .text	
+SECTION .text
 ;************************************************
-;NEW_QUANT 
+;NEW_QUANT
 ;************************************************

 %macro SSE2_Quant8  5
 		MOVDQ	%1, %5
-		pxor	%2, %2							
-		pcmpgtw	%2, %1							
-		pxor	%1, %2							
-		psubw	%1, %2							
+		pxor	%2, %2
+		pcmpgtw	%2, %1
+		pxor	%1, %2
+		psubw	%1, %2
 		paddusw	%1, %3
 		pmulhuw	%1, %4
 		pxor	%1, %2
@ -64,10 +64,10 @@ SECTION .text

 %macro SSE2_QuantMax8  6
 		MOVDQ	%1, %5
-		pxor	%2, %2							
-		pcmpgtw	%2, %1							
-		pxor	%1, %2							
-		psubw	%1, %2								
+		pxor	%2, %2
+		pcmpgtw	%2, %1
+		pxor	%1, %2
+		psubw	%1, %2
 		paddusw	%1, %3
 		pmulhuw	%1, %4
 		pmaxsw	%6, %1
@ -86,17 +86,17 @@ SECTION .text
 WELS_EXTERN WelsQuant4x4_sse2
 align 16
 WelsQuant4x4_sse2:
-		mov		eax,  [ff]		
-		mov		ecx,  [mf]			
+		mov		eax,  [ff]
+		mov		ecx,  [mf]
 		MOVDQ	xmm2, [eax]
 		MOVDQ	xmm3, [ecx]
-		
+
 		mov		edx,  [pDct]
 		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx]
-		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx + 0x10]	
+		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx + 0x10]

 		ret
-	
+
 ;***********************************************************************
 ;void WelsQuant4x4Dc_sse2(int16_t *pDct, const int16_t ff, int16_t mf);
 ;***********************************************************************
@ -104,37 +104,37 @@ WELS_EXTERN WelsQuant4x4Dc_sse2
 align 16
 WelsQuant4x4Dc_sse2:
 		mov		ax,		[mf]
-		SSE2_Copy8Times xmm3, eax						
-		
+		SSE2_Copy8Times xmm3, eax
+
 		mov		cx, [ff]
-		SSE2_Copy8Times xmm2, ecx						
+		SSE2_Copy8Times xmm2, ecx

 		mov		edx,  [pDct]
 		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx]
 		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx + 0x10]
-				
-		ret		
-		
+
+		ret
+
 ;***********************************************************************
 ;	void WelsQuantFour4x4_sse2(int16_t *pDct, int16_t* ff,  int16_t *mf);
 ;***********************************************************************
 WELS_EXTERN WelsQuantFour4x4_sse2
 align 16
 WelsQuantFour4x4_sse2:
-		mov		eax,  [ff]		
-		mov		ecx,  [mf]			
+		mov		eax,  [ff]
+		mov		ecx,  [mf]
 		MOVDQ	xmm2, [eax]
 		MOVDQ	xmm3, [ecx]
-		
-		mov		edx,  [pDct]	
+
+		mov		edx,  [pDct]
 		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx]
-		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx + 0x10]	
+		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx + 0x10]
 		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx + 0x20]
 		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx + 0x30]
 		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx + 0x40]
 		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx + 0x50]
 		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx + 0x60]
-		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx + 0x70]	
+		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx + 0x70]

 		ret

@ -144,17 +144,17 @@ WelsQuantFour4x4_sse2:
 WELS_EXTERN WelsQuantFour4x4Max_sse2
 align 16
 WelsQuantFour4x4Max_sse2:
-		mov		eax,  [ff]		
-		mov		ecx,  [mf]			
+		mov		eax,  [ff]
+		mov		ecx,  [mf]
 		MOVDQ	xmm2, [eax]
 		MOVDQ	xmm3, [ecx]
-		
-		mov		edx,  [pDct]		
+
+		mov		edx,  [pDct]
 		pxor	xmm4, xmm4
 		pxor	xmm5, xmm5
 		pxor	xmm6, xmm6
 		pxor	xmm7, xmm7
-		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [edx	   ], xmm4		
+		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [edx	   ], xmm4
 		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [edx + 0x10], xmm4
 		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [edx + 0x20], xmm5
 		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [edx + 0x30], xmm5
@ -162,19 +162,19 @@ WelsQuantFour4x4Max_sse2:
 		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [edx + 0x50], xmm6
 		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [edx + 0x60], xmm7
 		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [edx + 0x70], xmm7
-		
+
 		SSE2_TransTwo4x4W xmm4, xmm5, xmm6, xmm7, xmm0
-		pmaxsw  xmm0,  xmm4	
+		pmaxsw  xmm0,  xmm4
 		pmaxsw  xmm0,  xmm5
-		pmaxsw  xmm0,  xmm7			
+		pmaxsw  xmm0,  xmm7
 		movdqa	xmm1,  xmm0
 		punpckhqdq	xmm0, xmm1
 		pmaxsw	xmm0, xmm1

-		mov		edx,  [max]	
-		movq	[edx], xmm0	
-			
-		ret		
+		mov		edx,  [max]
+		movq	[edx], xmm0
+
+		ret

 %macro  MMX_Copy4Times 2
 		movd		%1, %2
@ -185,10 +185,10 @@ WelsQuantFour4x4Max_sse2:
 SECTION .text

 %macro MMX_Quant4  4
-		pxor	%2, %2							
-		pcmpgtw	%2, %1							
-		pxor	%1, %2							
-		psubw	%1, %2							
+		pxor	%2, %2
+		pcmpgtw	%2, %1
+		pxor	%1, %2
+		psubw	%1, %2
 		paddusw	%1, %3
 		pmulhuw	%1, %4
 		pxor	%1, %2
@ -211,13 +211,13 @@ WelsHadamardQuant2x2_mmx:
 		movd		mm3,			[eax + 0x40]
 		movd		mm1,			[eax + 0x60]
 		punpcklwd	mm3,			mm1
-		
+
 		mov			cx,				0
 		mov			[eax],			cx
 		mov			[eax + 0x20],	cx
 		mov			[eax + 0x40],	cx
 		mov			[eax + 0x60],	cx
-		
+
 		;hdm_2x2,	mm0 = dct0 dct1, mm3 = dct2 dct3
 		movq		mm5,			mm3
 		paddw		mm3,			mm0
@ -229,32 +229,32 @@ WelsHadamardQuant2x2_mmx:
 		paddw		mm1,			mm3
 		psubw		mm3,			mm5
 		punpcklwd	mm1,			mm3
-		
+
 		;quant_2x2_dc
 		mov			ax,				[mf]
-		MMX_Copy4Times	mm3,		eax		
+		MMX_Copy4Times	mm3,		eax
 		mov			cx,				[ff]
 		MMX_Copy4Times	mm2,		ecx
 		MMX_Quant4		mm1,	mm0,	mm2,	mm3
-		
+
 		; store dct_2x2
-		mov			edx,			[dct2x2]	
+		mov			edx,			[dct2x2]
 		movq		[edx],			mm1
 		mov			ecx,			[iChromaDc]
 		movq		[ecx],			mm1
-		
+
 		; pNonZeroCount of dct_2x2
-		pcmpeqb		mm2,			mm2		; mm2 = FF 
+		pcmpeqb		mm2,			mm2		; mm2 = FF
 		pxor		mm3,			mm3
 		packsswb	mm1,			mm3
 		pcmpeqb		mm1,			mm3		; set FF if equal, 0 if not equal
 		psubsb		mm1,			mm2		; set 0 if equal, 1 if not equal
 		psadbw		mm1,			mm3		;
 		movd		eax,			mm1
-					
+
 		WELSEMMS
 		ret
-	
+
 ;***********************************************************************
 ;int32_t WelsHadamardQuant2x2Skip_mmx(int16_t *pDct, int16_t ff,  int16_t mf);
 ;***********************************************************************
@ -269,7 +269,7 @@ WelsHadamardQuant2x2Skip_mmx:
 		movd		mm3,			[eax + 0x40]
 		movd		mm1,			[eax + 0x60]
 		punpcklwd	mm3,			mm1
-		
+
 		;hdm_2x2,	mm0 = dct0 dct1, mm3 = dct2 dct3
 		movq		mm5,			mm3
 		paddw		mm3,			mm0
@ -281,32 +281,32 @@ WelsHadamardQuant2x2Skip_mmx:
 		paddw		mm1,			mm3
 		psubw		mm3,			mm5
 		punpcklwd	mm1,			mm3
-		
+
 		;quant_2x2_dc
 		mov			ax,				[mf]
-		MMX_Copy4Times	mm3,		eax		
+		MMX_Copy4Times	mm3,		eax
 		mov			cx,				[ff]
 		MMX_Copy4Times	mm2,		ecx
 		MMX_Quant4		mm1,	mm0,	mm2,	mm3
-		
+
 		; pNonZeroCount of dct_2x2
-		pcmpeqb		mm2,			mm2		; mm2 = FF 
+		pcmpeqb		mm2,			mm2		; mm2 = FF
 		pxor		mm3,			mm3
 		packsswb	mm1,			mm3
 		pcmpeqb		mm1,			mm3		; set FF if equal, 0 if not equal
 		psubsb		mm1,			mm2		; set 0 if equal, 1 if not equal
 		psadbw		mm1,			mm3		;
 		movd		eax,			mm1
-			
-		WELSEMMS		
-		ret	
-		
-		
-%macro SSE2_DeQuant8 3  
+
+		WELSEMMS
+		ret
+
+
+%macro SSE2_DeQuant8 3
    MOVDQ  %2, %1
    pmullw %2, %3
    MOVDQ  %1, %2
-%endmacro 
+%endmacro


 ALIGN  16
@ -329,7 +329,7 @@ WelsDequant4x4_sse2:
 ;***********************************************************************====
 ;void WelsDequantFour4x4_sse2(int16_t *pDct, const uint16_t* mf);
 ;***********************************************************************====
-    
+
 align 16

 WELS_EXTERN WelsDequantFour4x4_sse2
@ -356,39 +356,38 @@ WelsDequantFour4x4_sse2:
 WELS_EXTERN WelsDequantIHadamard4x4_sse2
 align 16
 WelsDequantIHadamard4x4_sse2:
-		mov			eax,			[esp + 4]				
+		mov			eax,			[esp + 4]
 		mov			cx,				[esp + 8]
-		
+
 		; WelsDequantLumaDc4x4
-		SSE2_Copy8Times	xmm1,		ecx		
+		SSE2_Copy8Times	xmm1,		ecx
 		;psrlw		xmm1,		2		; for the (>>2) in ihdm
 		MOVDQ		xmm0,		[eax]
 		MOVDQ		xmm2,		[eax+0x10]
-		pmullw		xmm0,		xmm1		
+		pmullw		xmm0,		xmm1
 		pmullw		xmm2,		xmm1

 		; ihdm_4x4
 		movdqa		xmm1,		xmm0
 		psrldq		xmm1,		8
 		movdqa		xmm3,		xmm2
-		psrldq		xmm3,		8		
-		
-		SSE2_SumSub		xmm0, xmm3,	xmm5					; xmm0 = xmm0 - xmm3, xmm3 = xmm0 + xmm3	
-		SSE2_SumSub		xmm1, xmm2, xmm5					; xmm1 = xmm1 - xmm2, xmm2 = xmm1 + xmm2														
+		psrldq		xmm3,		8
+
+		SSE2_SumSub		xmm0, xmm3,	xmm5					; xmm0 = xmm0 - xmm3, xmm3 = xmm0 + xmm3
+		SSE2_SumSub		xmm1, xmm2, xmm5					; xmm1 = xmm1 - xmm2, xmm2 = xmm1 + xmm2
 		SSE2_SumSub		xmm3, xmm2, xmm5					; xmm3 = xmm3 - xmm2, xmm2 = xmm3 + xmm2
 		SSE2_SumSub		xmm0, xmm1, xmm5               		; xmm0 = xmm0 - xmm1, xmm1 = xmm0 + xmm1

-		SSE2_TransTwo4x4W	xmm2, xmm1, xmm3, xmm0, xmm4		
-		SSE2_SumSub		xmm2, xmm4,	xmm5		
-		SSE2_SumSub		xmm1, xmm0, xmm5																		
-		SSE2_SumSub		xmm4, xmm0, xmm5							
-		SSE2_SumSub		xmm2, xmm1, xmm5 
+		SSE2_TransTwo4x4W	xmm2, xmm1, xmm3, xmm0, xmm4
+		SSE2_SumSub		xmm2, xmm4,	xmm5
+		SSE2_SumSub		xmm1, xmm0, xmm5
+		SSE2_SumSub		xmm4, xmm0, xmm5
+		SSE2_SumSub		xmm2, xmm1, xmm5
 		SSE2_TransTwo4x4W	xmm0, xmm1, xmm4, xmm2, xmm3
-		
+
 		punpcklqdq	xmm0,		xmm1
 		MOVDQ		[eax],		xmm0
-		
+
 		punpcklqdq	xmm2,		xmm3
-		MOVDQ		[eax+16],	xmm2			
+		MOVDQ		[eax+16],	xmm2
 		ret
-	
--- a/codec/encoder/core/asm/satd_sad.asm
+++ b/codec/encoder/core/asm/satd_sad.asm
--- a/codec/encoder/core/asm/score.asm
+++ b/codec/encoder/core/asm/score.asm
@ -45,7 +45,7 @@
 bits 32

 ;***********************************************************************
-; Macros 
+; Macros
 ;***********************************************************************

 ;***********************************************************************
@ -59,7 +59,7 @@ align 16
 sse2_1: dw 1, 1, 1, 1, 1, 1, 1, 1
 align 16
 sse2_b1: db 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
-i_ds_table: db 3, 2, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 
+i_ds_table: db 3, 2, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 align 16
 sse2_plane_inc_minus: dw -7, -6, -5, -4, -3, -2, -1, 0
 align 16
@ -139,7 +139,7 @@ low_mask_table:
    db  4, 8, 5, 8, 8,12, 1, 4, 4, 8
    db  4, 7, 7,11, 4, 8, 7,11, 8,11
    db 11,15, 1, 4, 3, 7, 4, 7, 7,11
-    db  3, 7, 6,10, 7,10,10,14, 4, 7 
+    db  3, 7, 6,10, 7,10,10,14, 4, 7
    db  7,11, 7,10,10,14, 7,11,10,14
    db 11,14,14,18, 0, 4, 3, 7, 3, 6
    db  6,10, 3, 7, 6,10, 7,10,10,14
@ -191,7 +191,7 @@ WelsScan4x4DcAc_sse2:
 	movdqa     [eax],xmm0
 	movdqa     [eax+16], xmm1
 	ret
-	
+
 ;***********************************************************************
 ;void WelsScan4x4DcAc_ssse3( int16_t level[16], int16_t *pDct )
 ;***********************************************************************
@ -206,7 +206,7 @@ WelsScan4x4DcAc_ssse3:
 	pinsrw		xmm0, eax, 7			; xmm0[7]	=	[8]
 	pinsrw		xmm1, ecx, 0			; xmm1[0]	=	[7]
 	pshufb		xmm1, [pb_scanacdc_maskb]
-	pshufb		xmm0, [pb_scanacdc_maska]	
+	pshufb		xmm0, [pb_scanacdc_maska]

 	mov        eax,  [esp+4]
 	movdqa     [eax],xmm0
@ -224,7 +224,7 @@ WelsScan4x4Ac_sse2:
 	movdqa     xmm2, xmm0
 	punpcklqdq xmm0, xmm1
 	punpckhqdq xmm2, xmm1
-	
+
 	movdqa     xmm3, xmm0
 	punpckldq  xmm0, xmm2
 	punpckhdq  xmm3, xmm2
@ -236,10 +236,10 @@ WelsScan4x4Ac_sse2:
 	pextrw     edx,  xmm3, 0
 	pinsrw     xmm3, eax,  0
 	pinsrw     xmm0, edx,  3
-	
+
 	pshufhw    xmm1, xmm0, 0x93
 	pshuflw    xmm2, xmm3, 0x39
-    
+
    movdqa     xmm3, xmm2
    psrldq     xmm1, 2
    pslldq     xmm3, 14
@ -255,13 +255,13 @@ WelsScan4x4Ac_sse2:
 ;void int32_t WelsCalculateSingleCtr4x4_sse2( int16_t *pDct );
 ;***********************************************************************
 ALIGN 16
-WELS_EXTERN WelsCalculateSingleCtr4x4_sse2 
+WELS_EXTERN WelsCalculateSingleCtr4x4_sse2
 WelsCalculateSingleCtr4x4_sse2:
 	push      ebx
 	mov       eax,  [esp+8]
 	movdqa    xmm0, [eax]
 	movdqa    xmm1, [eax+16]
-	
+
 	packsswb  xmm0, xmm1

    pxor      xmm3, xmm3
@ -317,7 +317,7 @@ WelsGetNoneZeroCount_sse2:
 	and       edx,  0xff
 	shr       ecx,  8
 ;	and       ecx,  0xff	; we do not need this due to high 16bits equal to 0 yet
-	xor       eax,  eax	
+	xor       eax,  eax
 	add       al,  [nozero_count_table+ecx]
 	add       al,  [nozero_count_table+edx]
 	ret
--- a/codec/encoder/core/asm/vaa.asm
+++ b/codec/encoder/core/asm/vaa.asm
@ -38,7 +38,7 @@
 ;*      04/14/2010	Created
 ;*		06/07/2010	Added AnalysisVaaInfoIntra_sse2(ssse3)
 ;*		06/10/2010	Tune rc_sad_frame_sse2 and got about 40% improvement
-;*		08/11/2010	Added abs_difference_mbrow_sse2 & sum_sqrsum_mbrow_sse2 
+;*		08/11/2010	Added abs_difference_mbrow_sse2 & sum_sqrsum_mbrow_sse2
 ;*
 ;*************************************************************************/
 %include "asm_inc.asm"
@ -167,7 +167,7 @@ AnalysisVaaInfoIntra_sse2:
 	mov ebp, esp
 	and ebp, 0fh
 	sub esp, ebp
-	sub esp, 32	
+	sub esp, 32
 	%define PUSH_SIZE	52	; 20 + 32

 	mov esi, [esp+ebp+PUSH_SIZE+4]	; data_y
@ -179,31 +179,31 @@ AnalysisVaaInfoIntra_sse2:
 	add edx, ecx		; iLineSize x 3 [edx]
 	mov eax, ebx
 	sal eax, $1			; iLineSize x 4 [eax]
-	
+
 	pxor xmm7, xmm7
-	
+
 	; loops
 	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
-	movq [esp], xmm0	
+	movq [esp], xmm0

 	lea esi, [esi+eax]
 	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
-	movq [esp+8], xmm0	
+	movq [esp+8], xmm0

 	lea esi, [esi+eax]
 	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
-	movq [esp+16], xmm0	
+	movq [esp+16], xmm0

 	lea esi, [esi+eax]
 	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
 	movq [esp+24], xmm0
-		
+
 	movdqa xmm0, [esp]		; block 0~7
 	movdqa xmm1, [esp+16]	; block 8~15
 	movdqa xmm2, xmm0
 	paddw xmm0, xmm1
 	SUM_WORD_8x2_SSE2 xmm0, xmm3
-	
+
 	pmullw xmm1, xmm1
 	pmullw xmm2, xmm2
 	movdqa xmm3, xmm1
@ -219,7 +219,7 @@ AnalysisVaaInfoIntra_sse2:
 	paddd xmm1, xmm2
 	pshufd xmm2, xmm1, 0B1h
 	paddd xmm1, xmm2
-	
+
 	movd ebx, xmm0
 	and ebx, 0ffffh		; effective low word truncated
 	mov ecx, ebx
@ -227,7 +227,7 @@ AnalysisVaaInfoIntra_sse2:
 	sar ebx, $4
 	movd eax, xmm1
 	sub eax, ebx
-	
+
 	%undef PUSH_SIZE
 	add esp, 32
 	add esp, ebp
@ -253,7 +253,7 @@ AnalysisVaaInfoIntra_ssse3:
 	mov ebp, esp
 	and ebp, 0fh
 	sub esp, ebp
-	sub esp, 32	
+	sub esp, 32
 	%define PUSH_SIZE	52	; 20 + 32

 	mov esi, [esp+ebp+PUSH_SIZE+4]	; data_y
@ -265,25 +265,25 @@ AnalysisVaaInfoIntra_ssse3:
 	add edx, ecx		; iLineSize x 3 [edx]
 	mov eax, ebx
 	sal eax, $1			; iLineSize x 4 [eax]
-	
+
 	pxor xmm7, xmm7
-	
+
 	; loops
 	VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
-	movq [esp], xmm0	
+	movq [esp], xmm0

 	lea esi, [esi+eax]
 	VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
-	movq [esp+8], xmm1	
+	movq [esp+8], xmm1

 	lea esi, [esi+eax]
 	VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
-	movq [esp+16], xmm0	
+	movq [esp+16], xmm0

 	lea esi, [esi+eax]
 	VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
 	movq [esp+24], xmm1
-		
+
 	movdqa xmm0, [esp]		; block 0~7
 	movdqa xmm1, [esp+16]	; block 8~15
 	movdqa xmm2, xmm0
@ -305,7 +305,7 @@ AnalysisVaaInfoIntra_ssse3:
 	paddd xmm1, xmm2
 	pshufd xmm2, xmm1, 0B1h
 	paddd xmm1, xmm2
-	
+
 	movd ebx, xmm0
 	and ebx, 0ffffh		; effective low work truncated
 	mov ecx, ebx
@ -313,7 +313,7 @@ AnalysisVaaInfoIntra_ssse3:
 	sar ebx, $4
 	movd eax, xmm1
 	sub eax, ebx
-	
+
 	%undef PUSH_SIZE
 	add esp, 32
 	add esp, ebp
@ -323,7 +323,7 @@ AnalysisVaaInfoIntra_ssse3:
 	pop edx
 	pop ebx
 	ret
-	
+
 WELS_EXTERN MdInterAnalysisVaaInfo_sse41
 ;***********************************************************************
 ;	uint8_t MdInterAnalysisVaaInfo_sse41( int32_t *pSad8x8 )
@ -331,18 +331,18 @@ WELS_EXTERN MdInterAnalysisVaaInfo_sse41
 ALIGN 16
 MdInterAnalysisVaaInfo_sse41:
 	mov eax, [esp+4]
-	movdqa xmm0, [eax]	; load 4 sad_8x8	
+	movdqa xmm0, [eax]	; load 4 sad_8x8
 	pshufd xmm1, xmm0, 01Bh
 	paddd xmm1, xmm0
 	pshufd xmm2, xmm1, 0B1h
-	paddd xmm1, xmm2	
+	paddd xmm1, xmm2
 	psrad xmm1, 02h		; iAverageSad
 	movdqa xmm2, xmm1
 	psrad xmm2, 06h
 	movdqa xmm3, xmm0	; iSadBlock
 	psrad xmm3, 06h
 	psubd xmm3, xmm2
-	pmulld xmm3, xmm3	; [comment]: pmulld from SSE4.1 instruction sets	
+	pmulld xmm3, xmm3	; [comment]: pmulld from SSE4.1 instruction sets
 	pshufd xmm4, xmm3, 01Bh
 	paddd xmm4, xmm3
 	pshufd xmm3, xmm4, 0B1h
@ -354,7 +354,7 @@ MdInterAnalysisVaaInfo_sse41:
 	pcmpgtd xmm0, xmm1	; iSadBlock > iAverageSad
 	movmskps eax, xmm0
 	ret
-.threshold_exit:	
+.threshold_exit:
 	mov eax, 15
 	ret

@ -365,28 +365,28 @@ WELS_EXTERN MdInterAnalysisVaaInfo_sse2
 ALIGN 16
 MdInterAnalysisVaaInfo_sse2:
 	mov eax, [esp+4]
-	movdqa xmm0, [eax]	; load 4 sad_8x8	
+	movdqa xmm0, [eax]	; load 4 sad_8x8
 	pshufd xmm1, xmm0, 01Bh
 	paddd xmm1, xmm0
 	pshufd xmm2, xmm1, 0B1h
-	paddd xmm1, xmm2	
+	paddd xmm1, xmm2
 	psrad xmm1, 02h		; iAverageSad
 	movdqa xmm2, xmm1
 	psrad xmm2, 06h
 	movdqa xmm3, xmm0	; iSadBlock
 	psrad xmm3, 06h
 	psubd xmm3, xmm2
-	
+
 	; to replace pmulld functionality as below
-	movdqa xmm2, xmm3	
+	movdqa xmm2, xmm3
 	pmuludq xmm2, xmm3
 	pshufd xmm4, xmm3, 0B1h
 	pmuludq xmm4, xmm4
 	movdqa xmm5, xmm2
 	punpckldq xmm5, xmm4
 	punpckhdq xmm2, xmm4
-	punpcklqdq xmm5, xmm2	
-	
+	punpcklqdq xmm5, xmm2
+
 	pshufd xmm4, xmm5, 01Bh
 	paddd xmm4, xmm5
 	pshufd xmm5, xmm4, 0B1h
@ -398,6 +398,6 @@ MdInterAnalysisVaaInfo_sse2:
 	pcmpgtd xmm0, xmm1	; iSadBlock > iAverageSad
 	movmskps eax, xmm0
 	ret
-.threshold_exit:	
+.threshold_exit:
 	mov eax, 15
 	ret
--- a/codec/encoder/plus/res/welsenc.rc
+++ b/codec/encoder/plus/res/welsenc.rc
@ -27,18 +27,18 @@ LANGUAGE LANG_CHINESE, SUBLANG_CHINESE_SIMPLIFIED
 // TEXTINCLUDE
 //

-1 TEXTINCLUDE 
+1 TEXTINCLUDE
 BEGIN
    "resource.h\0"
 END

-2 TEXTINCLUDE 
+2 TEXTINCLUDE
 BEGIN
    "#include ""windows.h""\r\n"
    "\0"
 END

-3 TEXTINCLUDE 
+3 TEXTINCLUDE
 BEGIN
    "\r\n"
    "\0"
--- a/processing/build/linux/makefile
+++ b/processing/build/linux/makefile
@ -1,94 +1,94 @@
-NASM = 1
-NAME      = libwelsvp
-
-OUTDIR    = ../../../bin/linux
-BINDIR    = ../../bin
-OBJDIR    = ../../obj  
-SRCDIRS   = ../../src/asm \
-            ../../src/common \
-            ../../src/adaptivequantization \
-            ../../src/backgounddetection \
-            ../../src/denoise \
-            ../../src/downsample \
-            ../../src/scenechangedetection \
-            ../../src/vaacalc \
-            ../../src/complexityanalysis 
-SRCDIRS  += ../../src/imagerotate
-
-
-TARGETLIB =  $(BINDIR)/$(NAME).so
-
-CC        = $(shell which gcc)
-AS        = $(shell which nasm)
-GCC       = gcc -m32
-
-CPPFLAGS  = -Wall -g -O3
-ifeq ($(NASM), 1)
-CPPFLAGS += -DX86_ASM
-endif
-ASMFLAGS  = -f elf -DNOPREFIX  -I ../../src/asm/
-LDFLAGS   = -lstdc++ -ldl
-          
-SRCEXTS  = .cpp
-ifeq ($(NASM), 1)
-SRCEXTS += .asm
-endif
-HDREXTS  = .h
-SOURCES  = $(foreach d,$(SRCDIRS),$(wildcard $(addprefix $(d)/*,$(SRCEXTS))))
-HEADERS  = $(foreach d,$(SRCDIRS),$(wildcard $(addprefix $(d)/*,$(HDREXTS))))
-SRC_CPP  = $(filter %.cpp,$(SOURCES))
-SRC_ASM  = $(filter %.asm,$(SOURCES))
-OBJS     = $(addsuffix .o, $(basename $(SOURCES)))
-DEPS     = $(OBJS:.o=.d)
-
-DEP_OPT  = $(shell if `$(CC) --version | grep "GCC" >/dev/null`; then \
-                  echo "-MM -MP"; else echo "-M"; fi )
-DEPEND_cpp.d  = $(subst -g ,,$(CC) $(DEP_OPT) $(CPPFLAGS))
-DEPEND_asm.d  = $(subst -g ,,$(AS) $(DEP_OPT) $(ASMFLAGS))
-COMPILE.cpp   = $(GCC) $(CPPFLAGS) -c
-COMPILE.asm   = $(AS)  $(ASMFLAGS)
-LINK          = $(GCC) $(LDFLAGS)
-
-.PHONY: all objs tags ctags clean distclean
-
-.SUFFIXES:
-
-all: $(TARGETLIB)
-	
-%.d:%.cpp
-	@echo -n $(dir $<) > $@
-	@$(DEPEND_cpp.d) $< >> $@
-	
-%.d:%.asm
-	@echo -n $(dir $<) > $@
-	@$(DEPEND_asm.d) $< >> $@
-
-objs:$(OBJS)
-
-%.o:%.cpp
-	$(COMPILE.cpp) $< -o $@
-	
-%.o:%.asm
-	$(COMPILE.asm) $< -o $@	
-
-tags: $(HEADERS) $(SOURCES)
-	etags $(HEADERS) $(SOURCES)
-
-ctags: $(HEADERS) $(SOURCES)
-	ctags $(HEADERS) $(SOURCES)
-
-$(TARGETLIB):$(OBJS)
-	@if test ! -d $(BINDIR) ; then mkdir -p $(BINDIR) ; fi
-	$(LINK) $(OBJS) -shared -Wl,-Bsymbolic -o $@
-	@echo produce the lib to $(TARGETLIB).
-	@if test ! -d $(OUTDIR) ; then mkdir -p $(OUTDIR) ; fi
-	@cp -f $(TARGETLIB) $(OUTDIR)
-	@cp -f $(TARGETLIB) ../../../testbin
-	@echo copy the lib to $(OUTDIR).
-
-clean:
-	rm -f $(OBJS) $(TARGETLIB)
-
-distclean: clean
-	rm -f $(DEPS) TAGS
-
+NASM = 1
+NAME      = libwelsvp
+
+OUTDIR    = ../../../bin/linux
+BINDIR    = ../../bin
+OBJDIR    = ../../obj
+SRCDIRS   = ../../src/asm \
+            ../../src/common \
+            ../../src/adaptivequantization \
+            ../../src/backgounddetection \
+            ../../src/denoise \
+            ../../src/downsample \
+            ../../src/scenechangedetection \
+            ../../src/vaacalc \
+            ../../src/complexityanalysis
+SRCDIRS  += ../../src/imagerotate
+
+
+TARGETLIB =  $(BINDIR)/$(NAME).so
+
+CC        = $(shell which gcc)
+AS        = $(shell which nasm)
+GCC       = gcc -m32
+
+CPPFLAGS  = -Wall -g -O3
+ifeq ($(NASM), 1)
+CPPFLAGS += -DX86_ASM
+endif
+ASMFLAGS  = -f elf -DNOPREFIX  -I ../../src/asm/
+LDFLAGS   = -lstdc++ -ldl
+
+SRCEXTS  = .cpp
+ifeq ($(NASM), 1)
+SRCEXTS += .asm
+endif
+HDREXTS  = .h
+SOURCES  = $(foreach d,$(SRCDIRS),$(wildcard $(addprefix $(d)/*,$(SRCEXTS))))
+HEADERS  = $(foreach d,$(SRCDIRS),$(wildcard $(addprefix $(d)/*,$(HDREXTS))))
+SRC_CPP  = $(filter %.cpp,$(SOURCES))
+SRC_ASM  = $(filter %.asm,$(SOURCES))
+OBJS     = $(addsuffix .o, $(basename $(SOURCES)))
+DEPS     = $(OBJS:.o=.d)
+
+DEP_OPT  = $(shell if `$(CC) --version | grep "GCC" >/dev/null`; then \
+                  echo "-MM -MP"; else echo "-M"; fi )
+DEPEND_cpp.d  = $(subst -g ,,$(CC) $(DEP_OPT) $(CPPFLAGS))
+DEPEND_asm.d  = $(subst -g ,,$(AS) $(DEP_OPT) $(ASMFLAGS))
+COMPILE.cpp   = $(GCC) $(CPPFLAGS) -c
+COMPILE.asm   = $(AS)  $(ASMFLAGS)
+LINK          = $(GCC) $(LDFLAGS)
+
+.PHONY: all objs tags ctags clean distclean
+
+.SUFFIXES:
+
+all: $(TARGETLIB)
+
+%.d:%.cpp
+	@echo -n $(dir $<) > $@
+	@$(DEPEND_cpp.d) $< >> $@
+
+%.d:%.asm
+	@echo -n $(dir $<) > $@
+	@$(DEPEND_asm.d) $< >> $@
+
+objs:$(OBJS)
+
+%.o:%.cpp
+	$(COMPILE.cpp) $< -o $@
+
+%.o:%.asm
+	$(COMPILE.asm) $< -o $@
+
+tags: $(HEADERS) $(SOURCES)
+	etags $(HEADERS) $(SOURCES)
+
+ctags: $(HEADERS) $(SOURCES)
+	ctags $(HEADERS) $(SOURCES)
+
+$(TARGETLIB):$(OBJS)
+	@if test ! -d $(BINDIR) ; then mkdir -p $(BINDIR) ; fi
+	$(LINK) $(OBJS) -shared -Wl,-Bsymbolic -o $@
+	@echo produce the lib to $(TARGETLIB).
+	@if test ! -d $(OUTDIR) ; then mkdir -p $(OUTDIR) ; fi
+	@cp -f $(TARGETLIB) $(OUTDIR)
+	@cp -f $(TARGETLIB) ../../../testbin
+	@echo copy the lib to $(OUTDIR).
+
+clean:
+	rm -f $(OBJS) $(TARGETLIB)
+
+distclean: clean
+	rm -f $(DEPS) TAGS
+
--- a/processing/src/asm/asm_inc.asm
+++ b/processing/src/asm/asm_inc.asm
@ -43,7 +43,7 @@
 ; Options, for DEBUG
 ;***********************************************************************

-%if 1 
+%if 1
 	%define MOVDQ movdqa
 %else
 	%define MOVDQ movdqu
@ -58,7 +58,7 @@
 BITS 32

 ;***********************************************************************
-; Macros 
+; Macros
 ;***********************************************************************

 %macro WELS_EXTERN 1
@ -74,7 +74,7 @@ BITS 32
 	pxor        %2, %2
    psubw       %2, %1
    pmaxsw      %1, %2
-%endmacro 	
+%endmacro

 %macro MMX_XSwap  4
    movq		%4, %2
@ -105,7 +105,7 @@ BITS 32
    SSE2_XSawp qdq, %5, %2, %3
 %endmacro

-;in: xmm0, xmm1, xmm2, xmm3  pOut:  xmm0, xmm1, xmm3, xmm4 
+;in: xmm0, xmm1, xmm2, xmm3  pOut:  xmm0, xmm1, xmm3, xmm4
 %macro SSE2_TransTwo4x4W 5
    SSE2_XSawp wd,  %1, %2, %5
    SSE2_XSawp wd,  %3, %4, %2
@ -125,26 +125,26 @@ BITS 32
 	movdqa	%6, %9
 	movdqa	%9, %4
 	SSE2_XSawp bw,  %7, %6, %4
-	
-	SSE2_XSawp wd,  %1, %3, %6	
+
+	SSE2_XSawp wd,  %1, %3, %6
 	SSE2_XSawp wd,  %8, %2, %3
 	SSE2_XSawp wd,  %5, %7, %2
 	movdqa	%7, %9
-	movdqa	%9, %3	
+	movdqa	%9, %3
 	SSE2_XSawp wd,  %7, %4, %3
-	
-	SSE2_XSawp dq,  %1, %5, %4	
+
+	SSE2_XSawp dq,  %1, %5, %4
 	SSE2_XSawp dq,  %6, %2, %5
 	SSE2_XSawp dq,  %8, %7, %2
 	movdqa	%7, %9
-	movdqa	%9, %5		
+	movdqa	%9, %5
 	SSE2_XSawp dq,  %7, %3, %5
-	
+
 	SSE2_XSawp qdq,  %1, %8, %3
 	SSE2_XSawp qdq,  %4, %2, %8
 	SSE2_XSawp qdq,  %6, %7, %2
 	movdqa	%7, %9
-	movdqa	%9, %1		
+	movdqa	%9, %1
 	SSE2_XSawp qdq,  %7, %5, %1
 	movdqa	%5, %9
 %endmacro
@ -170,9 +170,9 @@ BITS 32
 %macro butterfly_1to16_sse	3	; xmm? for dst, xmm? for tmp, one byte for pSrc [generic register name: a/b/c/d]
 	mov %3h, %3l
 	movd %1, e%3x		; i.e, 1% = eax (=b0)
-	pshuflw %2, %1, 00h	; ..., b0 b0 b0 b0 b0 b0 b0 b0	
-	pshufd %1, %2, 00h	; b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0	
-%endmacro  
+	pshuflw %2, %1, 00h	; ..., b0 b0 b0 b0 b0 b0 b0 b0
+	pshufd %1, %2, 00h	; b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0
+%endmacro

 ;copy a dw into a xmm for 8 times
 %macro  SSE2_Copy8Times 2
--- a/processing/src/asm/cpuid.asm
+++ b/processing/src/asm/cpuid.asm
@ -84,12 +84,12 @@ ALIGN 16
 ;   void WelsCPUId( int32_t uiIndex, int32_t *pFeatureA, int32_t *pFeatureB, int32_t *pFeatureC, int32_t *pFeatureD )
 ;****************************************************************************************************
 WelsCPUId:
-	push	ebx	
+	push	ebx
 	push	edi
-	
+
 	mov     eax, [esp+12]	; operating index
    cpuid					; cpuid
-	
+
 	; processing various information return
 	mov     edi, [esp+16]
    mov     [edi], eax
@ -100,10 +100,10 @@ WelsCPUId:
    mov     edi, [esp+28]
    mov     [edi], edx

-	pop		edi	
+	pop		edi
    pop     ebx
 	ret
-	
+
 WELS_EXTERN WelsCPUSupportAVX
 ; need call after cpuid=1 and eax, ecx flag got then
 ALIGN 16
@ -139,7 +139,7 @@ ALIGN 16
 WelsCPUSupportFMA:
 	mov eax, [esp+4]
 	mov ecx, [esp+8]
-	
+
 	; refer to detection of FMA addressed in INTEL AVX manual document
 	and ecx, 018001000H
 	cmp ecx, 018001000H		; check OSXSAVE, AVX, FMA feature flags
@ -153,7 +153,7 @@ WelsCPUSupportFMA:
 	mov eax, 1
 	ret
 fma_not_supported:
-	mov eax, 0	
+	mov eax, 0
 	ret

 WELS_EXTERN WelsEmms
--- a/processing/src/asm/denoisefilter.asm
+++ b/processing/src/asm/denoisefilter.asm
@ -1,263 +1,263 @@
-;*!
-;* \copy
-;*     Copyright (c)  2010-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  predenoise.asm
-;*
-;*  Abstract
-;*      denoise for SVC2.1
-;*  History
-;*      4/13/2010 Created
-;*      7/30/2010 Modified
-;*
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-
-;***********************************************************************
-; Constant
-;***********************************************************************
-SECTION .rodata align=16
-
-sse2_32 times 8 dw 32
-sse2_20 times 8 dw 20
-
-
-BITS 32
-;***********************************************************************
-; Code
-;***********************************************************************
-SECTION .text
-	
-%macro	WEIGHT_LINE	9
-		movq		%2,	%9
-		punpcklbw	%2,	%7
-		movdqa		%8,	%2
-		
-		movdqa		%1,	%6
-		psubusb		%1,	%8
-		psubusb		%8,	%6
-		por			%8,	%1		; ABS(curPixel - centerPixel);
-		
-		movdqa		%1,	%3
-		psubusb		%1,	%8
-
-		pmullw		%1,	%1
-		psrlw		%1,	5
-		pmullw		%2,	%1		
-		paddusw		%4,	%1
-		paddusw		%5,	%2	
-%endmacro
-
-%macro	WEIGHT_LINE1_UV	4
-		movdqa		%2,	%1
-		punpcklbw	%2,	%4
-		paddw		%3,	%2
-
-		movdqa		%2,	%1
-		psrldq		%2,	1
-		punpcklbw	%2,	%4
-		paddw		%3,	%2
-
-		movdqa		%2,	%1
-		psrldq		%2,	2
-		punpcklbw	%2,	%4
-		psllw		%2,	1
-		paddw		%3,	%2
-		
-		movdqa		%2,	%1
-		psrldq		%2,	3
-		punpcklbw	%2,	%4
-		paddw		%3,	%2
-		
-		movdqa		%2,	%1
-		psrldq		%2,	4
-		punpcklbw	%2,	%4
-		paddw		%3,	%2
-%endmacro
-
-%macro	WEIGHT_LINE2_UV	4
-		movdqa		%2,	%1
-		punpcklbw	%2,	%4
-		paddw		%3,	%2
-
-		movdqa		%2,	%1
-		psrldq		%2,	1
-		punpcklbw	%2,	%4
-		psllw		%2,	1
-		paddw		%3,	%2
-
-		movdqa		%2,	%1
-		psrldq		%2,	2
-		punpcklbw	%2,	%4
-		psllw		%2,	2
-		paddw		%3,	%2
-		
-		movdqa		%2,	%1
-		psrldq		%2,	3
-		punpcklbw	%2,	%4
-		psllw		%2,	1
-		paddw		%3,	%2
-		
-		movdqa		%2,	%1
-		psrldq		%2,	4
-		punpcklbw	%2,	%4
-		paddw		%3,	%2
-%endmacro
-
-%macro	WEIGHT_LINE3_UV	4
-		movdqa		%2,	%1
-		punpcklbw	%2,	%4
-		psllw		%2,	1
-		paddw		%3,	%2
-
-		movdqa		%2,	%1
-		psrldq		%2,	1
-		punpcklbw	%2,	%4
-		psllw		%2,	2
-		paddw		%3,	%2
-
-		movdqa		%2,	%1
-		psrldq		%2,	2
-		punpcklbw	%2,	%4
-		pmullw		%2,	[sse2_20]
-		paddw		%3,	%2
-		
-		movdqa		%2,	%1
-		psrldq		%2,	3
-		punpcklbw	%2,	%4
-		psllw		%2,	2
-		paddw		%3,	%2
-		
-		movdqa		%2,	%1
-		psrldq		%2,	4
-		punpcklbw	%2,	%4
-		psllw		%2,	1
-		paddw		%3,	%2
-%endmacro
-
-ALIGN 16
-WELS_EXTERN BilateralLumaFilter8_sse2
-;***********************************************************************
-;  BilateralLumaFilter8_sse2(uint8_t *pixels, int stride);
-;***********************************************************************
-;	1	2	3
-;	4	0	5
-;	6	7	8
-;	0:	the center point
-%define		pushsize	4
-%define		pixel		esp + pushsize + 4
-%define		stride		esp + pushsize + 8
-BilateralLumaFilter8_sse2:
-		push		ebx
-		
-		pxor		xmm7,	xmm7
-		mov			eax,	[pixel]
-		mov			ebx,	eax
-		movq		xmm6,	[eax]
-		punpcklbw	xmm6,	xmm7
-		movdqa		xmm3,	[sse2_32]
-		pxor		xmm4,	xmm4		; nTotWeight
-		pxor		xmm5,	xmm5		; nSum
-		
-		dec			eax
-		mov			ecx,	[stride]
-		
-		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax]			; pixel 4
-		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax + 2]		; pixel 5
-		
-		sub			eax,	ecx
-		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax]			; pixel 1
-		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax + 1]		; pixel 2
-		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax + 2]		; pixel 3
-		
-		lea			eax,	[eax + ecx * 2]
-		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax]			; pixel 6
-		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax + 1]		; pixel 7
-		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax + 2]		; pixel 8
-		
-		pcmpeqw		xmm0,	xmm0
-		psrlw		xmm0,	15
-		psllw		xmm0,	8
-		psubusw		xmm0,	xmm4
-		pmullw		xmm0,	xmm6
-		paddusw		xmm5,	xmm0
-		psrlw		xmm5,	8
-		packuswb	xmm5,	xmm5
-		movq		[ebx],	xmm5		
-		
-		pop ebx
-		ret	
-
-WELS_EXTERN WaverageChromaFilter8_sse2
-;***********************************************************************
-; void		WaverageChromaFilter8_sse2(uint8_t *pixels, int stride);
-;***********************************************************************
-;5x5 filter:
-;1	1	2	1	1
-;1	2	4	2	1
-;2	4	20	4	2
-;1	2	4	2	1
-;1	1	2	1	1
-
-ALIGN 16
-WaverageChromaFilter8_sse2:
-		mov		edx,	[esp + 4]	; pixels
-		mov		ecx,	[esp + 8]	; stride
-		
-		mov		eax,	ecx
-		add		eax,	eax
-		sub		edx,	eax			; pixels - 2 * stride
-		sub		edx,	2
-			
-		pxor	xmm0,	xmm0	
-		pxor	xmm3,	xmm3
-	
-		movdqu		xmm1,	[edx]
-		WEIGHT_LINE1_UV	xmm1,	xmm2,	xmm3,	xmm0
-		
-		movdqu		xmm1,	[edx + ecx]
-		WEIGHT_LINE2_UV	xmm1,	xmm2,	xmm3,	xmm0	
-		
-		add		edx,	eax	
-		movdqu		xmm1,	[edx]
-		WEIGHT_LINE3_UV	xmm1,	xmm2,	xmm3,	xmm0
-		
-		movdqu		xmm1,	[edx + ecx]
-		WEIGHT_LINE2_UV	xmm1,	xmm2,	xmm3,	xmm0	
-		
-		movdqu		xmm1,	[edx + ecx * 2]
-		WEIGHT_LINE1_UV	xmm1,	xmm2,	xmm3,	xmm0		
-	
-		psrlw		xmm3,		6
-		packuswb	xmm3,		xmm3
-		movq		[edx + 2],		xmm3			
-
-		ret	
+;*!
+;* \copy
+;*     Copyright (c)  2010-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  predenoise.asm
+;*
+;*  Abstract
+;*      denoise for SVC2.1
+;*  History
+;*      4/13/2010 Created
+;*      7/30/2010 Modified
+;*
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+
+;***********************************************************************
+; Constant
+;***********************************************************************
+SECTION .rodata align=16
+
+sse2_32 times 8 dw 32
+sse2_20 times 8 dw 20
+
+
+BITS 32
+;***********************************************************************
+; Code
+;***********************************************************************
+SECTION .text
+
+%macro	WEIGHT_LINE	9
+		movq		%2,	%9
+		punpcklbw	%2,	%7
+		movdqa		%8,	%2
+
+		movdqa		%1,	%6
+		psubusb		%1,	%8
+		psubusb		%8,	%6
+		por			%8,	%1		; ABS(curPixel - centerPixel);
+
+		movdqa		%1,	%3
+		psubusb		%1,	%8
+
+		pmullw		%1,	%1
+		psrlw		%1,	5
+		pmullw		%2,	%1
+		paddusw		%4,	%1
+		paddusw		%5,	%2
+%endmacro
+
+%macro	WEIGHT_LINE1_UV	4
+		movdqa		%2,	%1
+		punpcklbw	%2,	%4
+		paddw		%3,	%2
+
+		movdqa		%2,	%1
+		psrldq		%2,	1
+		punpcklbw	%2,	%4
+		paddw		%3,	%2
+
+		movdqa		%2,	%1
+		psrldq		%2,	2
+		punpcklbw	%2,	%4
+		psllw		%2,	1
+		paddw		%3,	%2
+
+		movdqa		%2,	%1
+		psrldq		%2,	3
+		punpcklbw	%2,	%4
+		paddw		%3,	%2
+
+		movdqa		%2,	%1
+		psrldq		%2,	4
+		punpcklbw	%2,	%4
+		paddw		%3,	%2
+%endmacro
+
+%macro	WEIGHT_LINE2_UV	4
+		movdqa		%2,	%1
+		punpcklbw	%2,	%4
+		paddw		%3,	%2
+
+		movdqa		%2,	%1
+		psrldq		%2,	1
+		punpcklbw	%2,	%4
+		psllw		%2,	1
+		paddw		%3,	%2
+
+		movdqa		%2,	%1
+		psrldq		%2,	2
+		punpcklbw	%2,	%4
+		psllw		%2,	2
+		paddw		%3,	%2
+
+		movdqa		%2,	%1
+		psrldq		%2,	3
+		punpcklbw	%2,	%4
+		psllw		%2,	1
+		paddw		%3,	%2
+
+		movdqa		%2,	%1
+		psrldq		%2,	4
+		punpcklbw	%2,	%4
+		paddw		%3,	%2
+%endmacro
+
+%macro	WEIGHT_LINE3_UV	4
+		movdqa		%2,	%1
+		punpcklbw	%2,	%4
+		psllw		%2,	1
+		paddw		%3,	%2
+
+		movdqa		%2,	%1
+		psrldq		%2,	1
+		punpcklbw	%2,	%4
+		psllw		%2,	2
+		paddw		%3,	%2
+
+		movdqa		%2,	%1
+		psrldq		%2,	2
+		punpcklbw	%2,	%4
+		pmullw		%2,	[sse2_20]
+		paddw		%3,	%2
+
+		movdqa		%2,	%1
+		psrldq		%2,	3
+		punpcklbw	%2,	%4
+		psllw		%2,	2
+		paddw		%3,	%2
+
+		movdqa		%2,	%1
+		psrldq		%2,	4
+		punpcklbw	%2,	%4
+		psllw		%2,	1
+		paddw		%3,	%2
+%endmacro
+
+ALIGN 16
+WELS_EXTERN BilateralLumaFilter8_sse2
+;***********************************************************************
+;  BilateralLumaFilter8_sse2(uint8_t *pixels, int stride);
+;***********************************************************************
+;	1	2	3
+;	4	0	5
+;	6	7	8
+;	0:	the center point
+%define		pushsize	4
+%define		pixel		esp + pushsize + 4
+%define		stride		esp + pushsize + 8
+BilateralLumaFilter8_sse2:
+		push		ebx
+
+		pxor		xmm7,	xmm7
+		mov			eax,	[pixel]
+		mov			ebx,	eax
+		movq		xmm6,	[eax]
+		punpcklbw	xmm6,	xmm7
+		movdqa		xmm3,	[sse2_32]
+		pxor		xmm4,	xmm4		; nTotWeight
+		pxor		xmm5,	xmm5		; nSum
+
+		dec			eax
+		mov			ecx,	[stride]
+
+		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax]			; pixel 4
+		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax + 2]		; pixel 5
+
+		sub			eax,	ecx
+		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax]			; pixel 1
+		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax + 1]		; pixel 2
+		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax + 2]		; pixel 3
+
+		lea			eax,	[eax + ecx * 2]
+		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax]			; pixel 6
+		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax + 1]		; pixel 7
+		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax + 2]		; pixel 8
+
+		pcmpeqw		xmm0,	xmm0
+		psrlw		xmm0,	15
+		psllw		xmm0,	8
+		psubusw		xmm0,	xmm4
+		pmullw		xmm0,	xmm6
+		paddusw		xmm5,	xmm0
+		psrlw		xmm5,	8
+		packuswb	xmm5,	xmm5
+		movq		[ebx],	xmm5
+
+		pop ebx
+		ret
+
+WELS_EXTERN WaverageChromaFilter8_sse2
+;***********************************************************************
+; void		WaverageChromaFilter8_sse2(uint8_t *pixels, int stride);
+;***********************************************************************
+;5x5 filter:
+;1	1	2	1	1
+;1	2	4	2	1
+;2	4	20	4	2
+;1	2	4	2	1
+;1	1	2	1	1
+
+ALIGN 16
+WaverageChromaFilter8_sse2:
+		mov		edx,	[esp + 4]	; pixels
+		mov		ecx,	[esp + 8]	; stride
+
+		mov		eax,	ecx
+		add		eax,	eax
+		sub		edx,	eax			; pixels - 2 * stride
+		sub		edx,	2
+
+		pxor	xmm0,	xmm0
+		pxor	xmm3,	xmm3
+
+		movdqu		xmm1,	[edx]
+		WEIGHT_LINE1_UV	xmm1,	xmm2,	xmm3,	xmm0
+
+		movdqu		xmm1,	[edx + ecx]
+		WEIGHT_LINE2_UV	xmm1,	xmm2,	xmm3,	xmm0
+
+		add		edx,	eax
+		movdqu		xmm1,	[edx]
+		WEIGHT_LINE3_UV	xmm1,	xmm2,	xmm3,	xmm0
+
+		movdqu		xmm1,	[edx + ecx]
+		WEIGHT_LINE2_UV	xmm1,	xmm2,	xmm3,	xmm0
+
+		movdqu		xmm1,	[edx + ecx * 2]
+		WEIGHT_LINE1_UV	xmm1,	xmm2,	xmm3,	xmm0
+
+		psrlw		xmm3,		6
+		packuswb	xmm3,		xmm3
+		movq		[edx + 2],		xmm3
+
+		ret
--- a/processing/src/asm/downsample_bilinear.asm
+++ b/processing/src/asm/downsample_bilinear.asm
--- a/processing/src/asm/intra_pred.asm
+++ b/processing/src/asm/intra_pred.asm
@ -1,145 +1,145 @@
-;*!
-;* \copy
-;*     Copyright (c)  2009-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  intra_pred.asm
-;*
-;*  Abstract
-;*      sse2 function for intra predict operations
-;*
-;*  History
-;*      18/09/2009 Created
-;*
-;*
-;*************************************************************************/
-%include "../../src/asm/asm_inc.asm"
-
-BITS 32
-;***********************************************************************
-; Local Data (Read Only)
-;***********************************************************************
-
-%ifdef FORMAT_COFF
-SECTION .rodata data
-%else
-SECTION .rodata align=16
-%endif
-
-
-align 16
-mmx_01bytes:		times 16	db 1
-
-;***********************************************************************
-; macros
-;***********************************************************************
-%macro  COPY_16_TIMES 2
-		movdqa		%2,	[%1-16]
-		psrldq		%2,	15
-		pmuludq		%2,	[mmx_01bytes]
-		pshufd		%2,	%2, 0
-%endmacro
-
-%macro  COPY_16_TIMESS 3
-		movdqa		%2,	[%1+%3-16]
-		psrldq		%2,	15
-		pmuludq		%2,	[mmx_01bytes]
-		pshufd		%2,	%2, 0
-%endmacro
-
-;***********************************************************************
-; Code
-;***********************************************************************
-
-SECTION .text
-
-;***********************************************************************
-; void WelsI16x16LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
-;***********************************************************************
-
-%macro SSE2_PRED_H_16X16_TWO_LINE 1
-    lea     eax,	[eax+ecx*2]
-    
-    COPY_16_TIMES eax,	xmm0
-    movdqa  [edx+%1],	xmm0
-    COPY_16_TIMESS eax,	xmm0,	ecx
-    movdqa  [edx+%1+0x10],	xmm0
-%endmacro
-
-WELS_EXTERN WelsI16x16LumaPredH_sse2
-WelsI16x16LumaPredH_sse2:
-    mov     edx, [esp+4]    ; pred
-    mov     eax, [esp+8]	; pRef
-    mov     ecx, [esp+12]   ; stride
-    
-    COPY_16_TIMES eax,	xmm0
-    movdqa  [edx],		xmm0
-    COPY_16_TIMESS eax,	xmm0,	ecx
-    movdqa  [edx+0x10],	xmm0
-    
-	SSE2_PRED_H_16X16_TWO_LINE   0x20 
-	SSE2_PRED_H_16X16_TWO_LINE   0x40
-	SSE2_PRED_H_16X16_TWO_LINE   0x60
-	SSE2_PRED_H_16X16_TWO_LINE   0x80
-	SSE2_PRED_H_16X16_TWO_LINE   0xa0
-	SSE2_PRED_H_16X16_TWO_LINE   0xc0
-	SSE2_PRED_H_16X16_TWO_LINE   0xe0
-   
-    ret
-    
-;***********************************************************************
-; void WelsI16x16LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
-;***********************************************************************
-WELS_EXTERN WelsI16x16LumaPredV_sse2
-WelsI16x16LumaPredV_sse2:
-    mov     edx, [esp+4]    ; pred
-    mov     eax, [esp+8]	; pRef
-    mov     ecx, [esp+12]   ; stride
-    
-    sub     eax, ecx
-    movdqa  xmm0, [eax]
-    
-    movdqa  [edx], xmm0
-    movdqa  [edx+10h], xmm0
-    movdqa  [edx+20h], xmm0
-    movdqa  [edx+30h], xmm0
-    movdqa  [edx+40h], xmm0
-    movdqa  [edx+50h], xmm0
-    movdqa  [edx+60h], xmm0
-    movdqa  [edx+70h], xmm0
-    movdqa  [edx+80h], xmm0
-    movdqa  [edx+90h], xmm0
-    movdqa  [edx+160], xmm0 
-	movdqa  [edx+176], xmm0
-    movdqa  [edx+192], xmm0
-    movdqa  [edx+208], xmm0
-    movdqa  [edx+224], xmm0
-    movdqa  [edx+240], xmm0
-    
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  intra_pred.asm
+;*
+;*  Abstract
+;*      sse2 function for intra predict operations
+;*
+;*  History
+;*      18/09/2009 Created
+;*
+;*
+;*************************************************************************/
+%include "../../src/asm/asm_inc.asm"
+
+BITS 32
+;***********************************************************************
+; Local Data (Read Only)
+;***********************************************************************
+
+%ifdef FORMAT_COFF
+SECTION .rodata data
+%else
+SECTION .rodata align=16
+%endif
+
+
+align 16
+mmx_01bytes:		times 16	db 1
+
+;***********************************************************************
+; macros
+;***********************************************************************
+%macro  COPY_16_TIMES 2	; %1 = address register, %2 = destination xmm register; result: %2 = 16 copies of the byte at [%1-1]
+		movdqa		%2,	[%1-16]	; load the 16 bytes that end just before %1 (movdqa needs [%1-16] 16-byte aligned)
+		psrldq		%2,	15	; shift right by 15 bytes: byte [%1-1] lands in byte 0, everything else becomes 0
+		pmuludq		%2,	[mmx_01bytes]	; low dword * 0x01010101 replicates the byte across that dword
+		pshufd		%2,	%2, 0	; broadcast dword 0 into all four dwords
+%endmacro
+
+%macro  COPY_16_TIMESS 3	; %1 = address register, %2 = destination xmm register, %3 = offset register; result: %2 = 16 copies of the byte at [%1+%3-1]
+		movdqa		%2,	[%1+%3-16]	; load the 16 bytes that end just before %1+%3 (movdqa needs this address 16-byte aligned)
+		psrldq		%2,	15	; shift right by 15 bytes: byte [%1+%3-1] lands in byte 0, everything else becomes 0
+		pmuludq		%2,	[mmx_01bytes]	; low dword * 0x01010101 replicates the byte across that dword
+		pshufd		%2,	%2, 0	; broadcast dword 0 into all four dwords
+%endmacro
+
+;***********************************************************************
+; Code
+;***********************************************************************
+
+SECTION .text
+
+;***********************************************************************
+; void WelsI16x16LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
+;***********************************************************************
+
+%macro SSE2_PRED_H_16X16_TWO_LINE 1	; %1 = byte offset into pred (edx) of the first of two output rows; uses eax = source pointer, ecx = stride, clobbers xmm0
+    lea     eax,	[eax+ecx*2]	; advance the source pointer by two rows before reading
+
+    COPY_16_TIMES eax,	xmm0	; xmm0 = 16 copies of the byte at [eax-1]
+    movdqa  [edx+%1],	xmm0	; first row of the pair (aligned store)
+    COPY_16_TIMESS eax,	xmm0,	ecx	; xmm0 = 16 copies of the byte at [eax+ecx-1]
+    movdqa  [edx+%1+0x10],	xmm0	; second row of the pair, 16 bytes further into pred
+%endmacro
+
+WELS_EXTERN WelsI16x16LumaPredH_sse2
+WelsI16x16LumaPredH_sse2:	; horizontal prediction: row r of pred (16 bytes, rows 16 bytes apart) = 16 copies of the byte at pRef + r*stride - 1
+    mov     edx, [esp+4]    ; pred
+    mov     eax, [esp+8]	; pRef
+    mov     ecx, [esp+12]   ; stride
+
+    COPY_16_TIMES eax,	xmm0	; row 0: broadcast the byte at [pRef-1]
+    movdqa  [edx],		xmm0	; aligned store into pred
+    COPY_16_TIMESS eax,	xmm0,	ecx	; row 1: broadcast the byte at [pRef+stride-1]
+    movdqa  [edx+0x10],	xmm0
+
+	SSE2_PRED_H_16X16_TWO_LINE   0x20	; rows 2-3 (each macro call first advances eax by 2 * stride)
+	SSE2_PRED_H_16X16_TWO_LINE   0x40	; rows 4-5
+	SSE2_PRED_H_16X16_TWO_LINE   0x60	; rows 6-7
+	SSE2_PRED_H_16X16_TWO_LINE   0x80	; rows 8-9
+	SSE2_PRED_H_16X16_TWO_LINE   0xa0	; rows 10-11
+	SSE2_PRED_H_16X16_TWO_LINE   0xc0	; rows 12-13
+	SSE2_PRED_H_16X16_TWO_LINE   0xe0	; rows 14-15
+
+    ret
+
+;***********************************************************************
+; void WelsI16x16LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
+;***********************************************************************
+WELS_EXTERN WelsI16x16LumaPredV_sse2
+WelsI16x16LumaPredV_sse2:	; vertical prediction: copies the 16 bytes at pRef - stride into each of the 16 rows of pred
+    mov     edx, [esp+4]    ; pred
+    mov     eax, [esp+8]	; pRef
+    mov     ecx, [esp+12]   ; stride
+
+    sub     eax, ecx	; eax = pRef - stride (the row directly above pRef)
+    movdqa  xmm0, [eax]	; aligned 16-byte load of that row
+
+    movdqa  [edx], xmm0	; row 0; output rows are 16 bytes apart in pred
+    movdqa  [edx+10h], xmm0
+    movdqa  [edx+20h], xmm0
+    movdqa  [edx+30h], xmm0
+    movdqa  [edx+40h], xmm0
+    movdqa  [edx+50h], xmm0
+    movdqa  [edx+60h], xmm0
+    movdqa  [edx+70h], xmm0
+    movdqa  [edx+80h], xmm0
+    movdqa  [edx+90h], xmm0
+    movdqa  [edx+160], xmm0	; offsets are written in decimal from here on: 160 = 0A0h (row 10)
+	movdqa  [edx+176], xmm0
+    movdqa  [edx+192], xmm0
+    movdqa  [edx+208], xmm0
+    movdqa  [edx+224], xmm0
+    movdqa  [edx+240], xmm0	; 240 = 0F0h (row 15)
+
    ret
--- a/processing/src/asm/sad.asm
+++ b/processing/src/asm/sad.asm
@ -1,79 +1,79 @@
-;*!
-;* \copy
-;*     Copyright (c)  2009-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  pixel_sse2.asm
-;*
-;*  Abstract
-;*      WelsSampleSad8x8_sse21
-;*
-;*  History
-;*      8/5/2009 Created
-;*
-;*
-;*************************************************************************/
-
-%include "asm_inc.asm"
-
-BITS 32
-
-;***********************************************************************
-; Macros and other preprocessor constants
-;***********************************************************************
-
-%macro SAD_8x4 0
-	movq   xmm0,   [eax]
-	movq   xmm1,   [eax+ebx]
-	lea    eax,    [eax+2*ebx]
-	movhps xmm0,   [eax]
-	movhps xmm1,   [eax+ebx]
-
-	movq   xmm2,   [ecx]
-	movq   xmm3,   [ecx+edx]
-	lea    ecx,    [ecx+2*edx]
-	movhps xmm2,   [ecx]
-	movhps xmm3,   [ecx+edx]
-	psadbw xmm0,   xmm2
-	psadbw xmm1,   xmm3
-	paddw  xmm6,   xmm0
-	paddw  xmm6,   xmm1
-%endmacro
-
-
-  
-%macro CACHE_SPLIT_CHECK 3 ; address, width, cacheline
-and    %1,  0x1f|(%3>>1)
-cmp    %1,  (32-%2)|(%3>>1)
-%endmacro
-
-
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  pixel_sse2.asm
+;*
+;*  Abstract
+;*      WelsSampleSad8x8_sse21
+;*
+;*  History
+;*      8/5/2009 Created
+;*
+;*
+;*************************************************************************/
+
+%include "asm_inc.asm"
+
+BITS 32
+
+;***********************************************************************
+; Macros and other preprocessor constants
+;***********************************************************************
+
+%macro SAD_8x4 0	; adds the SAD of two 8x4 byte blocks into xmm6; eax/ebx = pointer/stride of block A, ecx/edx = pointer/stride of block B; clobbers xmm0-xmm3
+	movq   xmm0,   [eax]	; A row 0 -> low 8 bytes of xmm0
+	movq   xmm1,   [eax+ebx]	; A row 1 -> low 8 bytes of xmm1
+	lea    eax,    [eax+2*ebx]	; advance A by two rows (eax is left pointing at row 2 on exit)
+	movhps xmm0,   [eax]	; A row 2 -> high 8 bytes of xmm0
+	movhps xmm1,   [eax+ebx]	; A row 3 -> high 8 bytes of xmm1
+
+	movq   xmm2,   [ecx]	; B row 0
+	movq   xmm3,   [ecx+edx]	; B row 1
+	lea    ecx,    [ecx+2*edx]	; advance B by two rows (ecx is left pointing at row 2 on exit)
+	movhps xmm2,   [ecx]	; B row 2
+	movhps xmm3,   [ecx+edx]	; B row 3
+	psadbw xmm0,   xmm2	; SAD of rows 0 and 2: one partial sum per 64-bit half
+	psadbw xmm1,   xmm3	; SAD of rows 1 and 3
+	paddw  xmm6,   xmm0	; accumulate; the two 64-bit halves of xmm6 still have to be added together by the user of the macro
+	paddw  xmm6,   xmm1
+%endmacro
+
+
+
+%macro CACHE_SPLIT_CHECK 3 ; address, width, cacheline -- overwrites %1 and leaves only the flags of the cmp as result; presumably followed by a branch that detects a width-byte access straddling a cache line (no caller visible here - TODO confirm)
+and    %1,  0x1f|(%3>>1)	; mask the address with 0x1f OR'ed with cacheline/2 (0x3f when cacheline = 64)
+cmp    %1,  (32-%2)|(%3>>1)	; compare the masked offset against (32 - width) OR'ed with cacheline/2
+%endmacro
+
+
 %macro SSE2_GetSad8x4 0
 	movq   xmm0,   [eax]
 	movq   xmm1,   [eax+ebx]
@ -90,12 +90,12 @@ cmp    %1,  (32-%2)|(%3>>1)
 	psadbw xmm1,   xmm3
 	paddw  xmm6,   xmm0
 	paddw  xmm6,   xmm1
-%endmacro
+%endmacro


-;***********************************************************************
-; Code
-;***********************************************************************
+;***********************************************************************
+; Code
+;***********************************************************************
 SECTION .text

 WELS_EXTERN WelsSampleSad8x8_sse21
@ -108,15 +108,15 @@ WelsSampleSad8x8_sse21:
 	push   edi
 	mov    eax,    [esp+12]
 	mov    ebx,    [esp+16]
-    
+
    pxor   xmm7,   xmm7
-    
+
    mov    edi,    ecx
    and    edi,    0x07
-    sub    ecx,    edi   
+    sub    ecx,    edi
    mov    edx,    8
    sub    edx,    edi
-    
+
    shl    edi,    3
    shl    edx,    3
    movd   xmm5,   edi
@ -124,10 +124,10 @@ WelsSampleSad8x8_sse21:
 	mov    edi,    8
 	add    edi,    ecx
    mov    edx,    [esp+24]
-    
+
    movq   xmm0,   [eax]
 	movhps xmm0,   [eax+ebx]
-		
+
 	movq   xmm1,   [ecx]
 	movq   xmm2,   [edi]
 	movhps xmm1,   [ecx+edx]
@ -135,35 +135,17 @@ WelsSampleSad8x8_sse21:
 	psrlq  xmm1,   xmm5
 	psllq  xmm2,   xmm6
 	por    xmm1,   xmm2
-	
-	psadbw xmm0,   xmm1
-	paddw  xmm7,   xmm0
-	
-	lea    eax,    [eax+2*ebx]
-	lea    ecx,    [ecx+2*edx]
-	lea    edi,    [edi+2*edx]
-	 
-    movq   xmm0,   [eax]
-	movhps xmm0,   [eax+ebx]
-		
-	movq   xmm1,   [ecx]
-	movq   xmm2,   [edi]
-	movhps xmm1,   [ecx+edx]
-	movhps xmm2,   [edi+edx]
-	psrlq  xmm1,   xmm5
-	psllq  xmm2,   xmm6
-	por    xmm1,   xmm2
-	
+
 	psadbw xmm0,   xmm1
 	paddw  xmm7,   xmm0

 	lea    eax,    [eax+2*ebx]
 	lea    ecx,    [ecx+2*edx]
 	lea    edi,    [edi+2*edx]
-	 
+
    movq   xmm0,   [eax]
 	movhps xmm0,   [eax+ebx]
-		
+
 	movq   xmm1,   [ecx]
 	movq   xmm2,   [edi]
 	movhps xmm1,   [ecx+edx]
@ -171,17 +153,17 @@ WelsSampleSad8x8_sse21:
 	psrlq  xmm1,   xmm5
 	psllq  xmm2,   xmm6
 	por    xmm1,   xmm2
-	
+
 	psadbw xmm0,   xmm1
 	paddw  xmm7,   xmm0
-	
+
 	lea    eax,    [eax+2*ebx]
 	lea    ecx,    [ecx+2*edx]
 	lea    edi,    [edi+2*edx]
-	 
+
    movq   xmm0,   [eax]
 	movhps xmm0,   [eax+ebx]
-		
+
 	movq   xmm1,   [ecx]
 	movq   xmm2,   [edi]
 	movhps xmm1,   [ecx+edx]
@ -189,10 +171,28 @@ WelsSampleSad8x8_sse21:
 	psrlq  xmm1,   xmm5
 	psllq  xmm2,   xmm6
 	por    xmm1,   xmm2
-	
+
 	psadbw xmm0,   xmm1
 	paddw  xmm7,   xmm0
-	
+
+	lea    eax,    [eax+2*ebx]
+	lea    ecx,    [ecx+2*edx]
+	lea    edi,    [edi+2*edx]
+
+    movq   xmm0,   [eax]
+	movhps xmm0,   [eax+ebx]
+
+	movq   xmm1,   [ecx]
+	movq   xmm2,   [edi]
+	movhps xmm1,   [ecx+edx]
+	movhps xmm2,   [edi+edx]
+	psrlq  xmm1,   xmm5
+	psllq  xmm2,   xmm6
+	por    xmm1,   xmm2
+
+	psadbw xmm0,   xmm1
+	paddw  xmm7,   xmm0
+
    movhlps    xmm0, xmm7
 	paddw      xmm0, xmm7
 	movd       eax,  xmm0
@ -202,12 +202,12 @@ WelsSampleSad8x8_sse21:
    push   ebx
    mov    eax,    [esp+8]
 	mov    ebx,    [esp+12]
-	mov    edx,    [esp+20]    
+	mov    edx,    [esp+20]
 	pxor   xmm6,   xmm6
 	SSE2_GetSad8x4
    lea    eax,    [eax+2*ebx]
 	lea    ecx,    [ecx+2*edx]
-    SSE2_GetSad8x4    
+    SSE2_GetSad8x4
    movhlps    xmm0, xmm6
 	paddw      xmm0, xmm6
 	movd       eax,  xmm0
--- a/processing/src/asm/vaa.asm
+++ b/processing/src/asm/vaa.asm
--- a/processing/src/common/WelsVP.def
+++ b/processing/src/common/WelsVP.def
@ -1,36 +1,36 @@
-;*!
-;* \copy
-;*     Copyright (c)  2011-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-
-LIBRARY		    welsvp.dll
-EXPORTS
-                CreateVpInterface    PRIVATE
-                DestroyVpInterface   PRIVATE      
+;*!
+;* \copy
+;*     Copyright (c)  2011-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+
+LIBRARY		    welsvp.dll
+EXPORTS
+                CreateVpInterface    PRIVATE
+                DestroyVpInterface   PRIVATE
--- a/processing/src/common/WelsVP.rc
+++ b/processing/src/common/WelsVP.rc
@ -27,18 +27,18 @@ LANGUAGE LANG_CHINESE, SUBLANG_CHINESE_SIMPLIFIED
 // TEXTINCLUDE
 //

-1 TEXTINCLUDE 
+1 TEXTINCLUDE
 BEGIN
    "resource.h\0"
 END

-2 TEXTINCLUDE 
+2 TEXTINCLUDE
 BEGIN
    "#include ""windows.h""\r\n"
    "\0"
 END

-3 TEXTINCLUDE 
+3 TEXTINCLUDE
 BEGIN
    "\r\n"
    "\0"
--- a/testbin/AutoBuild_Windows_VS2008.bat
+++ b/testbin/AutoBuild_Windows_VS2008.bat
@ -23,7 +23,7 @@ rem ************************************************
 rem call VP build
 echo "Welsvp Building....."
 cd %VPProjectDir%
-rem vcclean 
+rem vcclean
 %VCBUILDEXE% WelsVP_2008.vcproj


@ -33,7 +33,7 @@ echo "WelsEncoder Building....."

 cd %CurDir%
 cd %EncoderProjectDir%
-rem vcclean 
+rem vcclean
 %VCBUILDEXE% WelsEncCore.vcproj
 %VCBUILDEXE% WelsEncPlus.vcproj
 %VCBUILDEXE% encConsole.vcproj
@ -44,7 +44,7 @@ echo "WelsDecoder Building....."

 cd %CurDir%
 cd %DecoderProjectDir%
-rem vcclean 
+rem vcclean
 %VCBUILDEXE% WelsDecCore.vcproj
 %VCBUILDEXE% WelsDecPlus.vcproj
 %VCBUILDEXE% decConsole.vcproj
--- a/testbin/AutoBuild_Windows_VS2010.bat
+++ b/testbin/AutoBuild_Windows_VS2010.bat
@ -36,7 +36,7 @@ echo "WelsEncoder Building....."
 cd %CurDir%
 cd %EncoderProjectDir%
 echo current directory is %EncoderProjectDir%
-rem vcclean 
+rem vcclean

 echo %VCMSBUILDEXE_RELEASE% WelsEncoder_2010.sln
 %VCMSBUILDEXE_RELEASE% WelsEncoder_2010.sln
@ -49,7 +49,7 @@ echo "WelsDecoder Building....."
 cd %CurDir%
 cd %DecoderProjectDir%
 echo current directory is %DecoderProjectDir%
-rem vcclean 
+rem vcclean

 echo %VCMSBUILDEXE_RELEASE% WelsDecoder_2010.sln

--- a/testbin/AutoBuild_Windows_VS2012.bat
+++ b/testbin/AutoBuild_Windows_VS2012.bat
@ -36,7 +36,7 @@ echo "WelsEncoder Building....."
 cd %CurDir%
 cd %EncoderProjectDir%
 echo current directory is %EncoderProjectDir%
-rem vcclean 
+rem vcclean

 echo %VCMSBUILDEXE_RELEASE% WelsEncoder_2012.sln
 %VCMSBUILDEXE_RELEASE% WelsEncoder_2012.sln
@ -49,7 +49,7 @@ echo "WelsDecoder Building....."
 cd %CurDir%
 cd %DecoderProjectDir%
 echo current directory is %DecoderProjectDir%
-rem vcclean 
+rem vcclean

 echo %VCMSBUILDEXE_RELEASE% WelsDecoder_2012.sln

--- a/testbin/layer2.cfg
+++ b/testbin/layer2.cfg
@ -1,39 +1,39 @@
-# Layer Configuration File
-
-
-#============================== INPUT / OUTPUT ==============================
-SourceWidth     320                     # Input  frame width
-SourceHeight    192                    # Input  frame height
-FrameRateIn     12                      # Input  frame rate [Hz]
-FrameRateOut    12                     # Output frame rate [Hz]
-InputFile       CiscoVT2people_320x192_12fps.yuv # Input  file
-ReconFile       rec_layer2.yuv          # Reconstructed file
-
-#============================== CODING ==============================
-ProfileIdc      66          # value of profile_idc (or 0 for auto detection)
-
-InitialQP       24			# Quantization parameters for base quality layer
-#================================ RATE CONTROL ===============================
-SpatialBitrate		600		# Unit: kbps, controled by DisableRC also
-#============================== MultiSlice Slice Argument ==============================
-# for S/M Slice(s) mode settings
-SliceMode			0		# 0: sigle slice mode; >0: multiple slices mode, see below;
-SliceSize			1500
-SliceNum			1		# multiple slices number specified
-
-SlicesAssign0		960		# count number of MBs in slice #0
-SlicesAssign1		0		# count number of MBs in slice #1
-SlicesAssign2		0		# count number of MBs in slice #2
-SlicesAssign3		0		# count number of MBs in slice #3 -- seting here is for better testing
-SlicesAssign4		0		# count number of MBs in slice #4
-SlicesAssign5		0		# count number of MBs in slice #5
-SlicesAssign6		0		# count number of MBs in slice #6
-SlicesAssign7		0		# count number of MBs in slice #7
-
-### DESIGN OF SLICE MODE ####
-# 0 SM_SINGLE_SLICE			| SliceNum==1
-# 1 SM_FIXEDSLCNUM_SLICE	| according to SliceNum			| Enabled dynamic slicing for multi-thread
-# 2 SM_RASTER_SLICE			| according to SlicesAssign		| Need input of MB numbers each slice. In addition, if other constraint in slice_argument is presented, need to follow the constraints. Typically if MB num and slice size are both constrained, re-encoding may be involved.
-# 3 SM_ROWMB_SLICE			| according to PictureMBHeight	| Typical of single row of mbs each slice?+ slice size constraint which including re-encoding
-# 4 SM_DYN_SLICE			| according to SliceSize		| Dynamic slicing (have no idea about slice_nums until encoding current frame)
-
+# Layer Configuration File
+
+
+#============================== INPUT / OUTPUT ==============================
+SourceWidth     320                     # Input  frame width
+SourceHeight    192                    # Input  frame height
+FrameRateIn     12                      # Input  frame rate [Hz]
+FrameRateOut    12                     # Output frame rate [Hz]
+InputFile       CiscoVT2people_320x192_12fps.yuv # Input  file
+ReconFile       rec_layer2.yuv          # Reconstructed file
+
+#============================== CODING ==============================
+ProfileIdc      66          # value of profile_idc (or 0 for auto detection)
+
+InitialQP       24			# Quantization parameters for base quality layer
+#================================ RATE CONTROL ===============================
+SpatialBitrate		600		# Unit: kbps, controlled by DisableRC also
+#============================== MultiSlice Slice Argument ==============================
+# for S/M Slice(s) mode settings
+SliceMode			0		# 0: single slice mode; >0: multiple slices mode, see below;
+SliceSize			1500
+SliceNum			1		# multiple slices number specified
+
+SlicesAssign0		960		# count number of MBs in slice #0
+SlicesAssign1		0		# count number of MBs in slice #1
+SlicesAssign2		0		# count number of MBs in slice #2
+SlicesAssign3		0		# count number of MBs in slice #3 -- setting here is for better testing
+SlicesAssign4		0		# count number of MBs in slice #4
+SlicesAssign5		0		# count number of MBs in slice #5
+SlicesAssign6		0		# count number of MBs in slice #6
+SlicesAssign7		0		# count number of MBs in slice #7
+
+### DESIGN OF SLICE MODE ####
+# 0 SM_SINGLE_SLICE			| SliceNum==1
+# 1 SM_FIXEDSLCNUM_SLICE	| according to SliceNum			| Enabled dynamic slicing for multi-thread
+# 2 SM_RASTER_SLICE			| according to SlicesAssign		| Need input of MB numbers each slice. In addition, if other constraint in slice_argument is presented, need to follow the constraints. Typically if MB num and slice size are both constrained, re-encoding may be involved.
+# 3 SM_ROWMB_SLICE			| according to PictureMBHeight	| Typical of single row of mbs each slice?+ slice size constraint which including re-encoding
+# 4 SM_DYN_SLICE			| according to SliceSize		| Dynamic slicing (have no idea about slice_nums until encoding current frame)
+
--- a/testbin/layer2_vd.cfg
+++ b/testbin/layer2_vd.cfg
@ -1,39 +1,39 @@
-# Layer Configuration File
-
-
-#============================== INPUT / OUTPUT ==============================
-SourceWidth     320                     # Input  frame width
-SourceHeight    192                    # Input  frame height
-FrameRateIn     12                      # Input  frame rate [Hz]
-FrameRateOut    12                     # Output frame rate [Hz]
-InputFile       CiscoVT2people_320x192_12fps.yuv # Input  file
-ReconFile       rec_layer2.yuv          # Reconstructed file
-
-#============================== CODING ==============================
-ProfileIdc      66          # value of profile_idc (or 0 for auto detection)
-
-InitialQP       24			# Quantization parameters for base quality layer
-#================================ RATE CONTROL ===============================
-SpatialBitrate		600		# Unit: kbps, controled by DisableRC also
-#============================== MultiSlice Slice Argument ==============================
-# for S/M Slice(s) mode settings
-SliceMode			0		# 0: sigle slice mode; >0: multiple slices mode, see below;
-SliceSize			1500
-SliceNum			1		# multiple slices number specified
-
-SlicesAssign0		960		# count number of MBs in slice #0
-SlicesAssign1		0		# count number of MBs in slice #1
-SlicesAssign2		0		# count number of MBs in slice #2
-SlicesAssign3		0		# count number of MBs in slice #3 -- seting here is for better testing
-SlicesAssign4		0		# count number of MBs in slice #4
-SlicesAssign5		0		# count number of MBs in slice #5
-SlicesAssign6		0		# count number of MBs in slice #6
-SlicesAssign7		0		# count number of MBs in slice #7
-
-### DESIGN OF SLICE MODE, 100804, Sijia ####
-# 0 SM_SINGLE_SLICE			| SliceNum==1
-# 1 SM_FIXEDSLCNUM_SLICE	| according to SliceNum			| Enabled dynamic slicing for multi-thread
-# 2 SM_RASTER_SLICE			| according to SlicesAssign		| Need input of MB numbers each slice. In addition, if other constraint in slice_argument is presented, need to follow the constraints. Typically if MB num and slice size are both constrained, re-encoding may be involved.
-# 3 SM_ROWMB_SLICE			| according to PictureMBHeight	| Specially for TP. Typical of single row of mbs each slice?+ slice size constraint which including re-encoding
-# 4 SM_DYN_SLICE			| according to SliceSize		| Dynamic slicing (have no idea about slice_nums until encoding current frame)
-
+# Layer Configuration File
+
+
+#============================== INPUT / OUTPUT ==============================
+SourceWidth     320                     # Input  frame width
+SourceHeight    192                    # Input  frame height
+FrameRateIn     12                      # Input  frame rate [Hz]
+FrameRateOut    12                     # Output frame rate [Hz]
+InputFile       CiscoVT2people_320x192_12fps.yuv # Input  file
+ReconFile       rec_layer2.yuv          # Reconstructed file
+
+#============================== CODING ==============================
+ProfileIdc      66          # value of profile_idc (or 0 for auto detection)
+
+InitialQP       24			# Quantization parameters for base quality layer
+#================================ RATE CONTROL ===============================
+SpatialBitrate		600		# Unit: kbps, controlled by DisableRC also
+#============================== MultiSlice Slice Argument ==============================
+# for S/M Slice(s) mode settings
+SliceMode			0		# 0: single slice mode; >0: multiple slices mode, see below;
+SliceSize			1500
+SliceNum			1		# multiple slices number specified
+
+SlicesAssign0		960		# count number of MBs in slice #0
+SlicesAssign1		0		# count number of MBs in slice #1
+SlicesAssign2		0		# count number of MBs in slice #2
+SlicesAssign3		0		# count number of MBs in slice #3 -- setting here is for better testing
+SlicesAssign4		0		# count number of MBs in slice #4
+SlicesAssign5		0		# count number of MBs in slice #5
+SlicesAssign6		0		# count number of MBs in slice #6
+SlicesAssign7		0		# count number of MBs in slice #7
+
+### DESIGN OF SLICE MODE, 100804, Sijia ####
+# 0 SM_SINGLE_SLICE			| SliceNum==1
+# 1 SM_FIXEDSLCNUM_SLICE	| according to SliceNum			| Enabled dynamic slicing for multi-thread
+# 2 SM_RASTER_SLICE			| according to SlicesAssign		| Need input of MB numbers each slice. In addition, if other constraint in slice_argument is presented, need to follow the constraints. Typically if MB num and slice size are both constrained, re-encoding may be involved.
+# 3 SM_ROWMB_SLICE			| according to PictureMBHeight	| Specially for TP. Typical of single row of mbs each slice?+ slice size constraint which including re-encoding
+# 4 SM_DYN_SLICE			| according to SliceSize		| Dynamic slicing (have no idea about slice_nums until encoding current frame)
+
--- a/testbin/layer2_vd_rc.cfg
+++ b/testbin/layer2_vd_rc.cfg
@ -1,39 +1,39 @@
-# Layer Configuration File
-
-
-#============================== INPUT / OUTPUT ==============================
-SourceWidth     320                     # Input  frame width
-SourceHeight    192                    # Input  frame height
-FrameRateIn     12                      # Input  frame rate [Hz]
-FrameRateOut    12                     # Output frame rate [Hz]
-InputFile       CiscoVT2people_320x192_12fps.yuv # Input  file
-ReconFile       rec_layer2.yuv          # Reconstructed file
-
-#============================== CODING ==============================
-ProfileIdc      66          # value of profile_idc (or 0 for auto detection)
-
-InitialQP       24			# Quantization parameters for base quality layer
-#================================ RATE CONTROL ===============================
-SpatialBitrate		600		# Unit: kbps, controled by DisableRC also
-#============================== MultiSlice Slice Argument ==============================
-# for S/M Slice(s) mode settings
-SliceMode			0		# 0: sigle slice mode; >0: multiple slices mode, see below;
-SliceSize			1500
-SliceNum			1		# multiple slices number specified
-
-SlicesAssign0		960		# count number of MBs in slice #0
-SlicesAssign1		0		# count number of MBs in slice #1
-SlicesAssign2		0		# count number of MBs in slice #2
-SlicesAssign3		0		# count number of MBs in slice #3 -- seting here is for better testing
-SlicesAssign4		0		# count number of MBs in slice #4
-SlicesAssign5		0		# count number of MBs in slice #5
-SlicesAssign6		0		# count number of MBs in slice #6
-SlicesAssign7		0		# count number of MBs in slice #7
-
-### DESIGN OF SLICE MODE, 100804, Sijia ####
-# 0 SM_SINGLE_SLICE			| SliceNum==1
-# 1 SM_FIXEDSLCNUM_SLICE	| according to SliceNum			| Enabled dynamic slicing for multi-thread
-# 2 SM_RASTER_SLICE			| according to SlicesAssign		| Need input of MB numbers each slice. In addition, if other constraint in slice_argument is presented, need to follow the constraints. Typically if MB num and slice size are both constrained, re-encoding may be involved.
-# 3 SM_ROWMB_SLICE			| according to PictureMBHeight	| Specially for TP. Typical of single row of mbs each slice?+ slice size constraint which including re-encoding
-# 4 SM_DYN_SLICE			| according to SliceSize		| Dynamic slicing (have no idea about slice_nums until encoding current frame)
-
+# Layer Configuration File
+
+
+#============================== INPUT / OUTPUT ==============================
+SourceWidth     320                     # Input  frame width
+SourceHeight    192                    # Input  frame height
+FrameRateIn     12                      # Input  frame rate [Hz]
+FrameRateOut    12                     # Output frame rate [Hz]
+InputFile       CiscoVT2people_320x192_12fps.yuv # Input  file
+ReconFile       rec_layer2.yuv          # Reconstructed file
+
+#============================== CODING ==============================
+ProfileIdc      66          # value of profile_idc (or 0 for auto detection)
+
+InitialQP       24			# Quantization parameters for base quality layer
+#================================ RATE CONTROL ===============================
+SpatialBitrate		600		# Unit: kbps, controlled by DisableRC also
+#============================== MultiSlice Slice Argument ==============================
+# for S/M Slice(s) mode settings
+SliceMode			0		# 0: single slice mode; >0: multiple slices mode, see below;
+SliceSize			1500
+SliceNum			1		# multiple slices number specified
+
+SlicesAssign0		960		# count number of MBs in slice #0
+SlicesAssign1		0		# count number of MBs in slice #1
+SlicesAssign2		0		# count number of MBs in slice #2
+SlicesAssign3		0		# count number of MBs in slice #3 -- setting here is for better testing
+SlicesAssign4		0		# count number of MBs in slice #4
+SlicesAssign5		0		# count number of MBs in slice #5
+SlicesAssign6		0		# count number of MBs in slice #6
+SlicesAssign7		0		# count number of MBs in slice #7
+
+### DESIGN OF SLICE MODE, 100804, Sijia ####
+# 0 SM_SINGLE_SLICE			| SliceNum==1
+# 1 SM_FIXEDSLCNUM_SLICE	| according to SliceNum			| Enabled dynamic slicing for multi-thread
+# 2 SM_RASTER_SLICE			| according to SlicesAssign		| Need input of MB numbers each slice. In addition, if other constraint in slice_argument is presented, need to follow the constraints. Typically if MB num and slice size are both constrained, re-encoding may be involved.
+# 3 SM_ROWMB_SLICE			| according to PictureMBHeight	| Specially for TP. Typically a single row of MBs per slice, plus a slice size constraint which may involve re-encoding
+# 4 SM_DYN_SLICE			| according to SliceSize		| Dynamic slicing (have no idea about slice_nums until encoding current frame)
+
--- a/testbin/welsenc.cfg
+++ b/testbin/welsenc.cfg
@ -1,63 +1,63 @@
-# Cisco Scalable H.264/AVC Extension Encoder Configuration File
-
-#============================== GENERAL ==============================
-OutputFile              test.264               # Bitstream file
-MaxFrameRate            30                     # Maximum frame rate [Hz]
-FramesToBeEncoded       -1                    # Number of frames (at input frame rate)
-
-GOPSize                 4                     # GOP Size (at maximum frame rate), 16
-IntraPeriod            0                    # Intra Period ( multipler of GoP size or -1)
-EnableSpsPpsIDAddition  1
-
-EnableFrameCropping 	1 		       # enable frame cropping flag
-
-#============================== LOOP FILTER ==============================
-LoopFilterDisableIDC       0                   # Loop filter idc (0: on, 1: off, 
-                                               # 2: on except for slice boundaries,
-                                               # 3: two stage. slice boundries on in second stage
-                                               # 4: Luma on but Chroma off (w.r.t. idc=0)  
-                                               # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
-                                               # 6: Luma on in two stage. slice boundries on in second stage, but Chroma off (w.r.t. idc=3)
-LoopFilterAlphaC0Offset	0                      # AlphaOffset(-6..+6): valid range
-LoopFilterBetaOffset	0                      # BetaOffset (-6..+6): valid range
-
-InterLayerLoopFilterDisableIDC       0         # filter idc for inter-layer deblocking (0: on, 1: off, 
-                                               # 2: on except for slice boundaries,
-                                               # 3: two stage. slice boundries on in second stage
-                                               # 4: Luma on but Chroma off in enh. layer (w.r.t. idc=0)  
-                                               # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
-                                               # 6: Luma on in two stage. slice boundries on in second stage, but Chroma off (w.r.t. idc=3)
-InterLayerLoopFilterAlphaC0Offset 0            # AlphaOffset for inter-layer deblocking
-InterLayerLoopFilterBetaOffset    0            # BetaOffset for inter-layer deblocking
-
-#============================== SOFTWARE IMPLEMENTATION ==============================
-MultipleThreadIdc			    1	# 0: auto(dynamic imp. internal encoder); 1: multiple threads imp. disabled; > 1: count number of threads;
-
-#============================== RATE CONTROL ==============================
-EnableRC				1						# ENABLE RC
-TargetBitrate			5000				    # Unit: kbps, controled by EnableRC also
-
-#============================== DENOISE CONTROL ==============================
-EnableDenoise                   0              # Enable Denoise (1: enable, 0: disable)
-
-#============================== SCENE CHANGE DETECTION CONTROL =======================
-EnableSceneChangeDetection			1			# Enable Scene Change Detection (1: enable, 0: disable)
-
-#============================== BACKGROUND DETECTION CONTROL ==============================
-EnableBackgroundDetection		 1     # BGD control(1: enable, 0: disable)
-
-#============================== ADAPTIVE QUANTIZATION CONTROL =======================
-EnableAdaptiveQuantization			1			# Enable Adaptive Quantization (1: enable, 0: disable)
-
-#============================== LONG TERM REFERENCE CONTROL ==============================
-EnableLongTermReference             0              # Enable Long Term Reference (1: enable, 0: disable)
-LtrMarkPeriod                       30             # Long Term Reference Marking Period 
-
-#============================== LAYER DEFINITION ==============================
-PrefixNALAddingCtrl		0						# Control flag of adding prefix unit (0: off, 1: on)
-												# It shall always be on in SVC contexts (i.e. when there are CGS/MGS/spatial enhancement layers)
-												# Can be disabled when no inter spatial layer prediction in case of its value as 0
-NumLayers              1                      # Number of layers
-//LayerCfg                layer0.cfg		# Layer 0 configuration file
-//LayerCfg                layer1.cfg		# Layer 1 configuration file
-LayerCfg                layer2.cfg		# Layer 2 configuration file
+# Cisco Scalable H.264/AVC Extension Encoder Configuration File
+
+#============================== GENERAL ==============================
+OutputFile              test.264               # Bitstream file
+MaxFrameRate            30                     # Maximum frame rate [Hz]
+FramesToBeEncoded       -1                    # Number of frames (at input frame rate)
+
+GOPSize                 4                     # GOP Size (at maximum frame rate), 16
+IntraPeriod            0                    # Intra Period ( multiple of GoP size or -1)
+EnableSpsPpsIDAddition  1
+
+EnableFrameCropping 	1 		       # enable frame cropping flag
+
+#============================== LOOP FILTER ==============================
+LoopFilterDisableIDC       0                   # Loop filter idc (0: on, 1: off,
+                                               # 2: on except for slice boundaries,
+                                               # 3: two stage. slice boundaries on in second stage
+                                               # 4: Luma on but Chroma off (w.r.t. idc=0)
+                                               # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
+                                               # 6: Luma on in two stage. slice boundaries on in second stage, but Chroma off (w.r.t. idc=3)
+LoopFilterAlphaC0Offset	0                      # AlphaOffset(-6..+6): valid range
+LoopFilterBetaOffset	0                      # BetaOffset (-6..+6): valid range
+
+InterLayerLoopFilterDisableIDC       0         # filter idc for inter-layer deblocking (0: on, 1: off,
+                                               # 2: on except for slice boundaries,
+                                               # 3: two stage. slice boundaries on in second stage
+                                               # 4: Luma on but Chroma off in enh. layer (w.r.t. idc=0)
+                                               # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
+                                               # 6: Luma on in two stage. slice boundaries on in second stage, but Chroma off (w.r.t. idc=3)
+InterLayerLoopFilterAlphaC0Offset 0            # AlphaOffset for inter-layer deblocking
+InterLayerLoopFilterBetaOffset    0            # BetaOffset for inter-layer deblocking
+
+#============================== SOFTWARE IMPLEMENTATION ==============================
+MultipleThreadIdc			    1	# 0: auto(dynamic imp. internal encoder); 1: multiple threads imp. disabled; > 1: count number of threads;
+
+#============================== RATE CONTROL ==============================
+EnableRC				1						# ENABLE RC
+TargetBitrate			5000				    # Unit: kbps, controlled by EnableRC also
+
+#============================== DENOISE CONTROL ==============================
+EnableDenoise                   0              # Enable Denoise (1: enable, 0: disable)
+
+#============================== SCENE CHANGE DETECTION CONTROL =======================
+EnableSceneChangeDetection			1			# Enable Scene Change Detection (1: enable, 0: disable)
+
+#============================== BACKGROUND DETECTION CONTROL ==============================
+EnableBackgroundDetection		 1     # BGD control(1: enable, 0: disable)
+
+#============================== ADAPTIVE QUANTIZATION CONTROL =======================
+EnableAdaptiveQuantization			1			# Enable Adaptive Quantization (1: enable, 0: disable)
+
+#============================== LONG TERM REFERENCE CONTROL ==============================
+EnableLongTermReference             0              # Enable Long Term Reference (1: enable, 0: disable)
+LtrMarkPeriod                       30             # Long Term Reference Marking Period
+
+#============================== LAYER DEFINITION ==============================
+PrefixNALAddingCtrl		0						# Control flag of adding prefix unit (0: off, 1: on)
+												# It shall always be on in SVC contexts (i.e. when there are CGS/MGS/spatial enhancement layers)
+												# Can be disabled when no inter spatial layer prediction in case of its value as 0
+NumLayers              1                      # Number of layers
+//LayerCfg                layer0.cfg		# Layer 0 configuration file
+//LayerCfg                layer1.cfg		# Layer 1 configuration file
+LayerCfg                layer2.cfg		# Layer 2 configuration file
--- a/testbin/welsenc_vd_1d.cfg
+++ b/testbin/welsenc_vd_1d.cfg
@ -1,63 +1,63 @@
-# Cisco Scalable H.264/AVC Extension Encoder Configuration File
-
-#============================== GENERAL ==============================
-OutputFile              test_vd_1d.264               # Bitstream file
-MaxFrameRate            30                     # Maximum frame rate [Hz]
-FramesToBeEncoded       -1                    # Number of frames (at input frame rate)
-
-GOPSize                 4                     # GOP Size (at maximum frame rate), 16
-IntraPeriod            0                    # Intra Period ( multipler of GoP size or -1)
-EnableSpsPpsIDAddition  1
-
-EnableFrameCropping 	1 		       # enable frame cropping flag
-
-#============================== LOOP FILTER ==============================
-LoopFilterDisableIDC       0                   # Loop filter idc (0: on, 1: off, 
-                                               # 2: on except for slice boundaries,
-                                               # 3: two stage. slice boundries on in second stage
-                                               # 4: Luma on but Chroma off (w.r.t. idc=0)  
-                                               # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
-                                               # 6: Luma on in two stage. slice boundries on in second stage, but Chroma off (w.r.t. idc=3)
-LoopFilterAlphaC0Offset	0                      # AlphaOffset(-6..+6): valid range
-LoopFilterBetaOffset	0                      # BetaOffset (-6..+6): valid range
-
-InterLayerLoopFilterDisableIDC       0         # filter idc for inter-layer deblocking (0: on, 1: off, 
-                                               # 2: on except for slice boundaries,
-                                               # 3: two stage. slice boundries on in second stage
-                                               # 4: Luma on but Chroma off in enh. layer (w.r.t. idc=0)  
-                                               # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
-                                               # 6: Luma on in two stage. slice boundries on in second stage, but Chroma off (w.r.t. idc=3)
-InterLayerLoopFilterAlphaC0Offset 0            # AlphaOffset for inter-layer deblocking
-InterLayerLoopFilterBetaOffset    0            # BetaOffset for inter-layer deblocking
-
-#============================== SOFTWARE IMPLEMENTATION ==============================
-MultipleThreadIdc			    1	# 0: auto(dynamic imp. internal encoder); 1: multiple threads imp. disabled; > 1: count number of threads;
-
-#============================== RATE CONTROL ==============================
-EnableRC				0						# ENABLE RC
-TargetBitrate			5000				    # Unit: kbps, controled by EnableRC also
-
-#============================== DENOISE CONTROL ==============================
-EnableDenoise                   0              # Enable Denoise (1: enable, 0: disable)
-
-#============================== SCENE CHANGE DETECTION CONTROL =======================
-EnableSceneChangeDetection			1			# Enable Scene Change Detection (1: enable, 0: disable)
-
-#============================== BACKGROUND DETECTION CONTROL ==============================
-EnableBackgroundDetection		 1     # BGD control(1: enable, 0: disable)
-
-#============================== ADAPTIVE QUANTIZATION CONTROL =======================
-EnableAdaptiveQuantization			0			# Enable Adaptive Quantization (1: enable, 0: disable)
-
-#============================== LONG TERM REFERENCE CONTROL ==============================
-EnableLongTermReference             1              # Enable Long Term Reference (1: enable, 0: disable)
-LtrMarkPeriod                       30             # Long Term Reference Marking Period 
-
-#============================== LAYER DEFINITION ==============================
-PrefixNALAddingCtrl		0						# Control flag of adding prefix unit (0: off, 1: on)
-												# It shall always be on in SVC contexts (i.e. when there are CGS/MGS/spatial enhancement layers)
-												# Can be disabled when no inter spatial layer prediction in case of its value as 0
-NumLayers              1                      # Number of layers
-//LayerCfg                layer0_vd.cfg		# Layer 0 configuration file
-//LayerCfg                layer1_vd.cfg		# Layer 1 configuration file
-LayerCfg                layer2_vd.cfg		# Layer 2 configuration file
+# Cisco Scalable H.264/AVC Extension Encoder Configuration File
+
+#============================== GENERAL ==============================
+OutputFile              test_vd_1d.264               # Bitstream file
+MaxFrameRate            30                     # Maximum frame rate [Hz]
+FramesToBeEncoded       -1                    # Number of frames (at input frame rate)
+
+GOPSize                 4                     # GOP Size (at maximum frame rate), 16
+IntraPeriod            0                    # Intra Period ( multiple of GoP size or -1)
+EnableSpsPpsIDAddition  1
+
+EnableFrameCropping 	1 		       # enable frame cropping flag
+
+#============================== LOOP FILTER ==============================
+LoopFilterDisableIDC       0                   # Loop filter idc (0: on, 1: off,
+                                               # 2: on except for slice boundaries,
+                                               # 3: two stage. slice boundaries on in second stage
+                                               # 4: Luma on but Chroma off (w.r.t. idc=0)
+                                               # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
+                                               # 6: Luma on in two stage. slice boundaries on in second stage, but Chroma off (w.r.t. idc=3)
+LoopFilterAlphaC0Offset	0                      # AlphaOffset(-6..+6): valid range
+LoopFilterBetaOffset	0                      # BetaOffset (-6..+6): valid range
+
+InterLayerLoopFilterDisableIDC       0         # filter idc for inter-layer deblocking (0: on, 1: off,
+                                               # 2: on except for slice boundaries,
+                                               # 3: two stage. slice boundaries on in second stage
+                                               # 4: Luma on but Chroma off in enh. layer (w.r.t. idc=0)
+                                               # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
+                                               # 6: Luma on in two stage. slice boundaries on in second stage, but Chroma off (w.r.t. idc=3)
+InterLayerLoopFilterAlphaC0Offset 0            # AlphaOffset for inter-layer deblocking
+InterLayerLoopFilterBetaOffset    0            # BetaOffset for inter-layer deblocking
+
+#============================== SOFTWARE IMPLEMENTATION ==============================
+MultipleThreadIdc			    1	# 0: auto(dynamic imp. internal encoder); 1: multiple threads imp. disabled; > 1: count number of threads;
+
+#============================== RATE CONTROL ==============================
+EnableRC				0						# ENABLE RC
+TargetBitrate			5000				    # Unit: kbps, controlled by EnableRC also
+
+#============================== DENOISE CONTROL ==============================
+EnableDenoise                   0              # Enable Denoise (1: enable, 0: disable)
+
+#============================== SCENE CHANGE DETECTION CONTROL =======================
+EnableSceneChangeDetection			1			# Enable Scene Change Detection (1: enable, 0: disable)
+
+#============================== BACKGROUND DETECTION CONTROL ==============================
+EnableBackgroundDetection		 1     # BGD control(1: enable, 0: disable)
+
+#============================== ADAPTIVE QUANTIZATION CONTROL =======================
+EnableAdaptiveQuantization			0			# Enable Adaptive Quantization (1: enable, 0: disable)
+
+#============================== LONG TERM REFERENCE CONTROL ==============================
+EnableLongTermReference             1              # Enable Long Term Reference (1: enable, 0: disable)
+LtrMarkPeriod                       30             # Long Term Reference Marking Period
+
+#============================== LAYER DEFINITION ==============================
+PrefixNALAddingCtrl		0						# Control flag of adding prefix unit (0: off, 1: on)
+												# It shall always be on in SVC contexts (i.e. when there are CGS/MGS/spatial enhancement layers)
+												# Can be disabled when no inter spatial layer prediction in case of its value as 0
+NumLayers              1                      # Number of layers
+//LayerCfg                layer0_vd.cfg		# Layer 0 configuration file
+//LayerCfg                layer1_vd.cfg		# Layer 1 configuration file
+LayerCfg                layer2_vd.cfg		# Layer 2 configuration file
--- a/testbin/welsenc_vd_rc.cfg
+++ b/testbin/welsenc_vd_rc.cfg
@ -1,63 +1,63 @@
-# Cisco Scalable H.264/AVC Extension Encoder Configuration File
-
-#============================== GENERAL ==============================
-OutputFile              test_vd_rc.264               # Bitstream file
-MaxFrameRate            30                     # Maximum frame rate [Hz]
-FramesToBeEncoded       -1                    # Number of frames (at input frame rate), -1
-
-GOPSize                 8                     # GOP Size (at maximum frame rate), 16
-IntraPeriod            0                    # Intra Period ( multipler of GoP size or -1)
-EnableSpsPpsIDAddition  1
-
-EnableFrameCropping 	1 		       # enable frame cropping flag
-
-#============================== LOOP FILTER ==============================
-LoopFilterDisableIDC       0                   # Loop filter idc (0: on, 1: off, 
-                                               # 2: on except for slice boundaries,
-                                               # 3: two stage. slice boundries on in second stage
-                                               # 4: Luma on but Chroma off (w.r.t. idc=0)  
-                                               # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
-                                               # 6: Luma on in two stage. slice boundries on in second stage, but Chroma off (w.r.t. idc=3)
-LoopFilterAlphaC0Offset	0                      # AlphaOffset(-6..+6): valid range
-LoopFilterBetaOffset	0                      # BetaOffset (-6..+6): valid range
-
-InterLayerLoopFilterDisableIDC       0         # filter idc for inter-layer deblocking (0: on, 1: off, 
-                                               # 2: on except for slice boundaries,
-                                               # 3: two stage. slice boundries on in second stage
-                                               # 4: Luma on but Chroma off in enh. layer (w.r.t. idc=0)  
-                                               # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
-                                               # 6: Luma on in two stage. slice boundries on in second stage, but Chroma off (w.r.t. idc=3)
-InterLayerLoopFilterAlphaC0Offset 0            # AlphaOffset for inter-layer deblocking
-InterLayerLoopFilterBetaOffset    0            # BetaOffset for inter-layer deblocking
-
-#============================== SOFTWARE IMPLEMENTATION ==============================
-MultipleThreadIdc			    1	# 0: auto(dynamic imp. internal encoder); 1: multiple threads imp. disabled; > 1: count number of threads;
-
-#============================== RATE CONTROL ==============================
-EnableRC			1						# ENABLE RC
-TargetBitrate			600				    # Unit: kbps, controled by EnableRC also
-
-#============================== DENOISE CONTROL ==============================
-EnableDenoise                   1              # Enable Denoise (1: enable, 0: disable)
-
-#============================== SCENE CHANGE DETECTION CONTROL =======================
-EnableSceneChangeDetection			1			# Enable Scene Change Detection (1: enable, 0: disable)
-
-#============================== BACKGROUND DETECTION CONTROL ==============================
-EnableBackgroundDetection		 1     # BGD control(1: enable, 0: disable)
-
-#============================== ADAPTIVE QUANTIZATION CONTROL =======================
-EnableAdaptiveQuantization			1			# Enable Adaptive Quantization (1: enable, 0: disable)
-
-#============================== LONG TERM REFERENCE CONTROL ==============================
-EnableLongTermReference             1              # Enable Long Term Reference (1: enable, 0: disable)
-LtrMarkPeriod                       30             # Long Term Reference Marking Period 
-
-#============================== LAYER DEFINITION ==============================
-PrefixNALAddingCtrl		0						# Control flag of adding prefix unit (0: off, 1: on)
-												# It shall always be on in SVC contexts (i.e. when there are CGS/MGS/spatial enhancement layers)
-												# Can be disabled when no inter spatial layer prediction in case of its value as 0
-NumLayers              1                      # Number of layers
-//LayerCfg                layer0_vd.cfg		# Layer 0 configuration file
-//LayerCfg                layer1_vd.cfg		# Layer 1 configuration file
-LayerCfg                layer2_vd_rc.cfg		# Layer 2 configuration file
+# Cisco Scalable H.264/AVC Extension Encoder Configuration File
+
+#============================== GENERAL ==============================
+OutputFile              test_vd_rc.264               # Bitstream file
+MaxFrameRate            30                     # Maximum frame rate [Hz]
+FramesToBeEncoded       -1                    # Number of frames (at input frame rate), -1
+
+GOPSize                 8                     # GOP Size (at maximum frame rate), 16
+IntraPeriod            0                    # Intra Period ( multiple of GoP size or -1)
+EnableSpsPpsIDAddition  1
+
+EnableFrameCropping 	1 		       # enable frame cropping flag
+
+#============================== LOOP FILTER ==============================
+LoopFilterDisableIDC       0                   # Loop filter idc (0: on, 1: off,
+                                               # 2: on except for slice boundaries,
+                                               # 3: two stage. slice boundaries on in second stage
+                                               # 4: Luma on but Chroma off (w.r.t. idc=0)
+                                               # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
+                                               # 6: Luma on in two stage. slice boundaries on in second stage, but Chroma off (w.r.t. idc=3)
+LoopFilterAlphaC0Offset	0                      # AlphaOffset(-6..+6): valid range
+LoopFilterBetaOffset	0                      # BetaOffset (-6..+6): valid range
+
+InterLayerLoopFilterDisableIDC       0         # filter idc for inter-layer deblocking (0: on, 1: off,
+                                               # 2: on except for slice boundaries,
+                                               # 3: two stage. slice boundaries on in second stage
+                                               # 4: Luma on but Chroma off in enh. layer (w.r.t. idc=0)
+                                               # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
+                                               # 6: Luma on in two stage. slice boundaries on in second stage, but Chroma off (w.r.t. idc=3)
+InterLayerLoopFilterAlphaC0Offset 0            # AlphaOffset for inter-layer deblocking
+InterLayerLoopFilterBetaOffset    0            # BetaOffset for inter-layer deblocking
+
+#============================== SOFTWARE IMPLEMENTATION ==============================
+MultipleThreadIdc			    1	# 0: auto(dynamic imp. internal encoder); 1: multiple threads imp. disabled; > 1: count number of threads;
+
+#============================== RATE CONTROL ==============================
+EnableRC			1						# ENABLE RC
+TargetBitrate			600				    # Unit: kbps, controlled by EnableRC also
+
+#============================== DENOISE CONTROL ==============================
+EnableDenoise                   1              # Enable Denoise (1: enable, 0: disable)
+
+#============================== SCENE CHANGE DETECTION CONTROL =======================
+EnableSceneChangeDetection			1			# Enable Scene Change Detection (1: enable, 0: disable)
+
+#============================== BACKGROUND DETECTION CONTROL ==============================
+EnableBackgroundDetection		 1     # BGD control(1: enable, 0: disable)
+
+#============================== ADAPTIVE QUANTIZATION CONTROL =======================
+EnableAdaptiveQuantization			1			# Enable Adaptive Quantization (1: enable, 0: disable)
+
+#============================== LONG TERM REFERENCE CONTROL ==============================
+EnableLongTermReference             1              # Enable Long Term Reference (1: enable, 0: disable)
+LtrMarkPeriod                       30             # Long Term Reference Marking Period
+
+#============================== LAYER DEFINITION ==============================
+PrefixNALAddingCtrl		0						# Control flag of adding prefix unit (0: off, 1: on)
+												# It shall always be on in SVC contexts (i.e. when there are CGS/MGS/spatial enhancement layers)
+												# Can be disabled when no inter spatial layer prediction in case of its value as 0
+NumLayers              1                      # Number of layers
+//LayerCfg                layer0_vd.cfg		# Layer 0 configuration file
+//LayerCfg                layer1_vd.cfg		# Layer 1 configuration file
+LayerCfg                layer2_vd_rc.cfg		# Layer 2 configuration file