;r_resize.S
;
;Contains routines for resizing bitmaps on the screen.
;

extern _dv_viewwidth
extern _dv_viewheight
extern _dv_ylookup
extern _dv_columnofs
extern _vb_depth

%ifndef CODE_SECTION
%define CODE_SECTION .text
%endif
SECTION CODE_SECTION

;
; Base function. Not very optimised, but has a simple,
; small, working inner loop. Other routines can use this as base.
;

global _R_EnlargeView8_2_2_base
_R_EnlargeView8_2_2_base:
;
; Doubles every pixel of the view horizontally and vertically, one byte at
; a time. Takes no stack arguments; everything is read from _dv_viewwidth,
; _dv_viewheight, _dv_ylookup, _dv_columnofs and _vb_depth.
; ebx, esi, edi and ebp are saved and restored; eax, ecx, edx are clobbered.
;
; Local slots, as offsets from esp once the four registers are pushed:
;
.SRC_END	equ	16		; source pointer one past the row's last pixel
.NEG_WIDTH	equ	20		; -viewwidth: start value of the pixel index
.DST_B		equ	24		; second destination pointer (first + _vb_depth)
.HEIGHT		equ	28		; copy of _dv_viewheight
.WIDTH		equ	32		; copy of _dv_viewwidth

	sub	esp, 20
	push	ebp
	push	edi
	push	esi
	push	ebx

	mov	edi, [_dv_viewwidth]
	mov	[esp + .WIDTH], edi
	neg	edi
	mov	[esp + .NEG_WIDTH], edi
	mov	edi, [_dv_viewheight]
	mov	[esp + .HEIGHT], edi

	xor	ebp, ebp			; ebp = source row index
	test	edi, edi
	jle	.done				; nothing to do unless height > 0

align 4
.row:
	mov	ebx, [_dv_ylookup]
	mov	ecx, [_dv_columnofs]

	; esi = ylookup[2*row - height] + columnofs[-width] + 2*width
	lea	edx, [ebp + ebp]
	sub	edx, [esp + .HEIGHT]
	mov	esi, [ebx + edx * 4]
	mov	edi, [esp + .NEG_WIDTH]
	add	esi, [ecx + edi * 4]
	mov	edi, [esp + .WIDTH]
	lea	esi, [esi + edi * 2]

	; second destination = first destination + _vb_depth
	mov	edi, [_vb_depth]
	add	edi, esi
	mov	[esp + .DST_B], edi

	; source end = columnofs[0] + ylookup[row] + width
	mov	eax, [ecx]
	add	eax, [ebx + ebp * 4]
	mov	[esp + .SRC_END], eax
	mov	edi, [esp + .WIDTH]
	add	[esp + .SRC_END], edi

	; edx runs from -width up to 0, so all three pointers address
	; backwards from the end of their rows.
	mov	edx, [esp + .NEG_WIDTH]
	test	edx, edx
	jz	.row_done			; zero width: no pixels in this row
	mov	edi, [esp + .DST_B]
	mov	ecx, [esp + .SRC_END]

align 4
.pixel:
	; one source byte becomes a 2x2 block: two bytes through edi,
	; two bytes through esi
	mov	al, [ecx + edx]
	mov	[edi + edx * 2], al
	mov	[edi + edx * 2 + 1], al
	mov	[esi + edx * 2], al
	mov	[esi + edx * 2 + 1], al
	inc	edx
	jnz	.pixel

.row_done:
	inc	ebp
	cmp	ebp, [esp + .HEIGHT]
	jl	.row

.done:
	pop	ebx
	pop	esi
	pop	edi
	pop	ebp
	add	esp, 20
	ret



;
; MMX variant of the 2x2 enlarge routine. The per-row pointer setup is the
; same as in _R_EnlargeView8_2_2_base; the pixel copy handles four source
; pixels (eight output bytes) per store:
;
;   1. a head store covers the first four source pixels of the row;
;   2. the pixel index is advanced by 1-4 pixels so that, when the
;      destination address is even, the following stores through edi are
;      8-byte aligned;
;   3. the main loop expands four pixels per iteration;
;   4. a tail store covers the last four source pixels of the row and may
;      overlap bytes the loop has already written.
;
; Takes no stack arguments. ebx, esi, edi and ebp are saved and restored;
; eax, ecx, edx, mm0 and mm1 are clobbered. emms is executed before return.
;
; Local slots, as offsets from esp once the four registers are pushed:
;   [esp+16] source pointer one past the row's last pixel
;   [esp+20] -viewwidth
;   [esp+24] second destination pointer (first + _vb_depth)
;   [esp+28] copy of _dv_viewheight
;   [esp+32] copy of _dv_viewwidth
;
global _R_EnlargeView8_2_2_mmx
_R_EnlargeView8_2_2_mmx:
	; The head store, the alignment skip (up to 4 pixels) and the tail
	; store need at least 8 pixels per row. With fewer, the main loop
	; (which always runs at least once) would read and write past the end
	; of the row, so narrow views are handed to the byte-wise routine.
	cmp	dword [_dv_viewwidth], 8
	jl	near _R_EnlargeView8_2_2_base

	sub	esp, 20
	push	ebp
	push	edi
	push	esi
	push	ebx
	mov	edi, [_dv_viewwidth]
	mov	[esp+32], edi
	mov	edi, [_dv_viewheight]
	mov	[esp+28], edi
	xor	ebp, ebp			; ebp = source row index
	cmp	ebp, edi
	jge	near r22mlbl1			; nothing to do unless height > 0
	mov	edi, [32 + esp]
	neg	edi
	mov	[esp+20], edi
align 4
r22mlbl3:
	; esi = ylookup[2*row - height] + columnofs[-width] + 2*width
	lea	edx, [ebp * 2]
	sub	edx, [28 + esp]
	mov	ebx, [_dv_ylookup]
	mov	ecx, [_dv_columnofs]
	mov	edi, [20 + esp]
	mov	eax, [ecx + edi * 4]
	mov	esi, [ebx + edx * 4]
	add	esi, eax
	mov	edi, [32 + esp]
	lea	esi, [esi + edi * 2]
	; second destination = first destination + _vb_depth
	mov	edi, esi
	add	edi, [_vb_depth]
	mov	[24 + esp], edi
	; source end = columnofs[0] + ylookup[row] + width
	mov	eax, [ecx]
	add	eax, [ebx + ebp * 4]
	mov	[16 + esp], eax
	mov	edi, [32 + esp]
	add	[16 + esp], edi
	; edx = -width: negative pixel index counted from the row end
	mov	edx, [20 + esp]
	test	edx, edx
	je	r22mlbl2
	mov	edi, [24 + esp]
	mov	ecx, [16 + esp]

; copy first 8 bytes.
	movd	mm0, [ecx + edx]
	punpcklbw mm0, mm0			; duplicate each of the 4 bytes
	movq	[edi + edx * 2], mm0
	movq	[esi + edx * 2], mm0

	; ebx is free as scratch here: it is reloaded at the top of each row.
	lea	ebx, [edi + edx * 2]		; ebx = address of first output byte
; align: new address is (dest+8)&~7.
	lea	eax, [8 + ebx]
; if x is odd, skip alignment.
	test	ebx, 1
	jnz	r22modd
; it's even
	and	eax, ~7
r22modd:
; eax=number of bytes to skip
	sub	eax, ebx

; adjust counter (two output bytes per source pixel)
	sar	eax, 1
	add	edx, eax

; source and dest are now adjusted.
; now set eax to the offset we are going to place the last 8-byte chunk at.
	mov	eax, edx
	and	eax, 3				; eax = index mod 4
	; Bias the three pointers so that the loop can index them with the
	; quotient (index >> 2) + 1 scaled by 4 (source) or 8 (destination).
	lea	edi, [edi + eax * 2 - 8]
	lea	esi, [esi + eax * 2 - 8]
	lea	ecx, [eax + ecx - 4]

	neg	eax				; [ecx + eax] = last 4 source pixels

	; Fetch the last four source pixels before the loop runs. The tail
	; store overlaps the loop's output, so if the source row shares memory
	; with a destination row, the loop's final store could overwrite these
	; bytes before a later load would see them.
	movd	mm1, [ecx + eax]

; now adjust the loop counter.
	sar	edx, 2
	inc	edx
;
; edi: dest 1
; esi: dest 2
; edx: counter (negative, counts up to 0)
; ecx: source
; eax: offset of the tail chunk
; mm0: src pixel
; mm1: tail pixels, loaded above
;
; I love MMX unpacking... The inner loop couldn't look simpler. Could run at
; 3 clock cycles for each 16 output pixels, which is even faster than rep;movs
;
; The loop could be more optimised, but I've decided not to, since the
; only bottleneck now is memory bandwidth.
;

align 4
r22mloop:
	movd	mm0, [ecx + edx * 4]
	punpcklbw mm0, mm0
	movq	[edi + edx * 8], mm0
	movq	[esi + edx * 8], mm0
	inc	edx
	jl	r22mloop

; copy last few bytes (from the pixels saved in mm1)
	punpcklbw mm1, mm1
	movq	[edi + eax * 2], mm1
	movq	[esi + eax * 2], mm1

r22mlbl2:
	inc	ebp
	cmp	[28 + esp], ebp
	jg	near r22mlbl3
r22mlbl1:
	pop	ebx
	pop	esi
	pop	edi
	pop	ebp
	add	esp, 20
	emms					; leave MMX state so x87 code can run
	ret


