extern _ds_y
extern _ylookup
extern _columnofs
extern _ds_x2
extern _ds_xstep
extern _ds_ystep
extern _ds_colourmap
extern _ds_x1
extern _ds_xfrac
extern _ds_yfrac
extern _ds_source
extern _pixelcount
extern _loopcount
extern _mmxcomm

;//
;// id's code. Probably optimal for 486.
;// By id software, ported by ES 1998-08-11
;//
;// -ES- 1998/08/25 Fixed alignment

%ifndef CODE_SECTION
%define CODE_SECTION .text
%endif
SECTION CODE_SECTION

align	16
;-----------------------------------------------------------------------
; _R_DrawSpan8_id
;
; Draws one horizontal span of 8-bit pixels, two pixels per loop pass.
;
; Inputs (all read from globals, nothing is taken from the stack):
;   _ds_x1, _ds_x2        first / last column of the span (inclusive)
;   _ds_y                 row index into the table pointed to by _ylookup
;   _ds_xfrac, _ds_yfrac  starting texture position
;   _ds_xstep, _ds_ystep  per-pixel texture step
;   _ds_source            texel base pointer
;   _ds_colourmap         translation table pointer; its low address byte
;                         must be zero, because the texel is merged into
;                         the pointer with a byte move (mov al,[..]).
;   _ylookup, _columnofs  pointers to dword tables; dest = row + column
; Writes:   _pixelcount, _loopcount, the destination pixels, and two
;           32-bit immediates inside its own loop (self-modifying code).
; Saves:    esi, edi, ebp, ebx (pushed/popped).  Clobbers eax, ecx, edx.
;
; The exported labels inside the body mark exact byte positions.  The
; _rds8ipatchN labels are used by this routine itself; the offs/patcher
; labels are presumably consumed by external code that copies or
; re-aligns the routine -- TODO confirm against the caller.  Instruction
; encodings must therefore not change.
;-----------------------------------------------------------------------
global _R_DrawSpan8_id
_R_DrawSpan8_id:

   push esi
	push edi
   push ebp
	push ebx

; find loop count

	mov		eax,[_ds_x2]
	inc		eax
	sub      eax,[_ds_x1]      ; pixel count = x2 + 1 - x1
	mov		[_pixelcount],eax	; save for final pixel (mov leaves the flags from sub intact)
	js near _rds8idone		; negative count: nothing to draw
global _rds8ioffs1
_rds8ioffs1: ; _rds8idone	(address just past the jump above; comment names its target)
	shr		eax,1					; number of pixel pairs = count / 2
	mov		[_loopcount],eax

; build composite position
;   ebp[31:16] = bits 21..6 of _ds_xfrac
;   ebp[15:0]  = bits 21..6 of _ds_yfrac

	mov	ebp,[_ds_xfrac]
	shl	ebp,10
	and	ebp,0ffff0000h
	mov	eax,[_ds_yfrac]
	shr	eax,6
	and	eax,0ffffh
	or    ebp,eax

	mov	esi,[_ds_source]

; calculate screen dest
;   edi = ylookup[ds_y] + columnofs[ds_x1]

	mov	edi, [_ds_y]
        mov   eax, [_ylookup]
	mov	edi, [eax+edi*4]
	mov	eax, [_ds_x1]
	mov   ebx, [_columnofs]
	add   edi, [ebx+eax*4]

; build composite step (same packing as the composite position)

	mov	ebx,[_ds_xstep]
	shl	ebx,10
	and	ebx,0ffff0000h
	mov	eax,[_ds_ystep]
	shr	eax,6
	and	eax,0ffffh
	or    ebx,eax

; Store the composite step over the 0deadbeefh placeholder of each
; "add ebp,imm32" in the loop.  The imm32 occupies the last four bytes
; of that instruction, i.e. the four bytes just before _rds8ipatchN.
	mov dword [dword _rds8ipatch1-4], ebx ; patch imms, to free some regs
global _rds8ipatcher1
_rds8ipatcher1:
  mov dword [dword _rds8ipatch2-4], ebx
global _rds8ipatcher2
_rds8ipatcher2:
	
; Register roles from here on:
; eax		colourmap pointer, low byte replaced by the current texel
; ebx		colourmap pointer, low byte replaced by the current texel
; ecx,edx	scratch (texel indices)
; esi		source (texel base)
; edi		moving destination pointer
; ebp		frac (composite position)
;
; Index extraction: after the two shld instructions and the mask,
;   ecx = (ebp[15:10] << 6) | ebp[31:26]
; The upper bits left over in ecx/edx from earlier are discarded by
; the "and ...,4095".
	
; Prime the pipeline: fetch and translate the first two pixels.
	shld  ecx,ebp,22				; begin calculating first pixel (y units)
	shld  ecx,ebp,6				; begin calculating first pixel (x units)
	add	ebp,ebx					; advance frac pointer
	and   ecx,4095				   ; finish calculation for first pixel
	shld  edx,ebp,22				; begin calculating second pixel (y units)
	shld  edx,ebp,6				; begin calculating second pixel (x units)
	add	ebp,ebx					; advance frac pointer
	and   edx,4095				   ; finish calculation for second pixel
	mov   eax,[_ds_colourmap]
	mov   ebx,eax
	mov	al,[esi+ecx]			; get first pixel
	mov	bl,[esi+edx]			; get second pixel
	mov	al,[eax]				   ; color translate first pixel
	mov	bl,[ebx]				   ; color translate second pixel
	
	test  dword	[_pixelcount],0fffffffeh	; any bit above bit 0 set => count >= 2
	
	jnz	near _rds8iloop				; at least two pixels to map
global _rds8ioffs2
_rds8ioffs2: ; _rds8iloop
	jmp	near _rds8ichecklast		; count is 0 or 1: skip the pair loop
global _rds8ioffs3
_rds8ioffs3: ; _rds8ichecklast
	

; Pair loop.  On entry al/bl hold the translated pixels to be written
; this pass; the pass also fetches and translates the following pair.
align	16
global _rds8iloop
_rds8iloop:
	shld  ecx,ebp,22				; begin calculating third pixel (y units)
	shld  ecx,ebp,6				   ; begin calculating third pixel (x units)
	add	ebp,0deadbeefh			; advance frac pointer (imm32 patched with composite step)
global _rds8ipatch1
_rds8ipatch1:
	mov	[edi],al				   ; write first pixel
	and   ecx,4095				   ; finish calculation for third pixel
	shld  edx,ebp,22				; begin calculating fourth pixel (y units)
	shld  edx,ebp,6				   ; begin calculating fourth pixel (x units)
	add	ebp,0deadbeefh			; advance frac pointer (imm32 patched with composite step)
global _rds8ipatch2
_rds8ipatch2:
	mov	[edi+1],bl				; write second pixel
	and   edx,4095				   ; finish calculation for fourth pixel
	mov	al,[esi+ecx]			; get third pixel
	add	edi,2					   ; advance to third pixel destination
	mov	bl,[esi+edx]			; get fourth pixel
	dec	dword [_loopcount]				; done with loop? (flags survive the two movs below)
	mov	al,[eax]				   ; color translate third pixel
	mov	bl,[ebx]				   ; color translate fourth pixel
	jnz	_rds8iloop

; check for final pixel
; If the count is odd, al already holds the translated last pixel.
global _rds8ichecklast
_rds8ichecklast:
	test	dword [_pixelcount], 1
	jz	_rds8idone
	mov	[edi],al				   ; write final pixel
	
global _rds8idone
_rds8idone:
	pop ebx
   pop ebp
	pop edi
   pop esi
	ret

global _R_DrawSpan8_id_end
_R_DrawSpan8_id_end:
; alloc 31 extra bytes, just to be able to align
times 31 db 0


;
; id's code, further optimised by ES 1998/08/11
;

ALIGN 4
;-----------------------------------------------------------------------
; _R_DrawSpan8_id_erik
;
; Draws one horizontal span of 8-bit pixels, two pixels per loop pass.
; Same overall scheme as the id routine, but in addition to the two
; step immediates it also patches the texel base address into the two
; texel loads, which frees esi to serve as the loop counter.
;
; Inputs (globals): _ds_x1, _ds_x2, _ds_y, _ds_xfrac, _ds_yfrac,
;   _ds_xstep, _ds_ystep, _ds_source, _ds_colourmap, _ylookup,
;   _columnofs.  The low address byte of _ds_colourmap must be zero,
;   because the texel is merged into the pointer with a byte move.
; Writes:   _pixelcount, _loopcount, the destination pixels, and four
;           32-bit fields inside its own loop (self-modifying code).
; Saves:    esi, edi, ebp, ebx (pushed/popped).  Clobbers eax, ecx, edx.
;
; The exported labels inside the body mark exact byte positions; the
; _rds8epatchN labels are used by this routine itself, the offs/patcher
; labels presumably by external code that copies or re-aligns the
; routine -- TODO confirm.  Instruction encodings must not change.
;-----------------------------------------------------------------------
GLOBAL _R_DrawSpan8_id_erik
_R_DrawSpan8_id_erik:
        push  esi
        push  edi
        push  ebp
        push  ebx

;
; find loop count
;       
        mov  eax,[_ds_x2]
        inc  eax
        sub  eax,[_ds_x1]                         ; pixel count = x2 + 1 - x1
        mov  [_pixelcount],eax                    ; save for final pixel (flags from sub are kept)
        js near _rds8edone                           ; nothing to scale
GLOBAL _rds8eoffs1
_rds8eoffs1:  ; _rds8edone
        shr  eax,1                              ; number of pixel pairs = count / 2
        mov  [_loopcount],eax

;
; build composite position
;   ebp[31:16] = bits 21..6 of _ds_xfrac
;   ebp[15:0]  = bits 21..6 of _ds_yfrac
;
        mov     ebp,[_ds_xfrac]
        shl     ebp,10
        and     ebp,0ffff0000h
        mov     eax,[_ds_yfrac]
        shr     eax,6
        and     eax,0FFFFh
        or      ebp,eax

        mov     esi,[_ds_source]

;
; calculate screen dest
;   edi = ylookup[ds_y] + columnofs[ds_x1]
;
        mov  eax,[_ylookup]
        mov  edi,[_ds_y]
        mov  edi, [eax+edi*4]
        mov  eax,[_ds_x1]
        mov  ebx,[_columnofs]
        add  edi, [ebx+eax*4]

;
; build composite step (same packing as the composite position)
;
        mov     ebx,[_ds_xstep]
        shl     ebx,10
        and     ebx,0ffff0000h
        mov     eax,[_ds_ystep]
        shr     eax,6
        and     eax,0ffffh
        or      ebx,eax

; Each store below overwrites the 0DeadBeefh placeholder that forms the
; last four bytes of the instruction immediately before the named label:
;   patch1, patch2 -> imm32 of "add ebp,imm32"   (composite step)
;   patch3, patch4 -> disp32 of the texel loads  (source base address)
        mov  [_rds8epatch1-4],ebx   ; patch imms, to free some regs
GLOBAL _rds8epatcher1
_rds8epatcher1:
        mov  [_rds8epatch2-4],ebx
GLOBAL _rds8epatcher2
_rds8epatcher2:
        mov  [_rds8epatch3-4],esi
GLOBAL _rds8epatcher3
_rds8epatcher3:
        mov  [_rds8epatch4-4],esi
GLOBAL _rds8epatcher4
_rds8epatcher4:

; Register roles from here on:
; eax           colourmap pointer, low byte replaced by the current texel
; ebx           colourmap pointer, low byte replaced by the current texel
; ecx,edx       scratch (texel indices)
; esi           source base for the priming pair, then pair loop counter
; edi           moving destination pointer
; ebp           frac (composite position)
;
; Index extraction: after the two shld instructions and the mask,
;   ecx = (ebp[15:10] << 6) | ebp[31:26]

; Prime the pipeline: fetch and translate the first two pixels.
        shld  ecx, ebp, 22                      ; begin calculating first pixel (y units)
        shld  ecx, ebp, 6                       ; begin calculating first pixel (x units)
        add     ebp,ebx                                         ; advance frac pointer
        and  ecx,4095                           ; finish calculation for first pixel
        shld  edx, ebp, 22                      ; begin calculating second pixel (y units)
        shld  edx, ebp, 6                       ; begin calculating second pixel (x units)
        add     ebp,ebx                                         ; advance frac pointer
        and  edx,4095                           ; finish calculation for second pixel
        mov  eax,[_ds_colourmap]
        mov  ebx,eax
        mov  al, [esi + ecx]                            ; get first pixel
        mov  bl, [esi + edx]                            ; get second pixel
        mov     al, [eax]                               ; colour translate first pixel
        mov     bl, [ebx]                               ; colour translate second pixel

; The y-units half of the next index is started here, ahead of the loop;
; the loop repeats it at its bottom for the following pass.
        shld  ecx, ebp, 22                      ; begin calculating third pixel (y units)
        mov  esi,[_loopcount]                   ; esi = pair counter from now on

        test dword [_pixelcount], 0fffffffeh    ; any bit above bit 0 set => count >= 2
        jnz near _rds8eloop                              ; at least two pixels to map
GLOBAL _rds8eoffs2
_rds8eoffs2:  ; _rds8eloop
        jmp near _rds8echecklast                ; count is 0 or 1: skip the pair loop
GLOBAL _rds8eoffs3
_rds8eoffs3:  ; _rds8echecklast

; Pair loop.  On entry al/bl hold the translated pixels to be written
; this pass; the pass also fetches and translates the following pair.
GLOBAL _rds8eloop
_rds8eloop:
        shld  ecx, ebp, 6                       ; begin calculating third pixel (x units)
        add   ebp,0DeadBeefh                    ; advance frac pointer (imm32 patched with step)
GLOBAL _rds8epatch1
_rds8epatch1:
        and  ecx,4095                           ; finish calculation for third pixel
        shld edx, ebp, 22                       ; begin calculating fourth pixel (y units)
        mov  [edi],al                                   ; write first pixel
        mov  [edi+1],bl                                 ; write second pixel
        add  edi,2                                      ; advance to third pixel destination
        shld  edx, ebp, 6                       ; begin calculating fourth pixel (x units)
        and  edx,4095                           ; finish calculation for fourth pixel
        mov  al, [ecx+0DeadBeefh]                       ; get third pixel (disp32 patched with source base)
GLOBAL _rds8epatch3
_rds8epatch3:
        mov  bl, [edx+0DeadBeefh]                       ; get fourth pixel (disp32 patched with source base)
GLOBAL _rds8epatch4
_rds8epatch4:
        add   ebp,0DeadBeefh                    ; advance frac pointer (imm32 patched with step)
GLOBAL _rds8epatch2
_rds8epatch2:
        shld ecx, ebp, 22                       ; begin calculating next pass's first pixel (y units)
        mov  al, [eax]                          ; colour translate third pixel
        mov  bl, [ebx]                          ; colour translate fourth pixel
        dec  esi                                ; done with loop?
        jnz  _rds8eloop

; check for final pixel
; If the count is odd, al already holds the translated last pixel.
GLOBAL _rds8echecklast
_rds8echecklast:
        test    dword [_pixelcount], 1
        jz _rds8edone
        mov  [edi],al                           ; write final pixel

GLOBAL _rds8edone
_rds8edone:
        pop  ebx
        pop  ebp
        pop  edi
        pop  esi
        ret


global _R_DrawSpan8_id_erik_end
_R_DrawSpan8_id_erik_end:
; alloc 31 extra bytes, just to be able to align
times 31 db 0

; -ES- 1998/07/24 Wrote MMX version
;-----------------------------------------------------------------------
; _R_DrawSpan8_mmx
;
; Draws one horizontal span of 8-bit pixels.  Short spans (fewer than
; six pixels) use a scalar one-pixel loop; longer spans use MMX to
; compute two texel indices at a time in a software-pipelined loop.
;
; Inputs (globals): _ds_x1, _ds_x2, _ds_y, _ds_xfrac, _ds_yfrac,
;   _ds_xstep, _ds_ystep, _ds_source, _ds_colourmap, _ylookup,
;   _columnofs.  On the MMX path the low address byte of _ds_colourmap
;   must be zero, because the texel is merged into the pointer with a
;   byte move.
; Side effects:
;   * _ds_xstep and _ds_ystep are shifted left by 10 IN MEMORY and are
;     left that way on return.
;   * _mmxcomm holds the stack adjustment between entry and exit, so
;     the routine is not re-entrant.
;   * Uses mm0-mm5 and mm7; executes emms before returning.
; Saves:    esi, edi, ebp, ebx (pushed/popped).  Clobbers eax, ecx, edx.
;
; Unlike the id routines there is no early-out for x2 < x1: the scalar
; path always writes at least one pixel.
;-----------------------------------------------------------------------
global _R_DrawSpan8_mmx
_R_DrawSpan8_mmx:
    push  esi
    push  edi
    push  ebp
    push  ebx

    ; Reserve (esp & 7) + 8 bytes so that esp ends up 8-byte aligned
    ; with at least 8 bytes of scratch at [esp+0] for movq transfers.
    mov  eax,esp    ; Push 8 or 12, so that (%esp) gets aligned by 8
    and  eax,7
    add  eax,8
    mov  [_mmxcomm],eax   ; Temp storage in mmxcomm: (%esp) is used instead
    sub  esp,eax

        mov  esi,[_ds_source]                 ; src in esi
        mov  eax,[_ds_y]
        mov  ebx,[_ylookup]
        mov  edi, [ebx+eax*4]
        mov  eax,[_ds_x2]
        mov  ebx,[_columnofs]
        add  edi, [ebx+eax*4]             ; dest in edi (address of the LAST pixel, column x2)

        ; ecx = x1 - x2 is a negative (or zero) offset from edi that
        ; counts up towards zero as pixels are written.
        mov  ecx,[_ds_x1]
        sub  ecx,eax                       ; count in ecx

        mov  eax,[_ds_xfrac]                  ; convert to 6.26 fix
        mov  ebx,[_ds_yfrac]
        shl  eax,10
        shl  ebx,10
        shl  dword [_ds_xstep], 10            ; NOTE: modifies the global in place
        shl  dword [_ds_ystep], 10            ; NOTE: modifies the global in place

        ; Touch the destination so it is in cache before writing.
        ; NOTE(review): ebp is a 32-byte-aligned address at or below
        ; (first pixel - 1) minus edi; with ecx <= 0 that is negative,
        ; so the jng below looks always taken and the rds8mload loop
        ; unreachable -- only the single load at rds8mload_done runs.
        ; Confirm whether the loop direction was intended to be reversed.
        lea  ebp, [edi + ecx - 1]         ; load dest to L1 cache
        and  ebp, ~31                     ; (speeds up Pentium MMX)
        sub  ebp,edi
        jng rds8mload_done
rds8mload:
        mov dl, [edi + ebp]
        sub  ebp,32
        jg rds8mload
rds8mload_done:
        mov dl, [edi + ebp]

        ; ecx >= -4 means x2 - x1 <= 4, i.e. at most five pixels.
        cmp  ecx,-4
        jl rds8mmany
        ; Less than six pixels to draw, write one pixel at a time without MMX to
        ; avoid MMX register setup and loop initialisation
rds8mfew: 
        ; index = ((yfrac >> 20) & 0FC0h) + (xfrac >> 26), both in 6.26 form
        mov  ebp,ebx
        shr  ebp,20
        and  ebp,0FC0h
        mov  edx,eax
        shr  edx,26
        add  ebp,edx
        movzx  edx, byte [esi+ebp]        ; texel
        mov  ebp,[_ds_colourmap]
        mov  dl, [edx + ebp]              ; colour translate
        mov  [edi + ecx], dl
        add  eax,[_ds_xstep]
        add  ebx,[_ds_ystep]
        inc  ecx
        jle rds8mfew                      ; continue while ecx <= 0 (ecx == 0 is the last pixel)
        jmp near _rds8mdone
GLOBAL _rds8moffs1
_rds8moffs1:                              ; address just past the near jump above

rds8mmany: 
        ; mm7 = 0FC000000h in both dwords: mask for the top 6 bits
        mov  dword [esp+0],0FC000000h
        mov  dword [esp+4],0FC000000h
        movq mm7,[esp+0]

        ; mm0 = { xfrac, xfrac + xstep }  (low dword = even pixel, high = odd pixel)
        mov  [esp+0],eax
        add  eax,[_ds_xstep]
        mov  [esp+4],eax
        movq mm0,[esp+0]                   ; xfrac in mm0

        ; mm1 = { yfrac, yfrac + ystep }
        mov  [esp+0],ebx
        add  ebx,[_ds_ystep]
        mov  [esp+4],ebx
        movq mm1,[esp+0]                   ; yfrac in mm1

        ; mm2 = 2*xstep in both dwords (each lane advances two pixels at a time)
        mov  eax,[_ds_xstep]
        add  eax,eax
        mov  [esp+0],eax
        mov  [esp+4],eax
        movq mm2,[esp+0]                   ; xstep in mm2

        ; mm3 = 2*ystep in both dwords
        mov  ebx,[_ds_ystep]
        add  ebx,ebx
        mov  [esp+0],ebx
        mov  [esp+4],ebx
        movq mm3,[esp+0]                   ; ystep in mm3

        ; eax/ebx = colourmap pointer; al/bl will receive texels so that
        ; [eax] / [ebx] perform the colour translation.
        mov  eax,[_ds_colourmap]
        mov  ebx,eax

; Pipeline priming.  On entry to _rds8mloop:
;   al, bl    = untranslated texels of the pair to be written next
;   ebp, edx  = texel indices of the pair after that
;   mm4, mm5  = x part / y part of the indices of the pair after that
; NOTE(review): the original comment here said the loop handles 10 pixels
; at a time with the first eight prepared up front; the loop below
; advances ecx by 2 per pass, so that description appears to be stale.
; This is extremely messy, but fast :-)
;
; Each "db 0Fh,0DBh,0EFh" is the hand-encoded instruction "pand mm5,mm7".

        ; Prepare pixels 0-1
        movq mm4,mm0
        paddd mm0,mm2
        psrld mm4,26                       ; x index (6 bits)
        movq mm5,mm1
        paddd mm1,mm3
        db 0Fh, 0DBh, 0EFh                 ; pand mm5,mm7
        psrld mm5,20                       ; y index in bits 6..11
        por mm4,mm5
        movq [esp+0],mm4
        mov  ebp, [esp+0]
        mov  edx, [esp+4]
        mov al, [esi + ebp]                ; texel 0
        mov bl, [esi + edx]                ; texel 1

        ; Prepare pixels 2-3
        movq mm4,mm0
        paddd mm0,mm2
        psrld mm4,26
        movq mm5,mm1
        paddd mm1,mm3
        db 0Fh,0DBh,0EFh                   ; pand mm5,mm7
        psrld mm5,20
        por mm4,mm5
        movq [esp+0],mm4
        mov  ebp, [esp+0]                  ; index of pixel 2
        mov  edx, [esp+4]                  ; index of pixel 3

        ; Prepare pixels 4-5
        movq mm4,mm0
        psrld mm4,26
        movq mm5,mm1
        db 0Fh,0DBh,0EFh                   ; pand mm5,mm7
        psrld mm5,20

        ; Prepare pixels 6-7
        paddd mm1,mm3

        inc  ecx                           ; bias so the stores below use [edi+ecx-1] and [edi+ecx]

GLOBAL _rds8mloop
_rds8mloop:
        paddd mm0,mm2                      ; 3
        mov  al, [eax+0]                   ; 3  colour translate even pixel

        por mm4,mm5                        ; 3
        mov [edi + ecx + 1-2], al          ;    write even pixel

        movq mm5,mm1                       ; 3
        db 08Ah,05Ch,023h,0h               ;    hand-encoded "mov bl,[ebx+0]" (SIB form, disp8): translate odd pixel

        movq [esp+0],mm4                   ; 7
        mov al, [esi + ebp + 0]            ;    fetch next even texel (ebp still holds the older index)

        paddd mm1,mm3                      ; 3
        mov [edi + ecx + 2-2], bl          ;    write odd pixel

        movq mm4,mm0                       ; 3
        mov bl, [esi + edx + 0]            ;    fetch next odd texel

        db 0Fh,0DBh,0EFh                   ;    pand mm5,mm7
        mov  ebp, [esp+0]                  ; 6

        psrld mm4,26                        ; 4
        mov  edx, [esp+4]                ; 6

        psrld mm5,20                        ; 4

        add  ecx,2                              ; 1 # I would prefer to use LOOP instead, but PMMX wouldn't...
        jl _rds8mloop                         ; 2
        ; The flags from "add ecx,2" are still valid below because mov
        ; does not modify them: ecx > 0 means one pixel is left,
        ; ecx == 0 means two are left.
        mov  al, [eax]                    ; Write last 1 or 2 pixel(s)
        mov [edi + ecx + 1-2], al
        jg rds8mlastodd
        mov  ah, [ebx]                    ; 3
        mov  [edi + ecx + 2-2], ah
rds8mlastodd: 

GLOBAL _rds8mdone;
_rds8mdone:
    emms  ; fixme: To end of rendering

    add  esp,[_mmxcomm]                   ; undo the alignment adjustment made on entry

    pop  ebx
    pop  ebp
    pop  edi
    pop  esi
    ret

global _R_DrawSpan8_mmx_end
_R_DrawSpan8_mmx_end:
; alloc 31 extra bytes, just to be able to align
times 31 db 0


ALIGN 4
;-----------------------------------------------------------------------
; _R_DrawSpan8_rasem
;
; Straightforward one-pixel-per-pass span drawer for 8-bit pixels.
;
; Inputs (globals): _ds_x1, _ds_x2, _ds_y, _ds_xfrac, _ds_yfrac,
;   _ds_xstep, _ds_ystep, _ds_source, _ds_colourmap, _ylookup,
;   _columnofs.
; Per pixel:
;   index = ((yfrac >> 10) & 4032) + ((xfrac >> 16) & 63)
;   *dest++ = colourmap[source[index]]
; The loop body runs x2 - x1 + 1 times and always at least once; no
; check is made for x2 < x1.
; Saves:    esi, edi, ebp, ebx.  Clobbers eax, ecx, edx.
;-----------------------------------------------------------------------
GLOBAL _R_DrawSpan8_rasem
_R_DrawSpan8_rasem:
        push    esi
        push    edi
        push    ebp
        push    ebx

        ; ebx = ylookup[ds_y] + columnofs[ds_x1]   (destination pointer)
        mov     edx, [_ds_x1]
        mov     eax, [_columnofs]
        mov     ecx, [_ylookup]
        mov     ebx, [_ds_y]
        mov     ebx, [ecx + ebx*4]
        add     ebx, [eax + edx*4]

        ; esi = x2 - x1   (pixels remaining after the current one)
        mov     esi, [_ds_x2]
        sub     esi, edx

        ; edi / ebp = running x / y texture position
        mov     edi, [_ds_xfrac]
        mov     ebp, [_ds_yfrac]

ALIGN 4, nop
.next_pixel:
        ; eax = x texel coordinate (6 bits), edx = y texel coordinate * 64
        mov     eax, edi
        mov     edx, ebp
        sar     eax, 16
        sar     edx, 10
        and     eax, 63
        and     edx, 4032
        add     edx, eax                        ; edx = texel index

        mov     eax, [_ds_source]
        movzx   ecx, byte [eax + edx]           ; ecx = texel
        mov     eax, [_ds_colourmap]
        mov     al, [eax + ecx]                 ; al = translated colour

        ; advance the texture position (all loads happen before the store)
        add     edi, [_ds_xstep]
        add     ebp, [_ds_ystep]

        mov     [ebx], al
        inc     ebx

        sub     esi, 1                          ; borrow => that was the last pixel
        jnc     .next_pixel

        pop     ebx
        pop     ebp
        pop     edi
        pop     esi
        ret


ALIGN 4
;-----------------------------------------------------------------------
; _R_DrawSpan16_rasem
;
; Straightforward one-pixel-per-pass span drawer for 16-bit pixels.
;
; Inputs (globals): _ds_x1, _ds_x2, _ds_y, _ds_xfrac, _ds_yfrac,
;   _ds_xstep, _ds_ystep, _ds_source, _ds_colourmap, _ylookup,
;   _columnofs.  _ds_colourmap is indexed as a table of 16-bit words.
; Per pixel:
;   index = ((yfrac >> 10) & 4032) + ((xfrac >> 16) & 63)
;   *(word *)dest = colourmap16[source[index]];  dest += 2
; The loop body runs x2 - x1 + 1 times and always at least once; no
; check is made for x2 < x1.
; Saves:    ebp, edi, esi, ebx.  Clobbers eax, ecx, edx.
;-----------------------------------------------------------------------
GLOBAL _R_DrawSpan16_rasem
_R_DrawSpan16_rasem:
        push    ebp
        push    edi
        push    esi
        push    ebx

        ; ebx = ylookup[ds_y] + columnofs[ds_x1]   (destination pointer)
        mov     edx, [_ds_x1]
        mov     eax, [_columnofs]
        mov     ecx, [_ylookup]
        mov     ebx, [_ds_y]
        mov     ebx, [ecx + ebx*4]
        add     ebx, [eax + edx*4]

        ; ecx = x2 - x1   (pixels remaining after the current one)
        mov     ecx, [_ds_x2]
        sub     ecx, edx

        ; edi / esi = running x / y texture position, ebp = colour table
        mov     edi, [_ds_xfrac]
        mov     esi, [_ds_yfrac]
        mov     ebp, [_ds_colourmap]

ALIGN 4, nop
.next_pixel:
        ; eax = x texel coordinate (6 bits), edx = y texel coordinate * 64
        mov     eax, edi
        mov     edx, esi
        sar     eax, 16
        sar     edx, 10
        and     eax, 63
        and     edx, 4032
        add     edx, eax                        ; edx = texel index

        mov     eax, [_ds_source]

        ; advance the texture position
        add     edi, [_ds_xstep]
        add     esi, [_ds_ystep]

        movzx   eax, byte [eax + edx]           ; eax = texel, zero-extended
        mov     ax, [ebp + eax*2]               ; ax = translated 16-bit colour
        mov     [ebx], ax
        add     ebx, 2

        sub     ecx, 1                          ; borrow => that was the last pixel
        jnc     .next_pixel

        pop     ebx
        pop     esi
        pop     edi
        pop     ebp
        ret


