Some calling convention work for x86-64:

- Implement x86-64 extern(C), hopefully correctly.
- Try to be a bit smarter about extern(D) while I was there.

Interestingly, this code seems to generate more efficient code than gcc and llvm-gcc in some edge cases, such as returning a `{ [7 x i8] }` loaded from a stack slot from an extern(C) function (gcc generates seven 1-byte loads, while this code generates a 4-byte, a 2-byte and a 1-byte load).

I also added some changes to make sure structs being returned from functions or passed in as parameters are stored in memory where the rest of the backend seems to expect them to be. These should be removed when support for first-class aggregates improves.
2025-05-14 15:16:07 +03:00 · 2009-03-06 16:00:47 +01:00 · 2009-03-06 16:00:47 +01:00 · 27d3ab4546
commit 27d3ab4546
parent 79bc6230df
10 changed files with 749 additions and 118 deletions
--- a/gen/abi-x86-64.cpp
+++ b/gen/abi-x86-64.cpp
@ -0,0 +1,680 @@
+/* TargetABI implementation for x86-64.
+ * Written for LDC by Frits van Bommel in 2009.
+ * 
+ * extern(D) follows no particular external ABI, but tries to be smart about
+ * passing structs and returning them. It should probably be reviewed if the
+ * way LLVM implements fastcc on this platform ever changes.
+ * (Specifically, the number of return registers of various types is hardcoded)
+ * 
+ * 
+ * extern(C) implements the C calling convention for x86-64, as found in
+ * http://www.x86-64.org/documentation/abi-0.99.pdf
+ * 
+ * Note:
+ *   Where a discrepancy was found between llvm-gcc and the ABI documentation,
+ *   llvm-gcc behavior was used for compatibility (after it was verified that
+ *   regular gcc has the same behavior).
+ * 
+ * LLVM gets it right for most types, but complex numbers and structs need some
+ * help. To make sure it gets those right we essentially bitcast small structs
+ * to a type to which LLVM assigns the appropriate registers, and pass that
+ * instead. Structs that are required to be passed in memory are explicitly
+ * marked with the ByVal attribute to ensure no part of them ends up in
+ * registers when only a subset of the desired registers are available.
+ * 
+ * We don't perform the same transformation for D-specific types that contain
+ * multiple parts, such as dynamic arrays and delegates. They're passed as if
+ * the parts were passed as separate parameters. This helps make things like
+ * printf("%.*s", o.toString()) work as expected; if we didn't do this that
+ * wouldn't work if there were 4 other integer/pointer arguments before the
+ * toString() call because the string got bumped to memory with one integer
+ * register still free. Keeping it untransformed puts the length in a register
+ * and the pointer in memory, as printf expects it.
+ */
+
+#include "dmd/mtype.h"
+#include "dmd/declaration.h"
+#include "dmd/aggregate.h"
+
+#include "gen/llvm.h"
+#include "gen/tollvm.h"
+#include "gen/logger.h"
+#include "gen/dvalue.h"
+#include "gen/llvmhelpers.h"
+#include "gen/abi.h"
+#include "gen/abi-x86-64.h"
+#include "ir/irfunction.h"
+
+#include <cassert>
+#include <map>
+#include <string>
+#include <utility>
+
+// Implementation details for extern(C)
+namespace {
+    /**
+     * Identifies D types that C would regard as structs but which are
+     * nevertheless handed to C as separate arguments, untransformed.
+     *
+     * (A dynamic array, for example, goes out as a separate length and ptr.
+     * That is less work and keeps printf("%.*s", o.toString()) working.)
+     */
+    inline bool keepUnchanged(Type* t) {
+        return t->ty == Tarray      // dynamic array
+            || t->ty == Taarray     // assoc array
+            || t->ty == Tdelegate;
+    }
+    
+    /// Argument classes assigned to each 8-byte half of a type while
+    /// classifying it for extern(C).
+    enum ArgClass {
+        Integer,     // counted against the integer registers by passByVal
+        Sse,         // counted against the SSE registers by passByVal
+        SseUp,
+        X87,         // first half of a real/ireal
+        X87Up,       // second half of a real/ireal
+        ComplexX87,  // creal
+        NoClass,     // nothing has been classified into this half yet
+        Memory       // the whole value has to be passed in memory
+    };
+    
+    /// Accumulates the argument classes of the two 8-byte halves of a type.
+    struct Classification {
+        bool isMemory;          // true once either half has merged to Memory
+        ArgClass classes[2];    // [0]: offsets below 8, [1]: offsets 8 and up
+
+        Classification() : isMemory(false) {
+            classes[0] = classes[1] = NoClass;
+        }
+
+        /// Merges class 'cl' into the half selected by 'offset'.
+        /// Once a half becomes Memory both halves are forced to Memory and
+        /// all later calls are ignored.
+        void addField(unsigned offset, ArgClass cl) {
+            if (isMemory)
+                return;
+
+            // There is no need to check whether the field crosses the 8-byte
+            // boundary: unaligned fields never get here, and everything big
+            // enough to cross it (cdoubles, reals, structs and arrays) is
+            // special-cased in classifyType().
+            const int half = (offset >= 8 ? 1 : 0);
+
+            const ArgClass merged = merge(classes[half], cl);
+            if (merged == classes[half])
+                return;
+
+            classes[half] = merged;
+            if (merged == Memory) {
+                classes[1 - half] = Memory;
+                isMemory = true;
+            }
+        }
+
+    private:
+        /// True for the three classes that live in x87 registers.
+        static bool isX87Kind(ArgClass c) {
+            return c == X87 || c == X87Up || c == ComplexX87;
+        }
+
+        /// Combines the class accumulated so far with a newly seen one.
+        ArgClass merge(ArgClass accum, ArgClass cl) {
+            if (accum == cl || cl == NoClass)
+                return accum;
+            if (accum == NoClass)
+                return cl;
+            if (accum == Memory || cl == Memory)
+                return Memory;
+            if (accum == Integer || cl == Integer)
+                return Integer;
+            if (isX87Kind(accum) || isX87Kind(cl))
+                return Memory;
+            return Sse;
+        }
+    };
+    
+    /**
+     * Classifies 'ty', located 'offset' bytes into the outermost type being
+     * classified, and merges the result into 'accum'.
+     *
+     * Static arrays and structs are handled by recursing into their elements
+     * and fields. The order of the checks matters: creal has to be recognised
+     * before the "> 16 bytes" test, and that test has to come before the
+     * array and struct cases.
+     */
+    void classifyType(Classification& accum, Type* ty, d_uns64 offset) {
+        if (Logger::enabled())
+            Logger::cout() << "Classifying " << ty->toChars() << " @ " << offset << '\n';
+        
+        ty = ty->toBasetype();
+        
+        if (ty->isintegral() || ty->ty == Tpointer) {
+            accum.addField(offset, Integer);
+        } else if (ty->ty == Tfloat80 || ty->ty == Timaginary80) {
+            accum.addField(offset, X87);
+            accum.addField(offset+8, X87Up);
+        } else if (ty->ty == Tcomplex80) {
+            accum.addField(offset, ComplexX87);
+            // make sure other half knows about it too:
+            accum.addField(offset+16, ComplexX87);
+        } else if (ty->ty == Tcomplex64) {
+            accum.addField(offset, Sse);
+            accum.addField(offset+8, Sse);
+        } else if (ty->ty == Tcomplex32) {
+            accum.addField(offset, Sse);
+            accum.addField(offset+4, Sse);
+        } else if (ty->isfloating()) {
+            accum.addField(offset, Sse);
+        } else if (ty->size() > 16 || hasUnalignedFields(ty)) {
+            // This isn't creal, yet is > 16 bytes, so pass in memory.
+            // Must be after creal case but before arrays and structs,
+            // the other types that can get bigger than 16 bytes
+            accum.addField(offset, Memory);
+        } else if (ty->ty == Tsarray) {
+            d_uns64 eltsize = ty->next->size();
+            if (eltsize > 0) {
+                d_uns64 dim = ty->size() / eltsize;
+                assert(dim <= 16
+                        && "Array of non-empty type <= 16 bytes but > 16 elements?");
+                // Classify every element at its own offset.
+                for (d_uns64 i = 0; i < dim; i++) {
+                    classifyType(accum, ty->next, offset);
+                    offset += eltsize;
+                }
+            }
+        } else if (ty->ty == Tstruct) {
+            Array* fields = &((TypeStruct*) ty)->sym->fields;
+            for (size_t i = 0; i < fields->dim; i++) {
+                VarDeclaration* field = (VarDeclaration*) fields->data[i];
+                classifyType(accum, field->type, offset + field->offset);
+            }
+        } else {
+            if (Logger::enabled())
+                Logger::cout() << "x86-64 ABI: Implicitly handled type: "
+                               << ty->toChars() << '\n';
+            // arrays, delegates, etc. (pointer-sized fields, <= 16 bytes)
+            // The condition is parenthesized so the message applies to the
+            // whole check instead of only to its second operand.
+            assert((offset == 0 || offset == 8)
+                    && "must be aligned and doesn't fit otherwise");
+            assert(ty->size() % 8 == 0 && "Not a multiple of pointer size?");
+            
+            accum.addField(offset, Integer);
+            if (ty->size() > 8)
+                accum.addField(offset+8, Integer);
+        }
+    }
+    
+    /// Returns the classification of 'ty', computing it with classifyType()
+    /// on the first request and serving it from a function-local cache
+    /// keyed on the Type pointer afterwards.
+    Classification classify(Type* ty) {
+        typedef std::map<Type*, Classification> ClassMap;
+        static ClassMap cache;
+
+        ClassMap::iterator hit = cache.find(ty);
+        if (hit != cache.end())
+            return hit->second;
+
+        Classification fresh;
+        classifyType(fresh, ty, 0);
+        cache.insert(std::make_pair(ty, fresh));
+        return fresh;
+    }
+    
+    /// Returns the LLVM type a value of D type 'ty' should be passed as for
+    /// extern(C), or null if no transformation is needed.
+    ///
+    /// Only cfloat and struct types are ever transformed. The result is built
+    /// from the classification of the two 8-byte halves of the type: a bare
+    /// element type when only the first half is used, x86_fp80 for a type
+    /// classified as X87/X87Up, and a two-element struct otherwise.
+    /// The type must not be classified as Memory (asserted).
+    LLType* getAbiType(Type* ty) {
+        ty = ty->toBasetype();
+        
+        // First, check if there's any need of a transformation:
+        
+        if (keepUnchanged(ty))
+            return 0;
+        
+        if (ty->ty != Tcomplex32 && ty->ty != Tstruct)
+            return 0; // Nothing to do,
+        
+        Classification cl = classify(ty);
+        assert(!cl.isMemory);
+        
+        if (cl.classes[0] == NoClass) {
+            assert(cl.classes[1] == NoClass && "Non-empty struct with empty first half?");
+            return 0; // Empty structs should also be handled correctly by LLVM
+        }
+        
+        // Okay, we may need to transform. Figure out a canonical type:
+        
+        std::vector<const LLType*> parts;
+        
+        unsigned size = ty->size();
+        
+        // First half (bytes 0-7):
+        switch (cl.classes[0]) {
+            case Integer: {
+                // An integer just wide enough for the bytes present in this
+                // half, capped at 64 bits.
+                unsigned bits = (size >= 8 ? 64 : (size * 8));
+                parts.push_back(LLIntegerType::get(bits));
+                break;
+            }
+            
+            case Sse:
+                // float if at most 4 bytes are present, double otherwise.
+                parts.push_back(size <= 4 ? LLType::FloatTy : LLType::DoubleTy);
+                break;
+            
+            case X87:
+                assert(cl.classes[1] == X87Up && "Upper half of real not X87Up?");
+                /// The type only contains a single real/ireal field,
+                /// so just use that type.
+                return const_cast<LLType*>(LLType::X86_FP80Ty);
+            
+            default:
+                assert(0 && "Unanticipated argument class");
+        }
+        
+        // Second half (bytes 8-15):
+        switch(cl.classes[1]) {
+            case NoClass:
+                assert(parts.size() == 1);
+                // No need to use a single-element struct type.
+                // Just use the element type instead.
+                return const_cast<LLType*>(parts[0]);
+                break;
+            
+            case Integer: {
+                assert(size > 8);
+                // An integer covering exactly the bytes beyond the first 8.
+                unsigned bits = (size - 8) * 8;
+                parts.push_back(LLIntegerType::get(bits));
+                break;
+            }
+            case Sse:
+                // float if at most 4 bytes lie beyond the first 8,
+                // double otherwise.
+                parts.push_back(size <= 12 ? LLType::FloatTy : LLType::DoubleTy);
+                break;
+            
+            case X87Up:
+                if(cl.classes[0] == X87) {
+                    // This won't happen: it was short-circuited while
+                    // processing the first half.
+                } else {                    
+                    // I can't find this anywhere in the ABI documentation,
+                    // but this is what gcc does (both regular and llvm-gcc).
+                    // (This triggers for types like union { real r; byte b; })
+                    parts.push_back(LLType::DoubleTy);
+                }
+                break;
+            
+            default:
+                assert(0 && "Unanticipated argument class for second half");
+        }
+        // Both halves are in use: pass as a two-element struct.
+        return LLStructType::get(parts);
+    }
+}
+
+
+// Implementation details for extern(D)
+namespace x86_64_D_cc {
+    /// Number of registers of each kind, used by the extern(D) heuristics
+    /// both as "registers needed" and as "registers still available".
+    struct DRegCount {
+        unsigned ints;  // integer registers
+        unsigned sse;   // SSE registers
+        unsigned x87;   // x87 registers
+
+        DRegCount(unsigned nInts, unsigned nSse, unsigned nX87)
+            : ints(nInts)
+            , sse(nSse)
+            , x87(nX87)
+        {}
+    };
+    
+    // Reports how many registers of each kind a simple type occupies.
+    // "Simple" means neither a struct nor a static array; passing one of
+    // those asserts, and with assertions disabled yields huge counts so the
+    // value always gets bumped to memory.
+    DRegCount regsNeededForSimpleType(Type* t) {
+        switch (t->ty) {
+            case Tstruct:
+            case Tsarray: {
+                assert(0 && "Not a simple type!");
+                const unsigned huge = (unsigned)-1;
+                return DRegCount(huge, huge, huge);
+            }
+
+            // Floats, doubles and such are passed in SSE registers
+            case Tfloat32:
+            case Tfloat64:
+            case Timaginary32:
+            case Timaginary64:
+                return DRegCount(0, 1, 0);
+
+            case Tcomplex32:
+            case Tcomplex64:
+                return DRegCount(0, 2, 0);
+
+            // Reals, ireals and creals are passed in x87 registers
+            case Tfloat80:
+            case Timaginary80:
+                return DRegCount(0, 0, 1);
+
+            case Tcomplex80:
+                return DRegCount(0, 0, 2);
+
+            // Anything else is passed in one or two integer registers,
+            // depending on its size.
+            default: {
+                int needed = (t->size() + 7) / 8;
+                assert(needed <= 2);
+                return DRegCount(needed, 0, 0);
+            }
+        }
+    }
+    
+    // Decides whether the struct can (and should) be passed in the registers
+    // counted by 'left'. A false result does not have to mean "does not fit";
+    // it may also mean registers are a bad idea for this type.
+    // On a true result 'left' has been reduced by the registers the struct
+    // uses, which the recursive case relies on. On a false result the
+    // contents of 'left' are garbage.
+    bool shouldPassStructInRegs(TypeStruct* t, DRegCount& left) {
+        // If it has unaligned fields, there's probably a reason for it,
+        // so keep it in memory.
+        if (hasUnalignedFields(t))
+            return false;
+
+        Array* members = &t->sym->fields;
+        d_uns64 endOfPrevious = 0;
+        for (d_uns64 idx = 0; idx < members->dim; idx++) {
+            VarDeclaration* member = (VarDeclaration*) members->data[idx];
+
+            // Relies on fields being listed in ascending offset order when
+            // they do not overlap. An overlap means a union: don't return
+            // unions (or structs containing them) in registers.
+            if (member->offset < endOfPrevious)
+                return false;
+            endOfPrevious = member->offset + member->type->size();
+
+            Type* memberType = member->type;
+
+            // Don't return static arrays in registers
+            // (indexing registers doesn't work well)
+            if (memberType->ty == Tsarray)
+                return false;
+
+            if (memberType->ty == Tstruct) {
+                // Nested struct: it draws from the same register budget.
+                if (!shouldPassStructInRegs((TypeStruct*) memberType, left))
+                    return false;
+                continue;
+            }
+
+            DRegCount needed = regsNeededForSimpleType(memberType);
+            const bool fits = needed.ints <= left.ints
+                           && needed.sse <= left.sse
+                           && needed.x87 <= left.x87;
+            if (!fits)
+                return false;
+
+            left.ints -= needed.ints;
+            left.sse -= needed.sse;
+            left.x87 -= needed.x87;
+        }
+        return true;
+    }
+    
+    // Tells whether the struct fits in the return registers of the x86-64
+    // fastcc calling convention.
+    bool retStructInRegs(TypeStruct* st) {
+        // 'fastcc' allows returns in up to two registers of each kind:
+        DRegCount available(2, 2, 2);
+        const bool fits = shouldPassStructInRegs(st, available);
+        return fits;
+    }
+    
+    // Heuristic that chooses between passing a struct type directly and
+    // bumping it to memory.
+    bool passStructTypeDirectly(TypeStruct* st) {
+        // Using the real register counts doesn't work well: they can differ
+        // depending on backend options, so there's no way to be exact anyway.
+        /*
+        // Regular fastcc:      6 int, 8 sse, 0 x87
+        // fastcc + tailcall:   5 int, 8 sse, 0 x87
+        RegCount state(5, 8, 0);
+        */
+
+        // Instead: if the type fits in a reasonable number of registers,
+        // pass it directly.
+        // This does not necessarily mean it will actually be passed in
+        // registers. For example, x87 registers are never actually used for
+        // parameters.
+        DRegCount budget(2, 2, 2);
+        const bool direct = shouldPassStructInRegs(st, budget);
+        return direct;
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
+
+
+/// Rewrite between a D value and its x86-64 extern(C) representation.
+/// Just store to memory and it's readable as the other type.
+struct X86_64_C_struct_rewrite : ABIRewrite {
+    // Get struct from ABI-mangled representation
+    LLValue* get(Type* dty, DValue* v)
+    {
+        LLValue* addr = memoryFor(v);
+
+        const LLType* dPtrTy = getPtrToType(DtoType(dty));
+        LLValue* viewed = DtoBitCast(addr, dPtrTy);
+        return DtoLoad(viewed, "get-result");
+    }
+
+    // Get struct from ABI-mangled representation, and store in the provided location.
+    void getL(Type* dty, DValue* v, llvm::Value* lval) {
+        LLValue* abiVal = v->getRVal();
+        const LLType* abiPtrTy = getPtrToType(abiVal->getType());
+        LLValue* dest = DtoBitCast(lval, abiPtrTy);
+        DtoStore(abiVal, dest);
+    }
+
+    // Turn a struct into an ABI-mangled representation
+    LLValue* put(Type* dty, DValue* v)
+    {
+        LLValue* addr = memoryFor(v);
+
+        LLType* abiTy = getAbiType(dty);
+        assert(abiTy && "Why are we rewriting a non-rewritten type?");
+
+        const LLType* abiPtrTy = getPtrToType(abiTy);
+        LLValue* viewed = DtoBitCast(addr, abiPtrTy);
+        return DtoLoad(viewed, "put-result");
+    }
+
+    /// should return the transformed type for this rewrite
+    const LLType* type(Type* dty, const LLType* t)
+    {
+        const LLType* abiTy = getAbiType(dty);
+        return abiTy;
+    }
+
+private:
+    /// Returns an address holding the value: its own lvalue if it has one,
+    /// otherwise a fresh alloca the rvalue has been stored into.
+    static LLValue* memoryFor(DValue* v) {
+        if (v->isLVal())
+            return v->getLVal();
+
+        // No memory location, create one.
+        LLValue* rval = v->getRVal();
+        LLValue* slot = DtoAlloca(rval->getType());
+        DtoStore(rval, slot);
+        return slot;
+    }
+};
+
+
+/// Integer and SSE register counts used for extern(C) parameter bookkeeping.
+struct RegCount {
+    unsigned char int_regs;
+    unsigned char sse_regs;
+};
+
+
+/// TargetABI implementation for x86-64. Keeps a stack with one entry per
+/// function type currently under construction, pushed by newFunctionType()
+/// and popped by doneWithFunctionType().
+struct X86_64TargetABI : TargetABI {
+    X86_64_C_struct_rewrite struct_rewrite;
+
+    /// Starts bookkeeping for a new function type.
+    void newFunctionType(TypeFunction* tf) {
+        FuncTypeData fresh(tf->linkage);
+        funcTypeStack.push_back(fresh);
+    }
+
+    bool returnInArg(TypeFunction* tf);
+
+    bool passByVal(Type* t);
+
+    void rewriteFunctionType(TypeFunction* tf);
+
+    /// Drops the bookkeeping of the most recently started function type.
+    void doneWithFunctionType() {
+        funcTypeStack.pop_back();
+    }
+
+private:
+    struct FuncTypeData {
+        LINK linkage;       // Linkage of the function type currently under construction
+        RegCount state;     // bookkeeping for extern(C) parameter registers
+
+        // Every function type starts out with 6 integer and 8 SSE
+        // parameter registers available.
+        FuncTypeData(LINK linkage_)
+            : linkage(linkage_)
+        {
+            state.int_regs = 6;
+            state.sse_regs = 8;
+        }
+    };
+    std::vector<FuncTypeData> funcTypeStack;
+
+    /// Linkage of the innermost function type under construction.
+    LINK linkage() {
+        assert(!funcTypeStack.empty());
+        return funcTypeStack.back().linkage;
+    }
+
+    /// Remaining-register bookkeeping of the innermost function type.
+    RegCount& state() {
+        assert(!funcTypeStack.empty());
+        return funcTypeStack.back().state;
+    }
+
+    void fixup(IrFuncTyArg& arg);
+};
+
+
+// Public factory used by abi.cpp: hands out a newly allocated x86-64
+// TargetABI.
+TargetABI* getX86_64TargetABI() {
+    TargetABI* abi = new X86_64TargetABI;
+    return abi;
+}
+
+
+// Tells whether the return value of 'tf' has to be returned through a hidden
+// argument rather than in registers.
+bool X86_64TargetABI::returnInArg(TypeFunction* tf) {
+    assert(linkage() == tf->linkage);
+    Type* rt = tf->next->toBasetype();
+
+    if (tf->linkage != LINKd) {
+        // void and the types passed to C untransformed never need it;
+        // everything else follows its classification.
+        if (rt == Type::tvoid || keepUnchanged(rt))
+            return false;
+
+        return classify(rt).isMemory;
+    }
+
+    // extern(D) from here on.
+    assert(rt->ty != Tsarray && "Update calling convention for static array returns");
+
+    // Only structs may need the hidden argument: try to figure out whether
+    // the struct fits in return registers and whether it's a good idea to
+    // put it there.
+    if (rt->ty == Tstruct)
+        return !x86_64_D_cc::retStructInRegs((TypeStruct*) rt);
+
+    // All non-structs can be returned in registers.
+    return false;
+}
+
+// Tells whether a parameter of type 't' must be passed in memory (ByVal).
+//
+// For extern(D) only structs are candidates, decided by a heuristic.
+// For other linkages this follows the C calling convention and, as a side
+// effect, updates the count of parameter registers still available for the
+// function type currently under construction.
+bool X86_64TargetABI::passByVal(Type* t) {
+    if (linkage() == LINKd) {
+        if (t->ty != Tstruct)
+            return false;
+        
+        // Try to be smart about which structs are passed in memory.
+        return !x86_64_D_cc::passStructTypeDirectly((TypeStruct*) t);
+    } else {
+        // This implements the C calling convention for x86-64.
+        // It might not be correct for other calling conventions.
+        Classification cl = classify(t);
+        if (cl.isMemory)
+            return true;
+        
+        // Figure out how many registers we want for this arg:
+        RegCount wanted = { 0, 0 };
+        for (int i = 0 ; i < 2; i++) {
+            if (cl.classes[i] == Integer)
+                wanted.int_regs++;
+            else if (cl.classes[i] == Sse)
+                wanted.sse_regs++;
+        }
+        
+        // See if they're available:
+        RegCount& state = this->state();
+        if (wanted.int_regs <= state.int_regs && wanted.sse_regs <= state.sse_regs) {
+            state.int_regs -= wanted.int_regs;
+            state.sse_regs -= wanted.sse_regs;
+        } else {
+            if (keepUnchanged(t)) {
+                // Not enough registers available, but this is passed as if it's
+                // multiple arguments. Just use the registers there are,
+                // automatically spilling the rest to memory.
+                if (wanted.int_regs > state.int_regs)
+                    state.int_regs = 0;
+                else
+                    state.int_regs -= wanted.int_regs;
+                
+                if (wanted.sse_regs > state.sse_regs)
+                    state.sse_regs = 0;
+                else
+                    state.sse_regs -= wanted.sse_regs;
+            } else if (t->iscomplex() || t->ty == Tstruct) {
+                // Spill entirely to memory, even if some of the registers are
+                // available.
+                
+                // FIXME: Don't do this if *none* of the wanted registers are available,
+                //        (i.e. only when absolutely necessary for abi-compliance)
+                //        so it gets alloca'd by the callee and -scalarrepl can
+                //        more easily break it up?
+                // Note: this won't be necessary if the following LLVM bug gets fixed:
+                //       http://llvm.org/bugs/show_bug.cgi?id=3741
+                return true;
+            } else {
+                // A plain scalar that no longer fits in registers; LLVM puts
+                // it on the stack by itself. Such a scalar may be a full
+                // 8 bytes wide (long, pointer, double), so the size check
+                // has to be inclusive. The condition is parenthesized so the
+                // message applies to the whole check.
+                assert((t == Type::tfloat80 || t == Type::timaginary80 || t->size() <= 8)
+                    && "What other big types are there?"); // other than static arrays...
+                // In any case, they shouldn't be represented as structs in LLVM:
+                assert(!isaStruct(DtoType(t)));
+            }
+        }
+        // Everything else that's passed in memory is handled by LLVM.
+        return false;
+    }
+}
+
+// Helper function for rewriteFunctionType.
+// Return type and parameters are passed here (unless they're already in memory)
+// to get the rewrite applied (if necessary): when getAbiType() yields a type
+// that differs from the current LLVM type, the argument is switched to that
+// type and struct_rewrite is installed to convert values.
+void X86_64TargetABI::fixup(IrFuncTyArg& arg) {
+    LLType* abiTy = getAbiType(arg.type);
+    
+    if (abiTy && abiTy != arg.ltype) {
+        // getAbiType() looks through typedefs with toBasetype(), so the
+        // sanity check has to inspect the base type as well.
+        assert(arg.type->toBasetype()->ty == Tcomplex32
+            || arg.type->toBasetype()->ty == Tstruct);
+        arg.ltype = abiTy;
+        arg.rewrite = &struct_rewrite;
+    }
+}
+
+// Applies the extern(C) type rewrites to the return type and to every
+// parameter of 'tf' that is not passed by reference. Function types with
+// extern(D) linkage are left untouched.
+void X86_64TargetABI::rewriteFunctionType(TypeFunction* tf) {
+    // extern(D) is handled entirely by passByVal and returnInArg
+    
+    if (tf->linkage != LINKd) {
+        // TODO: See if this is correct for more than just extern(C).
+        
+        IrFuncTy* fty = tf->fty;
+        
+        // A return value that goes through a hidden sret argument is already
+        // in memory, so only rewrite it when there is no such argument.
+        if (!fty->arg_sret) {
+            Logger::println("x86-64 ABI: Transforming return type");
+            Type* rt = fty->ret->type->toBasetype();
+            if (rt != Type::tvoid)
+                fixup(*fty->ret);
+        }
+        
+        
+        Logger::println("x86-64 ABI: Transforming arguments");
+        LOG_SCOPE;
+        
+        for (IrFuncTy::ArgIter I = fty->args.begin(), E = fty->args.end(); I != E; ++I) {
+            IrFuncTyArg& arg = **I;
+            
+            if (Logger::enabled())
+                Logger::cout() << "Arg: " << arg.type->toChars() << '\n';
+            
+            // Arguments that are in memory are of no interest to us.
+            if (arg.byref)
+                continue;
+            
+            fixup(arg);
+            
+            if (Logger::enabled())
+                Logger::cout() << "New arg type: " << *arg.ltype << '\n';
+        }
+    }
+}
--- a/gen/abi-x86-64.h
+++ b/gen/abi-x86-64.h
@ -0,0 +1,10 @@
+#ifndef __LDC_GEN_ABI_X86_64_H__
+#define __LDC_GEN_ABI_X86_64_H__
+
+#include "gen/abi.h"
+
+
+/// Returns a newly allocated TargetABI implementing the x86-64 calling
+/// conventions. The concrete class is private to gen/abi-x86-64.cpp.
+/// NOTE(review): nothing visible here deletes the returned object;
+/// confirm who owns it.
+TargetABI* getX86_64TargetABI();
+
+
+#endif
--- a/gen/abi.cpp
+++ b/gen/abi.cpp
@ -261,101 +261,7 @@ struct X86TargetABI : TargetABI
 //////////////////////////////////////////////////////////////////////////////
 //////////////////////////////////////////////////////////////////////////////

-struct X86_64_cfloat_rewrite : ABIRewrite
-{
-    // {double} -> {float,float}
-    LLValue* get(Type*, DValue* dv)
-    {
-        LLValue* in = dv->getRVal();
-
-        // extract double
-        LLValue* v = gIR->ir->CreateExtractValue(in, 0);
-        // cast to i64
-        v = gIR->ir->CreateBitCast(v, LLType::Int64Ty);
-
-        // extract real part
-        LLValue* rpart = gIR->ir->CreateTrunc(v, LLType::Int32Ty);
-        rpart = gIR->ir->CreateBitCast(rpart, LLType::FloatTy, ".re");
-
-        // extract imag part
-        LLValue* ipart = gIR->ir->CreateLShr(v, LLConstantInt::get(LLType::Int64Ty, 32, false));
-        ipart = gIR->ir->CreateTrunc(ipart, LLType::Int32Ty);
-        ipart = gIR->ir->CreateBitCast(ipart, LLType::FloatTy, ".im");
-
-        // return {float,float} aggr pair with same bits
-        return DtoAggrPair(rpart, ipart, ".final_cfloat");
-    }
-
-    // {float,float} -> {double}
-    LLValue* put(Type*, DValue* dv)
-    {
-        LLValue* v = dv->getRVal();
-
-        // extract real
-        LLValue* r = gIR->ir->CreateExtractValue(v, 0);
-        // cast to i32
-        r = gIR->ir->CreateBitCast(r, LLType::Int32Ty);
-        // zext to i64
-        r = gIR->ir->CreateZExt(r, LLType::Int64Ty);
-
-        // extract imag
-        LLValue* i = gIR->ir->CreateExtractValue(v, 1);
-        // cast to i32
-        i = gIR->ir->CreateBitCast(i, LLType::Int32Ty);
-        // zext to i64
-        i = gIR->ir->CreateZExt(i, LLType::Int64Ty);
-        // shift up
-        i = gIR->ir->CreateShl(i, LLConstantInt::get(LLType::Int64Ty, 32, false));
-
-        // combine
-        v = gIR->ir->CreateOr(r, i);
-
-        // cast to double
-        v = gIR->ir->CreateBitCast(v, LLType::DoubleTy);
-
-        // return {double}
-        const LLType* t = LLStructType::get(LLType::DoubleTy, NULL);
-        LLValue* undef = llvm::UndefValue::get(t);
-        return gIR->ir->CreateInsertValue(undef, v, 0);
-    }
-
-    // {float,float} -> {double}
-    const LLType* type(Type*, const LLType* t)
-    {
-        return LLStructType::get(LLType::DoubleTy, NULL);
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////////
-
-struct X86_64TargetABI : TargetABI
-{
-    X86_64_cfloat_rewrite cfloat_rewrite;
-
-    bool returnInArg(TypeFunction* tf)
-    {
-        Type* rt = tf->next->toBasetype();
-        return (rt->ty == Tstruct);
-    }
-
-    bool passByVal(Type* t)
-    {
-        return t->toBasetype()->ty == Tstruct;
-    }
-
-    void rewriteFunctionType(TypeFunction* tf)
-    {
-        IrFuncTy* fty = tf->fty;
-        Type* rt = fty->ret->type->toBasetype();
-
-        // rewrite cfloat return for !extern(D)
-        if (tf->linkage != LINKd && rt == Type::tcomplex32)
-        {
-            fty->ret->rewrite = &cfloat_rewrite;
-            fty->ret->ltype = cfloat_rewrite.type(fty->ret->type, fty->ret->ltype);
-        }
-    }
-};
+#include "gen/abi-x86-64.h"

 //////////////////////////////////////////////////////////////////////////////
 //////////////////////////////////////////////////////////////////////////////
@ -395,7 +301,7 @@ TargetABI * TargetABI::getTarget()
    case ARCHx86:
        return new X86TargetABI;
    case ARCHx86_64:
-        return new X86_64TargetABI;
+        return getX86_64TargetABI();
    default:
        Logger::cout() << "WARNING: Unknown ABI, guessing...\n";
        return new UnknownTargetABI;
--- a/gen/abi.h
+++ b/gen/abi.h
@ -35,8 +35,10 @@ struct TargetABI
 {
    static TargetABI* getTarget();

+    virtual void newFunctionType(TypeFunction* tf) {}
    virtual bool returnInArg(TypeFunction* tf) = 0;
    virtual bool passByVal(Type* t) = 0;
+    virtual void doneWithFunctionType() {}

    virtual void rewriteFunctionType(TypeFunction* t) = 0;
 };
--- a/gen/classes.cpp
+++ b/gen/classes.cpp
@ -957,6 +957,7 @@ DValue* DtoNewClass(Loc loc, TypeClass* tc, NewExp* newexp)
    // call constructor
    if (newexp->member)
    {
+        Logger::println("Calling constructor");
        assert(newexp->arguments != NULL);
        DtoForceDeclareDsymbol(newexp->member);
        DFuncValue dfn(newexp->member, newexp->member->ir.irFunc->func, mem);
--- a/gen/functions.cpp
+++ b/gen/functions.cpp
@ -24,6 +24,9 @@

 const llvm::FunctionType* DtoFunctionType(Type* type, Type* thistype, Type* nesttype, bool ismain)
 {
+    if (Logger::enabled())
+        Logger::println("DtoFunctionType(%s)", type->toChars());
+    LOG_SCOPE
    // sanity check
    assert(type->ty == Tfunction);
    TypeFunction* f = (TypeFunction*)type;
@ -34,6 +37,9 @@ const llvm::FunctionType* DtoFunctionType(Type* type, Type* thistype, Type* nest
        return llvm::cast<llvm::FunctionType>(type->ir.type->get());
    }

+    // Tell the ABI we're resolving a new function type
+    gABI->newFunctionType(f);
+
    // create new ir funcTy
    assert(f->fty == NULL);
    f->fty = new IrFuncTy();
@ -158,6 +164,9 @@ const llvm::FunctionType* DtoFunctionType(Type* type, Type* thistype, Type* nest
    // let the abi rewrite the types as necesary
    gABI->rewriteFunctionType(f);

+    // Tell the ABI we're done with this function type
+    gABI->doneWithFunctionType();
+
    // build the function type
    std::vector<const LLType*> argtypes;
    argtypes.reserve(lidx);
@ -184,6 +193,8 @@ const llvm::FunctionType* DtoFunctionType(Type* type, Type* thistype, Type* nest
    llvm::FunctionType* functype = llvm::FunctionType::get(f->fty->ret->ltype, argtypes, f->fty->c_vararg);
    f->ir.type = new llvm::PATypeHolder(functype);

+    Logger::cout() << "Final function type: " << *functype << "\n";
+
    return functype;
 }

@ -571,6 +582,7 @@ void DtoDefineFunction(FuncDeclaration* fd)

    assert(fd->ir.declared);

+    if (Logger::enabled())
        Logger::println("DtoDefineFunc(%s): %s", fd->toPrettyChars(), fd->loc.toChars());
    LOG_SCOPE;

--- a/gen/naked.cpp
+++ b/gen/naked.cpp
@ -168,13 +168,6 @@ void DtoDefineNakedFunction(FuncDeclaration* fd)

 //////////////////////////////////////////////////////////////////////////////////////////

-static LLValue* x86_64_cfloatRetFixup(IRBuilderHelper b, LLValue* orig) {
-    assert(orig->getType() == LLType::DoubleTy);
-    LLType* retty = LLStructType::get(LLType::DoubleTy, NULL);
-    LLValue* undef = llvm::UndefValue::get(retty);
-    return b->CreateInsertValue(undef, orig, 0, "asm.ret");
-}
-
 void emitABIReturnAsmStmt(IRAsmBlock* asmblock, Loc loc, FuncDeclaration* fdecl)
 {
    Logger::println("emitABIReturnAsmStmt(%s)", fdecl->mangle());
@ -186,6 +179,9 @@ void emitABIReturnAsmStmt(IRAsmBlock* asmblock, Loc loc, FuncDeclaration* fdecl)
    asmblock->retty = llretTy;
    asmblock->retn = 1;

+    // FIXME: This should probably be handled by the TargetABI somehow.
+    //        It should be able to do this for a greater variety of types.
+
    // x86
    if (global.params.cpu == ARCHx86)
    {
@ -293,7 +289,6 @@ void emitABIReturnAsmStmt(IRAsmBlock* asmblock, Loc loc, FuncDeclaration* fdecl)
                // extern(C) cfloat -> %xmm0 (extract two floats)
                as->out_c = "={xmm0},";
                asmblock->retty = LLType::DoubleTy;
-                asmblock->retfixup = &x86_64_cfloatRetFixup;
            } else if (rt->iscomplex()) {
                // cdouble and extern(D) cfloat -> re=%xmm0, im=%xmm1
                as->out_c = "={xmm0},={xmm1},";
--- a/gen/statements.cpp
+++ b/gen/statements.cpp
@ -97,6 +97,15 @@ void ReturnStatement::toIR(IRState* p)
            if (Logger::enabled())
                Logger::cout() << "return value is '" <<*v << "'\n";

+            IrFunction* f = p->func();
+            // Hack around LDC assuming structs are in memory:
+            // If the function returns a struct, and the return value is a
+            // pointer to a struct, load from it before returning.
+            if (f->type->next->ty == Tstruct && isaPointer(v->getType())) {
+                Logger::println("Loading struct type for return");
+                v = DtoLoad(v);
+            }
+
            // can happen for classes and void main
            if (v->getType() != p->topfunc()->getReturnType())
            {
--- a/gen/tocall.cpp
+++ b/gen/tocall.cpp
@ -217,6 +217,11 @@ void DtoBuildDVarArgList(std::vector<LLValue*>& args, std::vector<llvm::Attribut

 DValue* DtoCallFunction(Loc& loc, Type* resulttype, DValue* fnval, Expressions* arguments)
 {
+    if (Logger::enabled()) {
+        Logger::println("DtoCallFunction()");
+    }
+    LOG_SCOPE
+
    // the callee D type
    Type* calleeType = fnval->getType();

@ -386,6 +391,15 @@ DValue* DtoCallFunction(Loc& loc, Type* resulttype, DValue* fnval, Expressions*

            int j = tf->fty->reverseParams ? beg + n - i - 1 : beg + i;

+            // Hack around LDC assuming structs are in memory:
+            // If the function wants a struct, and the argument value is a
+            // pointer to a struct, load from it before passing it in.
+            if (argval->getType()->ty == Tstruct
+                    && isaPointer(arg) && !isaPointer(callableTy->getParamType(j))) {
+                Logger::println("Loading struct type for function argument");
+                arg = DtoLoad(arg);
+            }
+
            // parameter type mismatch, this is hard to get rid of
            if (arg->getType() != callableTy->getParamType(j))
            {
@ -468,24 +482,24 @@ DValue* DtoCallFunction(Loc& loc, Type* resulttype, DValue* fnval, Expressions*
    // get return value
    LLValue* retllval = (retinptr) ? args[0] : call.getInstruction();

-    if (tf->linkage == LINKintrinsic)
-    {
    // Ignore ABI for intrinsics
-        Type* rettype = tf->next;
-        if (rettype->ty == Tstruct) {
-            // LDC assumes structs are in memory, so put it there.
-            LLValue* mem = DtoAlloca(retllval->getType());
-            DtoStore(retllval, mem);
-            retllval = mem;
-        }
-    }
-    else if (!retinptr)
+    if (tf->linkage != LINKintrinsic && !retinptr)
    {
        // do abi specific return value fixups
        DImValue dretval(tf->next, retllval);
        retllval = tf->fty->getRet(tf->next, &dretval);
    }

+    // Hack around LDC assuming structs are in memory:
+    // If the function returns a struct, and the return value is not a
+    // pointer to a struct, store it to a stack slot before continuing.
+    if (tf->next->ty == Tstruct && !isaPointer(retllval)) {
+        Logger::println("Storing return value to stack slot");
+        LLValue* mem = DtoAlloca(retllval->getType());
+        DtoStore(retllval, mem);
+        retllval = mem;
+    }
+
    // repaint the type if necessary
    if (resulttype)
    {
--- a/ir/irfunction.h
+++ b/ir/irfunction.h
@ -64,7 +64,9 @@ struct IrFuncTy : IrBase
    IrFuncTyArg* arg_argptr;

    // normal explicit arguments
-    LLSmallVector<IrFuncTyArg*, 4> args;
+    typedef LLSmallVector<IrFuncTyArg*, 4> ArgList;
+    typedef ArgList::iterator ArgIter;
+    ArgList args;

    // C varargs
    bool c_vararg;