Here's the category on NSFileManager I have for this, using libicucore:
// NSFileManager+OTAdditions.m
// Created by Hamish Allan
// Copyright 2012 Olive Toast.
// http://creativecommons.org/licenses/by/3.0/
// Attribution requirement limited to comments in source code.
#import "ucsdet.h"
// Error-handling helper for the ICU calls in this file.
//
// If `errorCode` is an ICU failure code, logs the enclosing function, the line
// number and the ICU error name, closes the charset detector if one is open,
// and returns nil from the enclosing method.
//
// Requirements at the call site:
//   - a local variable named `charsetDetector` (UCharsetDetector *) is in scope;
//   - the enclosing function or method returns an object pointer;
//   - the invocation is followed by a semicolon (the body is wrapped in
//     do { } while (0) so it behaves as a single statement, including when it
//     is used as the unbraced body of an if/else).
//
// Each line of the definition ends in a backslash: without the continuations
// only the first line would belong to the macro.
#define UOnFailReturnNil(errorCode) \
    do { \
        if (U_FAILURE(errorCode)) { \
            NSLog(@"%s (%d): %s", __PRETTY_FUNCTION__, __LINE__, \
                  u_errorName(errorCode)); \
            if (charsetDetector) \
                ucsdet_close(charsetDetector); \
            return nil; \
        } \
    } while (0)
@implementation NSFileManager (OTAdditions)

/// Guesses the character encoding of the file at `path` using ICU's charset
/// detector (ucsdet).
///
/// @param path Path of the file to inspect.
/// @return The name ICU reports for its best match (an ICU charset name such
///         as "UTF-8"), or nil if `path` is nil, the file cannot be read, or
///         any ICU call fails. Failures are logged with NSLog.
- (NSString *)otCharsetForTextFileAtPath:(NSString *)path
{
    if (!path)
        return nil;

    // Read the file before opening the detector so that a read failure cannot
    // leak the detector. Mapping is requested only where it is safe, replacing
    // the deprecated +dataWithContentsOfMappedFile:.
    NSError *readError = nil;
    NSData *characterData = [NSData dataWithContentsOfFile:path
                                                   options:NSDataReadingMappedIfSafe
                                                     error:&readError];
    if (!characterData) {
        NSLog(@"%s (%d): %@", __PRETTY_FUNCTION__, __LINE__, readError);
        return nil;
    }

    // ucsdet_setText() takes an int32_t length, and a negative length means
    // "NUL-terminated". Clamp explicitly so a very large file cannot be
    // narrowed into a negative or wrapped value.
    NSUInteger dataLength = [characterData length];
    int32_t sampleLength = (dataLength > (NSUInteger)INT32_MAX)
        ? INT32_MAX
        : (int32_t)dataLength;

    UErrorCode errorCode = U_ZERO_ERROR;
    // NOTE: UOnFailReturnNil refers to this variable by name; do not rename it.
    UCharsetDetector *charsetDetector = ucsdet_open(&errorCode);
    UOnFailReturnNil(errorCode);

    ucsdet_setText(charsetDetector, [characterData bytes], sampleLength,
                   &errorCode);
    UOnFailReturnNil(errorCode);

    const UCharsetMatch *bestMatch = ucsdet_detect(charsetDetector, &errorCode);
    UOnFailReturnNil(errorCode);

    const char *encodingName = ucsdet_getName(bestMatch, &errorCode);
    UOnFailReturnNil(errorCode);

    // Copy the name into an NSString before closing the detector: the C string
    // is not ours to keep once the detector is closed.
    NSString *encodingNameString = [NSString stringWithUTF8String:encodingName];
    ucsdet_close(charsetDetector);
    return encodingNameString;
}

@end
Hope this helps,
Hamish
> Can anyone point me to some code that wraps this up neatly, and catches all the edge cases etc?
There actually isn't all that much to catch. I have a bit of sample code in this article under "Fallbacks":
http://mikeash.com/pyblog/friday-qa-2010-02-19-character-encodings.html
Note that once you get to MacOSRoman, you can stop checking any others, because MacOSRoman will successfully (if not necessarily correctly) decode any sequence of bytes you throw at it.
That approach is best if you have data that you really expect to be UTF-8, need some vaguely useful results if it's not, but don't really care about seriously detecting and correctly presenting the variety of weird encodings out there. If you really need a good chance of handling weird encodings, Hamish's approach is probably what you want to go for.
Mike