On 2/8/2023 4:33 AM, servoloro wrote:
> On 2/8/23 10:31, servoloro wrote:
>> *Newbie question*
>> I have to convert a string from the format (how it's called ?)
>> \uXXXX
>> to (again:how it's called ?)
>> %XX%XX
>> i.e. from \u00dc to %C3%9C.
>> Apart from doing a dumb replaceAll
>> I'm sure there is a smarter way.
>> Not knowing the names of the formats Google didn't help me :-(
>> Could someone give me hints/directions ?
> Sorry it is:
> > s=s.replaceAll("\\\\u00dc", "%C3%9C");
There are a lot of complications here.
- "\u00dc" is 1 char but "\\u00dc" is 6 chars
- you seem to be implicitly assuming UTF-8 encoding
- this type of encoding is generally known as URL encoding
(percent-encoding), but there is some ambiguity in it, such as
whether you want spaces left as is or converted to plus signs
But the code below should illustrate a lot.
Arne
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Demonstrates three ways of percent-encoding the UTF-8 bytes of a string
 * (a hard-coded replacement table, a manual byte loop, and
 * {@link URLEncoder}), plus a decoder that first turns textual
 * backslash-u escapes (backslash, 'u', four hex digits) into the characters
 * they denote.
 *
 * <p>All non-ASCII characters in this source are written as Unicode escapes
 * so that the compiled strings do not depend on the encoding passed to
 * javac.
 */
public class UFun {

    /**
     * Matches a literal backslash followed by 'u' and exactly four hex
     * digits; group 1 captures the hex digits.
     */
    private static final Pattern UNICODE_ESCAPE =
            Pattern.compile("\\\\u([0-9A-Fa-f]{4})");

    /** Upper-case hex digits, indexed by nibble value. */
    private static final String HEX_DIGITS = "0123456789ABCDEF";

    /**
     * Encodes by plain substitution. Only the seven characters listed here
     * (U+00DC and the Danish letters U+00C6, U+00D8, U+00C5, U+00E6, U+00F8,
     * U+00E5) are replaced; every other character is returned unchanged.
     *
     * @param s text to encode
     * @return {@code s} with the listed characters percent-encoded
     */
    private static String encode_hack(String s) {
        return s.replace("\u00dc", "%C3%9C")
                .replace("\u00c6", "%C3%86")
                .replace("\u00d8", "%C3%98")
                .replace("\u00c5", "%C3%85")
                .replace("\u00e6", "%C3%A6")
                .replace("\u00f8", "%C3%B8")
                .replace("\u00e5", "%C3%A5");
    }

    /**
     * Encodes by walking the UTF-8 bytes of {@code s}. Bytes in the range
     * 32..126 (printable ASCII, including space and '%') are copied as is;
     * every other byte becomes '%' followed by exactly two upper-case hex
     * digits.
     *
     * @param s text to encode
     * @return the encoded text
     * @throws UnsupportedEncodingException if the "UTF-8" charset name is
     *         not supported
     */
    private static String encode_manual(String s) throws
            UnsupportedEncodingException {
        StringBuilder sb = new StringBuilder();
        for (byte b : s.getBytes("UTF-8")) {
            if (32 <= b && b < 127) {
                sb.append((char) b);
            } else {
                // Mask to 0..255 first: Java bytes are signed, and every
                // value (including 0x0A-0x0F) must yield two hex digits.
                int unsigned = b & 0xFF;
                sb.append('%');
                sb.append(HEX_DIGITS.charAt(unsigned >>> 4));
                sb.append(HEX_DIGITS.charAt(unsigned & 0x0F));
            }
        }
        return sb.toString();
    }

    /**
     * Encodes with {@link URLEncoder} and then undoes two of its choices so
     * the result lines up with the other encoders on the sample input:
     * '+' is turned back into a space and "%3A" back into ':'.
     *
     * @param s text to encode
     * @return the encoded text
     * @throws UnsupportedEncodingException if the "UTF-8" charset name is
     *         not supported
     */
    private static String encode_builtin(String s) throws
            UnsupportedEncodingException {
        // A literal '+' in the input is emitted by URLEncoder as %2B, so any
        // '+' left in its output can only stand for a space.
        return URLEncoder.encode(s, "UTF-8")
                .replace("+", " ")
                .replace("%3A", ":");
    }

    /**
     * Prints the result of each encoder applied directly to {@code s}.
     *
     * @param s text to encode
     * @throws UnsupportedEncodingException propagated from the encoders
     */
    private static void test1(String s) throws
            UnsupportedEncodingException {
        String s2a = encode_hack(s);
        System.out.printf("%s -> %s\n", s, s2a);
        String s2b = encode_manual(s);
        System.out.printf("%s -> %s\n", s, s2b);
        String s2c = encode_builtin(s);
        System.out.printf("%s -> %s\n", s, s2c);
    }

    /**
     * Replaces every textual escape matched by {@link #UNICODE_ESCAPE} with
     * the single UTF-16 code unit whose value is given by its four hex
     * digits. Text that does not match is copied unchanged.
     *
     * @param s text possibly containing escapes
     * @return the text with the escapes resolved
     */
    private static String decode(String s) {
        Matcher m = UNICODE_ESCAPE.matcher(s);
        // StringBuffer rather than StringBuilder: the StringBuilder overload
        // of appendReplacement only exists from Java 9 on.
        StringBuffer res = new StringBuffer();
        while (m.find()) {
            char decoded = (char) Integer.parseInt(m.group(1), 16);
            // The decoded character may itself be '$' or a backslash, which
            // appendReplacement would treat as a group reference or escape;
            // quoting makes it a literal replacement.
            m.appendReplacement(res,
                    Matcher.quoteReplacement(String.valueOf(decoded)));
        }
        m.appendTail(res);
        return res.toString();
    }

    /** Resolves escapes via {@link #decode} and then applies {@link #encode_hack}. */
    private static String decode_encode_hack(String s) {
        return encode_hack(decode(s));
    }

    /** Resolves escapes via {@link #decode} and then applies {@link #encode_manual}. */
    private static String decode_encode_manual(String s) throws
            UnsupportedEncodingException {
        return encode_manual(decode(s));
    }

    /** Resolves escapes via {@link #decode} and then applies {@link #encode_builtin}. */
    private static String decode_encode_builtin(String s) throws
            UnsupportedEncodingException {
        return encode_builtin(decode(s));
    }

    /**
     * Prints the result of each decode-then-encode variant applied to
     * {@code s}.
     *
     * @param s text containing textual escapes
     * @throws UnsupportedEncodingException propagated from the encoders
     */
    private static void test2(String s) throws
            UnsupportedEncodingException {
        String s2a = decode_encode_hack(s);
        System.out.printf("%s -> %s\n", s, s2a);
        String s2b = decode_encode_manual(s);
        System.out.printf("%s -> %s\n", s, s2b);
        String s2c = decode_encode_builtin(s);
        System.out.printf("%s -> %s\n", s, s2c);
    }

    /**
     * Runs both demonstrations on the same sample sentence.
     *
     * @param args unused
     * @throws UnsupportedEncodingException propagated from the encoders
     */
    public static void main(String[] args) throws
            UnsupportedEncodingException {
        // First sample: every escape is resolved by the compiler, so the
        // string holds U+00DC twice followed by the six Danish letters.
        test1("This is \u00dc and \u00dc and Danish: \u00c6\u00d8\u00c5\u00e6\u00f8\u00e5");
        // Second sample: the doubled backslash keeps the first escape as six
        // literal characters at runtime, which decode() then resolves.
        test2("This is \\u00dc and \u00dc and Danish: \u00c6\u00d8\u00c5\u00e6\u00f8\u00e5");
    }
}