Text to CAS xml
Quick'n'dirty converter script I hacked up:
#!/usr/bin/perl
#
# Convert a tab-delimited text file into CommonChemistryRecords XML.
#
# Input columns: 1 (ignored), 2 Name, 3 CAS Number, 4 Molecular Formula;
# any further columns are ignored.  The XML is written to STDOUT.  Rows
# whose CAS Number is missing or malformed are skipped with a warning on
# STDERR.  Dies with a usage message unless exactly one argument is given,
# and dies if the input file cannot be opened.
use strict;
use warnings;

if (@ARGV != 1) {
    die <<"EODIE";
Usage:
    $0 filename.txt
    $0 filename.txt > filename.xml

"filename.txt" is a tab-delimited text file, perhaps exported from Excel.
The file is converted to xml format and printed on STDOUT.

NB: the xml DTD is a guess based on commonChemMerge.10012008.xml

The expected column layout is:
    1: (ignored)
    2: Name
    3: CAS Number
    4: Molecular Formula
Any further columns are also ignored.

Any row where the CAS Number field does not match the normal format
for CAS format is omitted.
EODIE
}

# Return a copy of $text with the XML markup characters & < > replaced by
# their predefined entities, so it is safe to emit as element content.
sub xml_escape {
    my ($text) = @_;
    $text =~ s/&/&amp;/g;    # must come first so later entities are not re-escaped
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    return $text;
}

my $datafile = shift;

# Read the whole input before printing anything, so an unreadable file does
# not leave a truncated XML document on STDOUT.
open my $datafile_FH, '<', $datafile
    or die "Could not read $datafile: $!\n";
my $content = do { local $/; <$datafile_FH> };
close $datafile_FH;
$content = '' if !defined $content;

# Accept CRLF, bare CR (the only terminator previously handled) and bare LF.
# The -1 limit keeps empty rows so row numbers in warnings match the file;
# a terminator at end-of-file does not start an extra row.
my @rows = split /\r\n|\r|\n/, $content, -1;
pop @rows if @rows && $rows[-1] eq '';

# NOTE(review): input bytes are passed through unchanged although the
# prolog declares UTF-8 -- confirm the exported file really is UTF-8.
print <<'EOXML';
<?xml version="1.0" encoding="UTF-8"?>
<CommonChemistryRecords>
EOXML

my $row = 0;
for my $line (@rows) {
    $row++;

    my %entry;
    @entry{qw/ x name cas mf /} = split /\t/, $line;

    if (!defined $entry{cas}) {
        warn "Skip row $row: no CAS# defined\n";
        next;
    }
    if ($entry{cas} !~ /\A\d+-\d+-\d+\z/) {
        warn "Skip row $row: '$entry{cas}' not valid CAS# format\n";
        next;
    }

    # The CAS number was validated as digits and hyphens only, so it needs
    # no escaping inside the attribute value.
    print "<CommonChemistryRecord registryNumber=\"$entry{cas}\">\n";
    print '<MolecularFormula>' . xml_escape($entry{mf}) . "</MolecularFormula>\n"
        if defined $entry{mf};
    # Guard on the name itself; it was previously (wrongly) guarded on the
    # formula, which dropped the name of every record lacking a formula.
    print '<NT1Name>' . xml_escape($entry{name}) . "</NT1Name>\n"
        if defined $entry{name};
    print "</CommonChemistryRecord>\n";
}

print <<"EOXML";
</CommonChemistryRecords>
EOXML