• Forum
  • Lounge
  • Made the mistake of trying to help out a

 
Made the mistake of trying to help out at SO again.

I guess I’ll never learn.

https://stackoverflow.com/questions/78396197/is-there-a-cleaner-way-to-write-this-c-code

Closed as “opinion-based”.



When the hoi-polloi learned to frob computer forums is when all semblance of reasoned thinking went out the window.
Meh, here’s what I played with, because it looked fun.

First, I figured I’d properly tokenize those atomic symbols into something useful. GNU provides a super-handy utility called gperf which generates a perfect hash lookup. We can massage its output pretty heavily to get:

elements.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
#ifndef ATOMIC_ELEMENTS_H
#define ATOMIC_ELEMENTS_H


#if ('A' != 65)
	#error "elements.c requires ASCII alphabetic character codes."
#endif


// One row of the periodic table.
// Only the atomic number and symbol are stored at present; the commented-out
// members below sketch further columns that could be added.
struct Element
{
	const unsigned char number;     // Atomic Number: 1, 2, ...
	const char          symbol[3];  // Atomic Symbol: "H", "He", ... (up to two letters plus the terminating NUL)
//	const char *        name;       // Element Name:  "Hydrogen", "Helium", ...
//	const float         weight;     // Atomic Weight: 1.008, 4.002602, ...
//	...
};


extern const struct Element Elements[ 1+118 ];
// The Periodic Table, indexed by elements' atomic number.
// The array has 1+118 entries so that atomic numbers 1..118 can be used directly
// as indices; entry 0 does not correspond to any element.


unsigned atomic_symbol_to_number( const char * );
// Convert an element's atomic symbol ("H", "He", ...) to the corresponding atomic number.
// The argument must be a NUL-terminated string (it is not checked for NULL).
// The match is exact and case-sensitive.
// Returns the atomic number else 0.

const struct Element * atomic_symbol_to_element( const char * );
// Convert an element's atomic symbol ("H", "He", ...) to Element data.
// The argument must be a NUL-terminated string (it is not checked for NULL).
// The match is exact and case-sensitive.
// Returns a pointer into the Elements[] table else NULL.


#endif 

elements.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
#include <iso646.h>
#include <string.h>
#include "elements.h"


const struct Element Elements[ 1+118 ] =
{
	// This is our Periodic Table of Elements
	// Indexed by element's atomic number
	//
	// Entry 0 is a placeholder with an empty symbol. It lets atomic numbers be used
	// directly as indices, and it is what unused hash slots (value 0) resolve to, so
	// a non-empty symbol never compares equal to it.
	{0,  ""  },
	{1,  "H" },{2,  "He"},{3,  "Li"},{4,  "Be"},{5,  "B" },{6,  "C" },{7,  "N" },{8,  "O" },
	{9,  "F" },{10, "Ne"},{11, "Na"},{12, "Mg"},{13, "Al"},{14, "Si"},{15, "P" },{16, "S" },
	{17, "Cl"},{18, "Ar"},{19, "K" },{20, "Ca"},{21, "Sc"},{22, "Ti"},{23, "V" },{24, "Cr"},
	{25, "Mn"},{26, "Fe"},{27, "Co"},{28, "Ni"},{29, "Cu"},{30, "Zn"},{31, "Ga"},{32, "Ge"},
	{33, "As"},{34, "Se"},{35, "Br"},{36, "Kr"},{37, "Rb"},{38, "Sr"},{39, "Y" },{40, "Zr"},
	{41, "Nb"},{42, "Mo"},{43, "Tc"},{44, "Ru"},{45, "Rh"},{46, "Pd"},{47, "Ag"},{48, "Cd"},
	{49, "In"},{50, "Sn"},{51, "Sb"},{52, "Te"},{53, "I" },{54, "Xe"},{55, "Cs"},{56, "Ba"},
	{57, "La"},{58, "Ce"},{59, "Pr"},{60, "Nd"},{61, "Pm"},{62, "Sm"},{63, "Eu"},{64, "Gd"},
	{65, "Tb"},{66, "Dy"},{67, "Ho"},{68, "Er"},{69, "Tm"},{70, "Yb"},{71, "Lu"},{72, "Hf"},
	{73, "Ta"},{74, "W" },{75, "Re"},{76, "Os"},{77, "Ir"},{78, "Pt"},{79, "Au"},{80, "Hg"},
	{81, "Tl"},{82, "Pb"},{83, "Bi"},{84, "Po"},{85, "At"},{86, "Rn"},{87, "Fr"},{88, "Ra"},
	{89, "Ac"},{90, "Th"},{91, "Pa"},{92, "U" },{93, "Np"},{94, "Pu"},{95, "Am"},{96, "Cm"},
	{97, "Bk"},{98, "Cf"},{99, "Es"},{100,"Fm"},{101,"Md"},{102,"No"},{103,"Lr"},{104,"Rf"},
	{105,"Db"},{106,"Sg"},{107,"Bh"},{108,"Hs"},{109,"Mt"},{110,"Ds"},{111,"Rg"},{112,"Cn"},
	{113,"Nh"},{114,"Fl"},{115,"Mc"},{116,"Lv"},{117,"Ts"},{118,"Og"},
};


static
const unsigned char hash_to_element[ 216 ] =
{
	// This is our hash table!
	// (key, value) --> (hash, index into Elements[])
	//
	// The key is the value returned by atomic_symbol_to_hash(). A value of 0 marks a
	// slot with no element; it indexes the placeholder Elements[0]. Because unrelated
	// strings can hash to an occupied slot, the lookup functions confirm every hit
	// with strcmp() against the stored symbol.
	// Any trailing slots not listed here are implicitly zero.
	0,15,59,1,94,0,19,36,23,2,0,103,91,71,54,0,6,24,32,29,0,57,58,31,0,0,80,20,92,112,0,5,35,
	0,84,0,67,4,72,97,0,7,56,0,25,0,115,10,0,27,0,98,11,0,12,0,16,38,0,93,0,68,34,63,42,0,78,
	61,0,50,0,21,52,0,102,0,116,73,0,106,0,43,96,0,0,0,40,82,53,77,0,9,87,0,17,0,109,26,30,44,
	0,49,75,0,108,0,8,88,0,86,0,39,18,0,79,0,3,55,0,111,0,74,62,0,46,0,89,41,0,118,0,104,69,0,
	47,64,0,83,0,48,0,0,51,0,81,0,0,28,0,107,0,0,65,0,101,0,0,100,0,113,0,99,14,0,60,0,0,117,
	0,114,0,0,22,0,0,0,85,95,0,0,0,0,37,0,90,0,0,105,0,13,0,0,66,0,0,0,0,70,0,0,0,0,110,0,0,0,
	0,76,0,0,0,0,33,0,45
};


static
unsigned atomic_symbol_to_hash( const char * symbol )
{
	// Perfect-hash function for atomic symbols.
	//
	// Every character contributes (1 + weight) to a running sum. The weight comes from
	// the table below, indexed by the low six bits of the character code when the code
	// lies in 64..127 (where the ASCII letters live). Any other character uses slot 0.
	//
	// Returns the sum when it is below 215 (a candidate index into the hash table),
	// otherwise 0. The empty string also yields 0. A non-zero result is only a
	// candidate: the caller must still compare the symbol text.
	static unsigned char weights[] =
	{
		215, 110,  30,  15, 100,  59,  90,  11,   2,  87, 215,   5,   9,  30,  40, 105,
		  0, 215,  95,  55,  65,  27,   7, 120,   7, 110,  84, 215, 215, 215, 215, 215,
		215,  10,  85,  14, 122,   5,  34,  22, 117, 105, 215,   7,  77,  65,  12,  32,
		 17, 215,   0, 100,  64,   2,  65, 215, 215,  90, 215, 215, 215, 215, 215, 215,
	};

	unsigned sum = 0;
	for (const unsigned char * p = (const unsigned char *) symbol;  *p != '\0';  p++)
	{
		// (code >> 6) == 1 selects exactly the codes 64..127
		unsigned slot = ((*p >> 6) == 1) ? (*p & 0x3Fu) : 0u;
		sum += 1 + weights[ slot ];
	}

	if (sum >= 215) return 0;
	return sum;
}


unsigned atomic_symbol_to_number( const char * symbol )
{
	// Look the symbol up through the perfect hash, then confirm the hit by comparing
	// the text: strings that are not atomic symbols can still land on an occupied slot.
	// Returns the atomic number, or 0 when `symbol` is not an atomic symbol.
	const unsigned hash = atomic_symbol_to_hash( symbol );
	if (hash == 0) return 0;

	const unsigned candidate = hash_to_element[ hash ];
	if (strcmp( symbol, Elements[ candidate ].symbol ) != 0) return 0;

	return candidate;
}


const struct Element * atomic_symbol_to_element( const char * symbol )
{
	unsigned h = atomic_symbol_to_hash( symbol );
	return h and strcmp( symbol, Elements[ hash_to_element[ h ] ].symbol ) == 0
		? (Elements + hash_to_element[ h ])
		: NULL;
}


Continued...
Next we can focus on our input parser to produce a list of (atom number, count) pairs. We can make a little class like this:

atom-parser.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
#ifndef ATOM_PARSER_H
#define ATOM_PARSER_H

#include <stdbool.h>


// The language we will parse is:
//
//   L      ::= WS? ATOMS (WS '+' WS ATOMS)* WS?  // example: "O2 + H2O"
//   ATOMS  ::= ATOM+                             // examples: "H2O2"
//   ATOM   ::= (SYMBOL | GROUP) COUNT? ION?      // examples: "O2", "(OH)4"
//   SYMBOL ::= UPPER LOWER?                      // atomic symbol such as "K" or "Fe"
//   GROUP  ::= '(' ATOMS ')' | '[' ATOMS ']'     // stuff in parentheses
//   COUNT  ::= ('1'..'9') ('0'..'9')*            // unsigned integer number (just not "0")
//   ION    ::= '+' | '-'                         // ion charge
//   WS     ::= SP | FF | LF | CR | HT | VT       // whitespace
//
// Thus we can read things like:
//
//   (CH2O)6         glucose (sugar)
//   Al(OH)3         aluminum hydroxide
//   [Co(NH3)6]3+    complex cation
//   [CoCl4(NH3)2]-  complex anion
//   CH3OCH3 + H2O   dimethyl ether plus water
//   [Fe(CN)6]4-     ferrocyanide anion (the coordination complex in potassium ferrocyanide)
//
// All our parsing does is collect a list of (atom, count) pairs, possibly with duplicates.
// For example, "CH3OCH3 + H2O" produces:
//
//   C1 H3 O1 C1 H3 H2 O1
//
// (Which can be later combined to get something like: H8 C2 O2.)
//
// Notice that we CANNOT handle abbreviations like "(en)" (ethylenediamine, or NH2CH2CH2NH2).
// It would be very easy to add that to the language and parser, though!


//-------------------------------------------------------------------------------------------------
struct atom
//-------------------------------------------------------------------------------------------------
// A handy linked list of the (element, count) pairs we are collecting:
// Each node is one pair; `next` links to the following node, or is NULL at the end.
{
	unsigned      number;  // Atomic element number ("H" --> 1, "He" --> 2, ...)
	unsigned      count;   // Number of atoms of this element
	struct atom * next;    // Next node in the list, or NULL
};
typedef struct atom atom;

// Allocate a new node holding (number, count) and linking to `next`.
// Returns the new node, or NULL if the allocation fails.
atom * make_atom( unsigned number, unsigned count, atom * next );
// Free every node of the list (a NULL list is fine). Always returns NULL, so the
// caller can write `list = free_atoms( list );`.
atom * free_atoms( atom * );


//-------------------------------------------------------------------------------------------------
struct atom_parser
//-------------------------------------------------------------------------------------------------
// This little object collects atoms from the input string.
// The parser does not copy the string: `s` points into the caller's buffer and is
// advanced as input is consumed.
{
	const char * s;      // source string
	atom *       atoms;  // list of atoms parsed from string
	const char * error;  // error message else NULL
};
typedef struct atom_parser atom_parser;

// Create a parser positioned at the start of `s`, with no atoms and no error.
atom_parser make_atom_parser( const char * s );
bool parse_atoms( atom_parser * );
// Returns true if `s` was successfully parsed.
// Otherwise check the `parser.error` message and the position of `parser.s`.
// Don't forget to `free_atoms( parser.atoms )`.
// (Atoms collected before a failure remain in `parser.atoms`, so free them in
// either case.)


#endif 

Last edited on
atom-parser.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
#include <ctype.h>
#include <iso646.h>
#include <stdlib.h>

#include "elements.h"
#include "atom-parser.h"


// Notice how very like a C++ class this design is:
//   Public functions are prototyped in the header file.
//   Private functions are marked local to this file with 'static'.
//
// Each parse_X() function implements a piece of the BNF language described
// in the header file.
//
// The parse functions return whether something was successfully parsed, or
// false on error.


atom * make_atom( unsigned number, unsigned count, atom * next )
{
	// Allocate one list node and fill in its fields.
	// Returns the new node, or NULL when malloc() fails.
	atom * node = malloc( sizeof *node );
	if (node == NULL) return NULL;

	node->number = number;
	node->count  = count;
	node->next   = next;
	return node;
}


atom * free_atoms( atom * atoms )
{
	// Release every node of the list. The successor is read before the node is
	// freed, because the node's memory is gone afterwards.
	// Always returns NULL.
	for (atom * following;  atoms != NULL;  atoms = following)
	{
		following = atoms->next;
		free( atoms );
	}
	return NULL;
}


atom_parser make_atom_parser( const char * s )
{
	atom_parser parser = { .s=s, .atoms=NULL, .error=NULL };
	return parser;
}


static bool error( atom_parser * parser, const char * message )
{
	// Record `message` as the parser's error and return false, so a parse function
	// can report a failure and fail in one statement: `return error( parser, "..." );`
	// Only the pointer is stored (the message is not copied). Any previously
	// recorded message is replaced.
	parser->error = message;
	return false;
}


static bool add_atom( atom_parser * parser, unsigned atomic_number, unsigned count )
{
	// Push a new (element, count) pair onto the FRONT of the parser's list. Order is
	// irrelevant for tallying, and prepending needs no tail pointer. (If order ever
	// matters, track the tail or reverse the list once it is complete.)
	// Returns true on success; on allocation failure records an error and returns false.
	atom * head = make_atom( atomic_number, count, parser->atoms );
	if (head)
	{
		parser->atoms = head;
		return true;
	}
	return error( parser, "memory allocation failure" );
}


static bool skip_whitespace( atom_parser * parser )
{
	// Advance past any run of whitespace at the current position.
	// Returns true when at least one character was skipped.
	// (isspace() is false for the terminating NUL, so the scan stops at end of input.)
	const char * start = parser->s;
	while (isspace( (unsigned char) *parser->s ))
		parser->s++;
	return parser->s != start;
}


static bool peek_char( atom_parser * parser, char c )
{
	// Is the next unread character `c`? Nothing is consumed.
	return c == parser->s[0];
}


static bool parse_char( atom_parser * parser, char c )
{
	// Consume the next character if, and only if, it is `c`.
	// Returns whether a character was consumed.
	if (*parser->s != c) return false;
	parser->s++;
	return true;
}


static unsigned parse_COUNT_ION( atom_parser * parser )
{
	// Parse the optional COUNT and the optional ION that may follow a symbol or group.
	//
	//   COUNT ::= ('1'..'9') ('0'..'9')*
	//   ION   ::= '+' | '-'
	//
	// Returns the count (1 when no COUNT is present), which is never 0 on success.
	// On error, returns 0, sets parser->error, and leaves parser->s at the first
	// digit of the offending count. Errors are:
	//   - a count whose value is zero
	//   - a count too large to represent in an `unsigned`
	// Leading zeros on an otherwise non-zero count (for example "02") are accepted.
	// The ion charge is consumed but not recorded.
	const unsigned max_count = (unsigned) -1;
	const char * count_s = parser->s;
	bool has_digits = false;
	unsigned count = 0;

	while (isdigit( (unsigned char) *parser->s ))
	{
		unsigned digit = (unsigned) (*parser->s - '0');

		// Would (count * 10 + digit) exceed max_count? Check before computing it:
		// unsigned arithmetic wraps silently and would yield a wrong count.
		if (count > (max_count - digit) / 10)
		{
			parser->s = count_s;
			(void) error( parser, "atom count is too large" );
			return 0;
		}

		count = count * 10 + digit;
		parser->s++;
		has_digits = true;
	}

	if (has_digits and count == 0)
	{
		parser->s = count_s;
		(void) error( parser, "zero is not a valid atom count" );
		return 0;
	}

	// ION ::= '+' | '-'
	if (!parse_char( parser, '+' )) parse_char( parser, '-' );

	return has_digits ? count : 1;
}


static bool parse_SYMBOL_COUNT_ION( atom_parser * parser )
{
	// Parse one atomic symbol with its optional count and ion charge, and add the
	// resulting (element, count) pair to the parser's list.
	//
	//   SYMBOL ::= UPPER LOWER?
	//
	// Returns true when an atom was parsed and added.
	// Returns false WITHOUT an error when the input does not start with an uppercase
	// letter (nothing is consumed).
	// Returns false WITH parser->error set when the symbol is not a known element
	// (parser->s is restored to the symbol), when the count is invalid, or when
	// memory allocation fails.
	const char * symbol_s = parser->s;

	if (!isupper( (unsigned char) *parser->s )) return false;

	// An atomic symbol is an uppercase letter optionally followed by a lowercase letter
	char symbol_name[ 3 ] = "";
	symbol_name[ 0 ] = *(parser->s)++;
	if (islower( (unsigned char) *parser->s ))
		symbol_name[ 1 ] = *(parser->s)++;

	// Convert / Validate
	unsigned number = atomic_symbol_to_number( symbol_name );
	if (!number)
	{
		parser->s = symbol_s;
		return error( parser, "invalid atomic symbol" );
	}

	// parse_COUNT_ION() returns 0 only after recording an error. Stop here rather
	// than adding a zero-count atom and reporting success.
	unsigned count = parse_COUNT_ION( parser );
	if (!count) return false;

	return add_atom( parser, number, count );
}


static bool parse_ATOMS( atom_parser * );
// ↑ forward declaration for mutual recursion with parse_GROUP_COUNT_ION()


static bool parse_GROUP_COUNT_ION( atom_parser * parser )
{
	// Parse a bracketed group with its optional count and ion charge. The count
	// multiplies every atom collected inside the group.
	//
	//   GROUP ::= '(' ATOMS ')' | '[' ATOMS ']'
	//
	// Returns true when a complete group was parsed.
	// Returns false WITHOUT an error when the input does not start with '(' or '['.
	// Returns false WITH parser->error set on any failure inside the group, a missing
	// or mismatched closing bracket, an invalid count, or a count that overflows.

	if (!parse_char( parser, '(' ) and !parse_char( parser, '[' ))
		return false;

	// Match the open parenthesis
	char close_paren = (parser->s[-1] == '(') ? ')' : ']';

	// New atoms are pushed onto the front of the list, so everything in front of
	// the current head after parsing belongs to this group.
	atom * end = parser->atoms;

	// Recurse to get the parenthesized atoms. An error already recorded must stop
	// us here, or the bracket check below would replace it with "expected ')'".
	if (!parse_ATOMS( parser ) or parser->error) return false;

	// Parentheses must terminate with matching parenthesis
	if (!parse_char( parser, close_paren ))
		return error( parser, close_paren == ')' ? "expected ')'" : "expected ']'" );

	// parse_COUNT_ION() returns 0 only after recording an error
	unsigned count = parse_COUNT_ION( parser );
	if (!count) return false;

	// Update the counts of all the grouped atoms
	for (atom * iter = parser->atoms;  iter != end;  iter = iter->next)
	{
		// Refuse to wrap around: a wrapped product would be a silently wrong count
		if (iter->count > (unsigned) -1 / count)
			return error( parser, "atom count is too large" );
		iter->count *= count;
	}

	return true;
}


static bool parse_ATOM( atom_parser * parser )
{
	// ATOM ::= (SYMBOL | GROUP) COUNT? ION?
	//
	// Returns true when one atom or group was parsed.
	// Returns false either because the input does not start an ATOM (no error
	// recorded) or because parsing failed (parser->error set).
	if (parse_GROUP_COUNT_ION( parser )) return true;

	// A group that failed with an error is a real failure, not "this is not a
	// group". Do not fall through to the SYMBOL alternative, which could replace
	// the recorded message.
	if (parser->error) return false;

	return parse_SYMBOL_COUNT_ION( parser );
}


static bool parse_ATOMS( atom_parser * parser )
{
	// ATOMS ::= ATOM+
	//
	// Returns true when one or more ATOMs were parsed and no error was recorded.
	// Returns false with parser->error set otherwise.

	// One ATOM required
	if (!parse_ATOM( parser ))
	{
		if (!parser->error) error( parser, "expected ATOM" );
		return false;
	}

	// Additional ATOMs are optional
	while (parse_ATOM( parser ))
		;

	// The loop ends both when the next input is simply not an ATOM and when an ATOM
	// failed to parse. Only the former is success. Reporting success with an error
	// pending lets callers keep parsing and overwrite the message or move the
	// error position.
	return !parser->error;
}


bool parse_atoms( atom_parser * parser )
{
	// This public-facing function parses the top-level L part of our language specification:
	//   L ::= WS? ATOMS (WS '+' WS ATOMS)* WS?
	//
	// Returns true when the whole string was parsed. The collected atoms are then
	// in parser->atoms.
	// Returns false with parser->error set otherwise. parser->s is left at the
	// position the error refers to. Atoms collected before the failure remain in
	// parser->atoms and must still be freed by the caller.

	// Optional leading whitespace
	skip_whitespace( parser );

	// Nothing parsed or error?
	// Stop at the first error so that parser->s still marks where it happened.
	if (!parse_ATOMS( parser ) or parser->error) return false;

	// While (additional terms)
	while (skip_whitespace( parser ) and parse_char( parser, '+' ))
	{
		// An additional term must exist (unless there was an error)
		if (!skip_whitespace( parser ))
			return error( parser, "expected whitespace after '+'" );

		if (!parse_ATOMS( parser ))
			return !parser->error and error( parser, "unexpected end of input" );

		// Errors propagate
		if (parser->error) return false;
	}

	// Expect end of input
	switch (*parser->s)
	{
		case  0 : break;
		case ')': return error( parser, "unexpected ')'" );
		case ']': return error( parser, "unexpected ']'" );
		default:  return error( parser, "expected end of input or \" + \"" );
	}

	return true;
}

Last edited on

Then we can pull it all together in:

main.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
#include <iso646.h>
#include <stdio.h>
#include <string.h>

#include "elements.h"
#include "atom-parser.h"


int print_error( const char * s, unsigned n, const char * message )
{
	// Print a two-line diagnostic to stderr: the offending input, then a caret
	// under character index `n` of the input followed by the message.
	//
	//   s        the input string that failed to parse
	//   n        zero-based index into `s` where the error was detected
	//   message  human-readable description of the error
	//
	// Returns 1, so that main() can `return print_error( ... );` as its exit status.
	fprintf( stderr, "ERROR: %s\n", s );

	// The caret line is indented by the 7 characters of "ERROR: " plus `n`.
	// A '*' field width must be passed as an `int`, hence the cast.
	fprintf( stderr, "%*s^ %s\n", (int) (n + 7), " ", message );
	return 1;
}


int main( int argc, char ** argv )
{
	// Read a chemical formula (from the command line, else from stdin), parse it,
	// and print the total number of atoms of each element in atomic-number order.
	// Returns 0 on success, 1 when the formula is too long or fails to parse.

	// A chemical formula to parse
	char s[ 100 ] = "";
	{
		// Provided by user at the command-line
		if (argc > 1)
		{
			// Join the arguments, each preceded by a space (the parser skips the
			// leading one). Check every piece against the remaining room first:
			// appending blindly would overflow `s` on long command lines.
			size_t used = 0;
			for (int n = 1;  n < argc;  n++)
			{
				size_t length = strlen( argv[n] );
				if (length + 1 > sizeof(s) - 1 - used)
				{
					fprintf( stderr, "ERROR: formula is too long (limit is %u characters)\n",
						(unsigned) (sizeof(s) - 1) );
					return 1;
				}
				s[ used++ ] = ' ';
				memcpy( s + used, argv[n], length + 1 );  // +1 copies the terminating NUL
				used += length;
			}
		}
		// Else ask user for it
		else
		{
			printf( "formula? " );
			fflush( stdout );
			// On end-of-file or read error, parse an empty formula rather than
			// whatever may be left in the buffer.
			if (!fgets( s, sizeof(s), stdin )) s[0] = '\0';
			char * p = strchr( s, '\n' );
			if (p) *p = '\0';
		}
	}

	// Convert the input string into a list of (atom,count) pairs
	atom * atoms = NULL;
	{
		atom_parser parser = make_atom_parser( s );
		if (!parse_atoms( &parser ) and parser.error)
		{
			free_atoms( parser.atoms );
			return print_error( s, parser.s - s, parser.error );
		}
		atoms = parser.atoms;
	}

	// Tally the counts for each element
	unsigned counts[ 1+118 ] = { 0 };  // 1..118 elements
	for (atom * iter = atoms;  iter;  iter = iter->next)
	{
		counts[ iter->number ] += iter->count;
	}

	// Done with these
	free_atoms( atoms );

	// Pretty print the results (in atomic order)
	for (int n = 1, k = 0;  n < 1+118;  n++)
		if (counts[n])
		{
			if (k++) printf( " + " );
			printf( "%s%u", Elements[n].symbol, counts[n] );
		}
	puts( "" );

	return 0;
}


Sure, you can write this much smaller. I initially did just that. But my small-n-sweet version was significantly less capable and failed to handle errors properly.

The formulas listed in the header:
formula? (CH2O)6
H12 + C6 + O6

formula? Al(OH)3
H3 + O3 + Al1

formula? [Co(NH3)6]3+
H54 + N18 + Co3

formula? [CoCl4(NH3)2]-
H6 + N2 + Cl4 + Co1

formula? CH3OCH3 + H2O
H8 + C2 + O2

formula? [Fe(CN)6]4-
C24 + N24 + Fe4

formula? Og+O
O1 + Og1

formula? Og-O
O1 + Og1


Errors:
formula? 
ERROR: 
       ^ expected ATOM

formula? 2
ERROR: 2
       ^ expected ATOM

formula? Oz
ERROR: Oz
       ^ invalid atomic symbol

formula? (Og
ERROR: (Og
          ^ expected ')'

formula? Og)
ERROR: Og)
         ^ unexpected ')'

formula? Og+ O
ERROR: Og+ O
           ^ expected end of input or " + "

formula? Og +O
ERROR: Og +O
           ^ expected whitespace after '+'

formula? Og++O
ERROR: Og++O
          ^ expected end of input or " + "

formula? [H2O)
ERROR: [H2O)
           ^ expected ']'

Meh.


I at least had fun playing with it.
Last edited on
Topic archived. No new replies allowed.